In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import random
import time
import string
import torch.nn as nn  
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Set seed for reproducibility
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load IMDb dataset
imdb = load_dataset('imdb')
train_test_split = imdb['train'].train_test_split(test_size=0.2, seed=SEED)
train_data, valid_data = train_test_split['train'], train_test_split['test']
test_data = imdb['test']

MAX_VOCAB_SIZE = 25_000
PAD_IDX, UNK_IDX = 0, 1

def tokenize(text):
    # Simple whitespace tokenizer with punctuation removal
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return text.split()

def generate_bigrams(tokens):
    return tokens + [f"{tokens[i]}_{tokens[i+1]}" for i in range(len(tokens)-1)]

def build_vocab(dataset, max_size):
    counter = Counter()
    for example in dataset:
        tokens = tokenize(example['text'])
        bigrams = generate_bigrams(tokens)
        counter.update(bigrams)
    vocab = ['<pad>', '<unk>'] + [word for word, _ in counter.most_common(max_size-2)]
    return {word: idx for idx, word in enumerate(vocab)}

word2idx = build_vocab(train_data, MAX_VOCAB_SIZE)

class IMDBDataset(Dataset):
    def __init__(self, data, word2idx):
        self.data = data
        self.word2idx = word2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = 1.0 if self.data[idx]['label'] == 1 else 0.0
        
        tokens = tokenize(text)
        bigrams = generate_bigrams(tokens)
        indices = [self.word2idx.get(token, UNK_IDX) for token in bigrams]
        
        return torch.tensor(indices, dtype=torch.long), torch.tensor(label, dtype=torch.float)

def collate_batch(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(text) for text in texts])
    padded_texts = torch.nn.utils.rnn.pad_sequence(texts, padding_value=PAD_IDX)
    return padded_texts, torch.tensor(labels), lengths

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = IMDBDataset(train_data, word2idx)
valid_dataset = IMDBDataset(valid_data, word2idx)
test_dataset = IMDBDataset(test_data, word2idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch)

class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text, lengths):
        embedded = self.embedding(text)  # [seq_len, batch_size, emb_dim]
        pooled = embedded.mean(dim=0)    # [batch_size, emb_dim]
        return self.fc(pooled).squeeze(1)

# Model parameters
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = word2idx['<pad>']

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

# Training setup
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for text, labels, lengths in iterator:
        optimizer.zero_grad()
        text, labels = text.to(device), labels.to(device)
        predictions = model(text, lengths)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for text, labels, lengths in iterator:
            text, labels = text.to(device), labels.to(device)
            predictions = model(text, lengths)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Training loop
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'fasttext-model.pt')
    print(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {int(epoch_secs)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Test evaluation
model.load_state_dict(torch.load('fasttext-model.pt'))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

def predict_sentiment(model, sentence):
    model.eval()
    tokens = tokenize(sentence)
    bigrams = generate_bigrams(tokens)
    indexed = [word2idx.get(token, UNK_IDX) for token in bigrams]
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    length = torch.LongTensor([len(indexed)]).to(device)
    prediction = torch.sigmoid(model(tensor, length))
    return prediction.item()

# Example predictions
print(predict_sentiment(model, "This film is terrible"))  # Negative review
print(predict_sentiment(model, "This film is awesome"))   # Positive review

Epoch: 01 | Time: 0m 10s
	Train Loss: 0.688 | Train Acc: 56.17%
	 Val. Loss: 0.678 |  Val. Acc: 73.52%
Epoch: 02 | Time: 0m 9s
	Train Loss: 0.654 | Train Acc: 74.68%
	 Val. Loss: 0.624 |  Val. Acc: 77.18%
Test Loss: 0.627 | Test Acc: 76.53%
0.004991530440747738
0.5692414045333862


In [None]:
def predict_sentiment(model, sentence, word2idx, device):
    model.eval()
    # Preprocess the input text
    tokens = tokenize(sentence)
    bigrams = generate_bigrams(tokens)
    indexed = [word2idx.get(token, UNK_IDX) for token in bigrams]
    
    # Convert to tensor and add batch dimension
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    length = torch.LongTensor([len(indexed)]).to(device)
    
    # Make prediction
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length))
    
    # Interpret result
    return "Positive" if prediction.item() > 0.5 else "Negative", prediction.item()

In [7]:
# Example usage
test_sentence = "This was worst movie of the century"
sentiment, confidence = predict_sentiment(model, test_sentence, word2idx, device)
print(f"Sentiment: {sentiment} | Confidence: {confidence:.4f}")

Sentiment: Negative | Confidence: 0.0272


In [19]:
print(predict_sentiment(model, "This film is terrible", word2idx, device))
# Output: ('Negative', 0.127)

print(predict_sentiment(model, " didnt like it  ", word2idx, device))
# Output: ('Positive', 0.923)

('Negative', 0.004991530440747738)
('Negative', 0.003174806712195277)
