In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import re
from typing import List
from datasets import load_dataset
import re

In [13]:
dataset = load_dataset("imdb")

df = pd.DataFrame(dataset['train'])

df.head()


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [14]:
df["label"].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [15]:
df_test = pd.DataFrame(dataset['test'])

df_test.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [17]:
# clean text 

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    return text.lower()

In [18]:
df["text"] = df["text"].apply(clean_text)


In [19]:
# build vocabulary
from collections import Counter
from nltk.tokenize import word_tokenize


all_tokens = [ word_tokenize(text) for text in df["text"]]

tokens_flat = [token for sublist in all_tokens for token in sublist ]

vocab = Counter(tokens_flat)


vocab = {word: i+2 for i, (word, freq) in enumerate(vocab.items()) if freq > 5}  # remove rare words
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

In [20]:
print(all_tokens[10])

['it', 'was', 'great', 'to', 'see', 'some', 'of', 'my', 'favorite', 'stars', 'of', '30', 'years', 'ago', 'including', 'john', 'ritter', 'ben', 'gazarra', 'and', 'audrey', 'hepburn', 'they', 'looked', 'quite', 'wonderful', 'but', 'that', 'was', 'it', 'they', 'were', 'not', 'given', 'any', 'characters', 'or', 'good', 'lines', 'to', 'work', 'with', 'i', 'neither', 'understood', 'or', 'cared', 'what', 'the', 'characters', 'were', 'doingbr', 'br', 'some', 'of', 'the', 'smaller', 'female', 'roles', 'were', 'fine', 'patty', 'henson', 'and', 'colleen', 'camp', 'were', 'quite', 'competent', 'and', 'confident', 'in', 'their', 'small', 'sidekick', 'parts', 'they', 'showed', 'some', 'talent', 'and', 'it', 'is', 'sad', 'they', 'didnt', 'go', 'on', 'to', 'star', 'in', 'more', 'and', 'better', 'films', 'sadly', 'i', 'didnt', 'think', 'dorothy', 'stratten', 'got', 'a', 'chance', 'to', 'act', 'in', 'this', 'her', 'only', 'important', 'film', 'rolebr', 'br', 'the', 'film', 'appears', 'to', 'have', 'some

In [22]:
print(vocab)



In [46]:
# encode tokens

def encode(text, vocab, max_len=100):
    tokens = word_tokenize(text)
    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens[:max_len]]
    # Ensure all ids are valid indices for embedding
    vocab_size = len(vocab)
    ids = [id if id < vocab_size else vocab["<UNK>"] for id in ids]
    if len(ids) < max_len:
        ids += [vocab['<PAD>']] * (max_len - len(ids))
    return ids


In [47]:
df["text"]

0        i rented i am curiousyellow from my video stor...
1        i am curious yellow is a risible and pretentio...
2        if only to avoid making this type of film in t...
3        this film was probably inspired by godards mas...
4        oh brotherafter hearing about this ridiculous ...
                               ...                        
24995    a hit at the time but now better categorised a...
24996    i love this movie like no other another time i...
24997    this film and its sequel barry mckenzie holds ...
24998    the adventures of barry mckenzie started life ...
24999    the story centers around barry mckenzie who mu...
Name: text, Length: 25000, dtype: object

In [48]:
df['input_ids'] = df['text'].apply(lambda x: encode(x, vocab))


In [49]:
df

Unnamed: 0,text,label,input_ids
0,i rented i am curiousyellow from my video stor...,0,"[2, 3, 2, 4, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,i am curious yellow is a risible and pretentio...,0,"[2, 4, 180, 181, 52, 40, 182, 89, 183, 184, 18..."
2,if only to avoid making this type of film in t...,0,"[31, 206, 34, 295, 73, 36, 296, 11, 155, 22, 1..."
3,this film was probably inspired by godards mas...,0,"[36, 155, 19, 337, 338, 28, 339, 1, 1, 89, 2, ..."
4,oh brotherafter hearing about this ridiculous ...,0,"[385, 1, 387, 67, 36, 388, 155, 48, 389, 112, ..."
...,...,...,...
24995,a hit at the time but now better categorised a...,1,"[40, 398, 26, 13, 331, 176, 414, 329, 1, 86, 1..."
24996,i love this movie like no other another time i...,1,"[2, 620, 36, 382, 125, 173, 576, 1193, 331, 2,..."
24997,this film and its sequel barry mckenzie holds ...,1,"[36, 155, 89, 122, 4991, 11441, 1, 8020, 147, ..."
24998,the adventures of barry mckenzie started life ...,1,"[13, 9063, 11, 11441, 1, 945, 68, 86, 40, 3384..."


In [50]:
class IMBDDataset(Dataset):
    def __init__(self , inputs , labels):
        self.inputs =inputs
        self.labels =  labels 
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
         
        return {
            'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [51]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['input_ids'].tolist(), df['label'].tolist(), test_size=0.2)

train_dataset = IMBDDataset(train_texts, train_labels)
test_dataset = IMBDDataset(test_texts, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [57]:
# model 

class SentimentLSTM(nn.Module):
    def __init__(self , vocab_size , embedding_dim  , hidden_dim, output_dim):
        super().__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim , padding_idx=0)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    
    def forward(self , x ):
        x =self.embeddings(x)
        x = x.permute(1 , 0 , 2)
        _ , (hidden ,_) =  self.lstm(x)
        x = self.fc(hidden[-1])
        return x
        

In [58]:
model = SentimentLSTM(len(vocab) , embedding_dim=100 , hidden_dim= 128 , output_dim=2 )

In [59]:
# training 

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters() , lr = 0.001)

def train(model ,loader ):
    model.train()
    for batch in loader :
        inputs = batch["input_ids"]
        labels = batch["label"]
        
        outputs = model(inputs)
        loss = criterion(outputs , labels)
        
        optimizer .zero_grad()
        loss.backward()
        optimizer.step()
        

In [60]:
#evaluate 

@torch.no_grad()
def evaluate(model , loader ): 
    model.eval()
    all_preds , all_labels = [],[]
    
    for batch in loader :
        inputs = batch["input_ids"]
        labels = batch["label"]
        
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())
    acc = accuracy_score(all_labels, all_preds)
    return acc


In [61]:

for epoch in range(5):
    train(model, train_loader)
    acc = evaluate(model, test_loader)
    print(f"Epoch {epoch+1} - Test Accuracy: {acc:.4f}")


Epoch 1 - Test Accuracy: 0.6560
Epoch 2 - Test Accuracy: 0.5834
Epoch 3 - Test Accuracy: 0.7902
Epoch 4 - Test Accuracy: 0.8110
Epoch 5 - Test Accuracy: 0.8104


In [62]:
# text examples 

text = "i love this movie"
tokens = encode(text, vocab)

inputs = torch.tensor(tokens).unsqueeze(0)
outputs = model(inputs)
pred = torch.argmax(outputs, dim=1)
print(pred.item())

1


In [69]:
# text examples 

text = " this movie is disgusting"
tokens = encode(text, vocab)

inputs = torch.tensor(tokens).unsqueeze(0)
outputs = model(inputs)
pred = torch.argmax(outputs, dim=1)
print(pred.item())

0


ok, sound good !!


In [70]:
# let's save the model 

torch.save(model.state_dict() , "model.pt")