# Load data


In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.ma.core import argmax
from scipy._lib.array_api_compat import torch

from sklearn.model_selection import train_test_split

import re
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers, processors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
STOPWORDS = set(stopwords.words('english'))

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import confusion_matrix, classification_report

In [70]:
df = pd.read_csv('imdb_sup.csv')
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,"Kurt Russell's chameleon-like performance, cou...",10,1
1,It was extremely low budget(it some scenes it ...,8,1
2,James Cagney is best known for his tough chara...,8,1
3,"Following the brilliant ""Goyôkiba"" (aka. ""Hanz...",8,1
4,One of the last classics of the French New Wav...,10,1


In [71]:
df.describe()

Unnamed: 0,Rating,Sentiment
count,50000.0,50000.0
mean,5.49534,0.5
std,3.478721,0.500005
min,1.0,0.0
25%,2.0,0.0
50%,5.5,0.5
75%,9.0,1.0
max,10.0,1.0


In [72]:
df.isna().sum()

Review       0
Rating       0
Sentiment    0
dtype: int64

In [73]:
df.duplicated().sum()

np.int64(414)

In [74]:
df=df.drop_duplicates(keep='first')

In [75]:
texts = df['Review'].tolist()
labels = df['Rating'].tolist()
labels = [rating - 1 for rating in labels]
print(labels)

[9, 7, 7, 7, 9, 9, 9, 6, 8, 6, 6, 9, 9, 9, 6, 7, 9, 9, 7, 6, 7, 9, 8, 7, 7, 8, 8, 7, 9, 9, 8, 6, 6, 9, 7, 8, 8, 7, 6, 9, 7, 7, 8, 8, 9, 9, 8, 7, 9, 9, 6, 8, 9, 9, 7, 7, 9, 7, 7, 9, 8, 9, 8, 9, 6, 6, 9, 6, 8, 6, 6, 9, 9, 7, 7, 9, 7, 9, 9, 7, 7, 7, 8, 7, 7, 6, 9, 7, 6, 7, 6, 9, 9, 9, 8, 7, 9, 6, 8, 6, 6, 6, 7, 9, 7, 7, 6, 6, 9, 8, 6, 9, 7, 9, 8, 7, 9, 9, 7, 6, 7, 7, 8, 6, 9, 7, 9, 9, 6, 9, 7, 8, 8, 9, 8, 9, 9, 8, 6, 9, 8, 8, 6, 9, 9, 6, 8, 9, 7, 7, 6, 6, 7, 7, 7, 9, 7, 6, 6, 6, 9, 8, 7, 9, 6, 9, 6, 6, 9, 8, 7, 6, 8, 6, 9, 7, 9, 9, 9, 6, 7, 6, 7, 8, 6, 7, 8, 9, 8, 9, 8, 8, 8, 6, 9, 6, 7, 9, 9, 6, 9, 7, 7, 9, 9, 6, 7, 6, 6, 9, 7, 8, 9, 6, 9, 7, 7, 8, 6, 7, 8, 9, 9, 6, 9, 9, 6, 9, 7, 9, 9, 9, 8, 9, 7, 8, 7, 9, 6, 9, 6, 7, 9, 7, 6, 9, 7, 6, 9, 7, 6, 6, 9, 8, 9, 7, 7, 6, 6, 9, 7, 7, 9, 9, 9, 7, 7, 9, 7, 7, 6, 9, 8, 9, 7, 9, 8, 9, 6, 7, 9, 9, 7, 7, 9, 9, 6, 6, 9, 9, 9, 7, 8, 6, 6, 8, 8, 6, 7, 7, 6, 8, 9, 6, 7, 7, 9, 9, 6, 6, 9, 9, 6, 8, 9, 8, 9, 8, 7, 9, 6, 8, 8, 9, 9, 8, 9, 9, 6, 6, 7, 8, 9, 

In [76]:
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [77]:
len(x_test) + len(x_train)

49586

# Preprocess data

In [78]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    text = text.strip()
    return text

In [79]:
flatten_data = [''.join(text) for text in x_train]

In [80]:
max_length = 256
vocab_size = 12000

tokenizer = Tokenizer(models.BPE(unk_token='=<UNK>'))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

special_tokens = ['<PAD>', '<UNK>']
tokenizer.normalizer = normalizers.Lowercase()

trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens = special_tokens)
tokenizer.train_from_iterator(flatten_data, trainer)
tokenizer.add_special_tokens(special_tokens)

tokenizer.post_processor = processors.TemplateProcessing(
    single = '$A', # for single sequence
    pair = '$A $B:1', # for pair sequence like for question answering
    special_tokens = [
        ('<PAD>',tokenizer.token_to_id('<PAD>')),
        ('<UNK>',tokenizer.token_to_id('<UNK>'))
    ],
)

tokenizer.enable_truncation(max_length=max_length)
tokenizer.enable_padding(pad_id=tokenizer.token_to_id('<PAD>'),
                         pad_token='<PAD>',
                         length=max_length)

tokenizer_file_name = 'bpe_tokenizer.json'
tokenizer.save(tokenizer_file_name)

tokenizer = Tokenizer.from_file(tokenizer_file_name)






In [81]:
def tokenize_text(text):
    encodings = tokenizer.encode_batch(text)
    return [encode.ids for encode in encodings]

In [82]:
def create_dataloader(sequences, labels, batch_size=64):
    sequences = torch.tensor(sequences)
    labels = torch.tensor(labels, dtype=torch.float)
    dataset = TensorDataset(sequences, labels)

    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [83]:
def preprocess_texts(texts, labels, max_len=None, batch_size=64):
    cleaned_texts = [clean_text(text) for text in texts]
    padded_sequences = tokenize_text(cleaned_texts)
    return create_dataloader(padded_sequences, labels, batch_size)

# Model

In [84]:
cfg = {
    "vocab_size":vocab_size,
    "emb_dim":128,
    "hidden_dim": 512,
    "num_layers":5,
    "bidirectional": True,
    "dropout": 0.15,
    "seq_len":256,
}

In [85]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, cfg):
        super(SentimentAnalysisModel, self).__init__()

        #input : (batch_size , sequence_length) ---> (64,256) in training
        self.embedding = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        #output : (batch_size , sequence_length , embedding_dim) every token get it's embedding

        #input : (batch_size , sequence_length , embedding_dim)
        self.lstm = nn.LSTM(
            input_size=cfg['emb_dim'],
            hidden_size=cfg['hidden_dim'],
            num_layers=cfg['num_layers'],
            batch_first=True,
            bidirectional=cfg['bidirectional'],
            dropout=cfg['dropout']
        )
        #output : (batch_size , sequence_length , hidden_dim*2) because it's bidirectinoal

        # conv1d expects the shape of input as following
        # (batch_size , hidden_dim*2 , sequence_length) so we need to convert the last output to this

        #input (batch_size , sequence_length , hidden_dim*2) -> !(batch_size , hidden_dim*2 , sequence_length)! (used permute in forward)
        self.conv1 = nn.Conv1d(cfg['hidden_dim'] * (2 if cfg['bidirectional'] else 1), 128, kernel_size=3, padding=1)
        #output: (batch_size,out_channels,sequence_length)

        #input: (batch_size,out_channels,sequence_length)
        self.pool = nn.MaxPool1d(kernel_size=2)
        #output: (batch_size,out_channels , sequence_length/2)

        #input: (batch_size,out_channels , sequence_length/2) --> (batch_size,out_channels*sequence_length/2)
        self.fc1 = nn.Linear(128 * (cfg['seq_len'] // 2), 64)
        #output: (batch_size , 64)

        #input: (batch_size,64)
        self.fc2 = nn.Linear(64, 10)
        #output: 10
        self.dropout = nn.Dropout(cfg['dropout'])
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)

        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out.permute(0, 2, 1)

        conv_out = self.conv1(lstm_out)
        pooled_out = self.pool(conv_out)

        flattened = pooled_out.view(pooled_out.size(0), -1)

        x = self.dropout(torch.relu(self.fc1(flattened)))
        logits = self.fc2(x)

        return logits


In [86]:
train_dataloader = preprocess_texts(x_train, y_train)

test_dataloader = preprocess_texts(x_test, y_test)

In [87]:
model = SentimentAnalysisModel(cfg)
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
print(device)

model = nn.DataParallel(model).to(device)
model.to(device)

mps


DataParallel(
  (module): SentimentAnalysisModel(
    (embedding): Embedding(12000, 128)
    (lstm): LSTM(128, 512, num_layers=5, batch_first=True, dropout=0.15, bidirectional=True)
    (conv1): Conv1d(1024, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (fc1): Linear(in_features=16384, out_features=64, bias=True)
    (fc2): Linear(in_features=64, out_features=10, bias=True)
    (dropout): Dropout(p=0.15, inplace=False)
    (sigmoid): Sigmoid()
  )
)

In [88]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [89]:
def calculate_accuracy(output, targets):
    preds = torch.argmax(output, dim=1)
    correct = preds.eq(targets).sum()
    return correct / targets.size(0)

In [90]:
epochs = 15
for i in range(epochs):
    model.train()
    epoch_train_loss = 0
    epoch_train_acc = 0
    for batch_seq, batch_labels in train_dataloader:
        # print('.', end='')
        batch_seq, batch_labels = batch_seq.to(device), batch_labels.to(device)
        output = model(batch_seq)

        loss = criterion(output, batch_labels)

        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()
        epoch_train_acc += calculate_accuracy(output, batch_labels).item()
        optimizer.zero_grad()

    train_loss = epoch_train_loss / len(train_dataloader)
    train_acc = epoch_train_acc / len(train_dataloader)

    model.eval()
    epoch_val_loss = 0
    epoch_val_acc = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_seq, batch_labels in test_dataloader:
            batch_seq, batch_labels = batch_seq.to(device), batch_labels.to(device)
            output = model(batch_seq)

            loss = criterion(output, batch_labels)
            epoch_val_loss += loss.item()
            epoch_val_acc += calculate_accuracy(output, batch_labels).item()

            preds = torch.argmax(output, dim=1).cpu().numpy()
            labels = batch_labels.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)

    val_loss = epoch_val_loss / len(test_dataloader)
    val_acc = epoch_val_acc / len(test_dataloader)

    if i == epochs - 1:
        conf_matrix = confusion_matrix(all_labels, all_preds)
        class_report = classification_report(
            all_labels,
            all_preds,
            labels=list(range(10)),
            target_names=[str(i + 1) for i in range(10)]
        )

    print(f"Epoch {i + 1}:")
    print(f"Training - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    print("-" * 50)

print("Final Confusion Matrix:")
print(conf_matrix)
print("Final Classification Report:")
print(class_report)

............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................Epoch 1:
Training - Loss: 1.9497, Accuracy: 0.2724
Validation - Loss: 1.7759, Accuracy: 0.3376
--------------------------------------------------
..........................................................................................................................................................................................................................................

ValueError: Number of classes, 8, does not match size of target_names, 10. Try specifying the labels parameter

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(conf_matrix, class_names):
    plt.figure(figsize=(12, 8))
    sns.set(font_scale=1.2)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

class_names = [str(i) for i in range(1, 11)]

plot_confusion_matrix(conf_matrix, class_names)

In [None]:
torch.save(model.state_dict(), 'sentiment_analysis_model.pth')
print("Model saved to sentiment_analysis_model.pth")

# Inference

In [None]:
import torch
import torch.nn as nn
from tokenizers import Tokenizer
import re


device = torch.device('mps' if torch.mps.is_available() else 'cpu')

loaded_model = SentimentAnalysisModel(cfg)

state_dict = torch.load('sentiment_analysis_model.pth', map_location=device)

loaded_model.load_state_dict(state_dict)

loaded_model = loaded_model.to(device)

loaded_model.eval()
print(f"Model loaded from sentiment_analysis_model.pth and moved to {device}")

tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = text.strip()
    return text

def inference(model, text, tokenizer, device, cfg):
    model.eval()
    cleaned_text = clean_text(text)
    encoded = tokenizer.encode(cleaned_text)
    input_ids = torch.tensor([encoded.ids]).to(device)

    with torch.no_grad():
        output = model(input_ids)
        probabilities = torch.softmax(output, dim=1).cpu().numpy()[0]

    predicted_class = torch.argmax(output, dim=1).item()
    rating = predicted_class + 1

    return rating, probabilities

In [None]:
text_to_analyze = "what a very bad movie"
rating, probability = inference(loaded_model, text_to_analyze, tokenizer, device, cfg)
print(f"Predicted Rating: {rating}")
print(f"Probability: {probability:.4f}")

In [None]:
text_to_analyze = "This movie was absolutely incredible, a must-watch!"
rating, probabilities = inference(loaded_model, text_to_analyze, tokenizer, device, cfg)
print(f"Predicted Rating: {rating}")
print(f"Class Probabilities: {['{:.4f}'.format(p) for p in probabilities]}")