# Machine Learning Final Project: Sentiment Analysis (Fall 2024)

**Group Members**

* Amit Sarvate (100794129)
* Nirujan Velvarathan (100706828)


**Overview**

* We aim to classify movie reviews into positive or negative sentiments using a large, popular dataset containing 50,000 instances. 
* To achieve this, we will experiment with three different network architectures: 
    * a Feedforward Neural Network with pre-trained embeddings, 
    * a Convolutional Neural Network (CNN), 
    * and a Gated Recurrent Unit (GRU). 
* The goal is to compare their performance on sentiment classification and identify the most effective model. 
* Additionally, we will develop an application where users can input a review and receive a sentiment prediction.

### Importing External Libraries

In order to preprocess data as well as build, train and test our models - we will require various different essential ML libraries including pandas, sklearn, torch, and keras

---

## 1. Feed-forward Neural Network (FNN) 

In [1]:
from importlib import reload

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import models 
import torch.optim as optim

In [4]:
### LOADING DATASET AND PREPROCESSING 
df_FNN = pd.read_csv("data/IMDB Dataset.csv")

vectorizer = CountVectorizer(max_features=5000)
X_FNN = vectorizer.fit_transform(df_FNN['review']).toarray()

label_encoder = LabelEncoder()
y_FNN = label_encoder.fit_transform(df_FNN['sentiment'])

X_train_FNN, X_test_FNN, y_train_FNN, y_test_FNN = train_test_split(X_FNN, y_FNN, test_size=0.2, random_state=42)

In [7]:
class SentimentDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        # self.data = data.clone().detach().float()
        # self.labels = labels.clone().detach().long()


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index] 

In [8]:
dataset_tr_FNN = SentimentDataset(X_train_FNN, y_train_FNN)
dataset_te_FNN = SentimentDataset(X_test_FNN, y_test_FNN)

In [9]:
loader_tr_FNN = DataLoader(dataset_tr_FNN, batch_size=32, shuffle=True)
loader_te_FNN = DataLoader(dataset_te_FNN, batch_size=32, shuffle=False)

In [10]:
input_dim = X_train_FNN.shape[1]
hidden_dim = 500
output_dim = 2 # positive and negative 

reload(models)
model_FNN = models.FeedforwardNeuralNetwork(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()  # Suitable for classification
optimizer = optim.Adam(model_FNN.parameters(), lr=1e-3)

num_epochs = 10
for epoch in range(num_epochs):
    model_FNN.train()
    total_loss = 0
    for data, labels in loader_tr_FNN:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model_FNN(data)
        
        # Compute loss
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(loader_tr_FNN):.4f}")

Epoch [1/10], Loss: 0.3166
Epoch [2/10], Loss: 0.2154
Epoch [3/10], Loss: 0.1391
Epoch [4/10], Loss: 0.0629
Epoch [5/10], Loss: 0.0248
Epoch [6/10], Loss: 0.0188
Epoch [7/10], Loss: 0.0146
Epoch [8/10], Loss: 0.0122
Epoch [9/10], Loss: 0.0140
Epoch [10/10], Loss: 0.0110


In [11]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, labels in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)  # Get class with highest score
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

accuracy = evaluate_model(model_FNN, loader_te_FNN)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 87.98%


In [12]:
def predict_sentiment(model, review, vectorizer):
    model.eval()
    with torch.no_grad():
        bow_vector = vectorizer.transform([review]).toarray()
        bow_tensor = torch.tensor(bow_vector, dtype=torch.float32)
        output = model(bow_tensor)
        _, prediction = torch.max(output, 1)
        return label_encoder.inverse_transform([prediction.item()])[0]

new_review = "The movie was not good! I hated it."
print("Sentiment:", predict_sentiment(model_FNN, new_review, vectorizer))

Sentiment: negative


---

## 2. Convolutional Neural Network (CNN)

In [23]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

In [24]:
df_CNN = pd.read_csv("data/IMDB Dataset.csv")

In [25]:
# Tokenize text
max_features = 5000  
max_len = 100  # Maximum sequence length
tokenizer = get_tokenizer("basic_english")  # Use basic English tokenizer

# Build vocabulary from the dataset
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Encode text as sequences of token indices
def encode_text(text):
    tokens = tokenizer(text)
    token_indices = [vocab[token] for token in tokens]
    return token_indices

In [26]:
# Pad sequences to the same length
def pad_sequence_to_max_len(sequences, max_len):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_len:
            seq += [0] * (max_len - len(seq))  # Padding with 0
        else:
            seq = seq[:max_len]  # Truncate if longer than max_len
        padded_sequences.append(seq)
    return torch.tensor(padded_sequences)

In [27]:
# Create a vocabulary
vocab = build_vocab_from_iterator(
    yield_tokens(df_CNN['review']),
    specials=["<unk>"],
    max_tokens=max_features  # Limit vocab size to max_features
)

vocab.set_default_index(vocab["<unk>"])  # Handle out-of-vocabulary tokens

# Encode the dataset
X_CNN = [encode_text(review) for review in df_CNN['review']]

X_CNN = pad_sequence_to_max_len(X_CNN, max_len)

# Encode labels
label_mapping = {"positive": 1, "negative": 0}  # Map sentiments to integers
y_CNN = torch.tensor([label_mapping[label] for label in df_CNN['sentiment']])

# Train-test split
X_train_CNN, X_test_CNN, y_train_CNN, y_test_CNN = train_test_split(X_CNN, y_CNN, test_size=0.2, random_state=42)

X_train_CNN = X_train_CNN.clone().detach()
X_test_CNN = X_test_CNN.clone().detach()

In [28]:
class SentimentDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data.clone().detach().long()  
        self.labels = labels.clone().detach().long()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index] 

In [29]:
dataset_tr_CNN = SentimentDataset(X_train_CNN, y_train_CNN)
dataset_te_CNN = SentimentDataset(X_test_CNN, y_test_CNN)

In [30]:
loader_tr_CNN = DataLoader(dataset_tr_CNN, batch_size=32, shuffle=True)
loader_te_CNN = DataLoader(dataset_te_CNN, batch_size=32, shuffle=False)

In [31]:
vocab_size = max_features  
embed_dim = 100            
kernel_sizes = [3, 4, 5]   
num_filters = 100          
num_classes = 2

reload(models)
model_CNN = models.ConvolutionalNeuralNetwork(vocab_size, embed_dim, num_classes, kernel_sizes, num_filters)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_CNN.parameters(), lr=1e-3)

In [32]:
num_epochs = 10
for epoch in range(num_epochs):
    model_CNN.train()
    total_loss = 0
    for data, labels in loader_tr_CNN:
        optimizer.zero_grad()
        outputs = model_CNN(data)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(loader_tr_CNN):.4f}")

Epoch [1/10], Loss: 0.5155
Epoch [2/10], Loss: 0.3597
Epoch [3/10], Loss: 0.2380
Epoch [4/10], Loss: 0.1234
Epoch [5/10], Loss: 0.0574
Epoch [6/10], Loss: 0.0267
Epoch [7/10], Loss: 0.0204
Epoch [8/10], Loss: 0.0454
Epoch [9/10], Loss: 0.0266
Epoch [10/10], Loss: 0.0204
