# Machine Learning Final Project: Sentiment Analysis (Fall 2024)

**Group Members**

* Amit Sarvate (100794129)
* Nirujan Velvarathan (100706828)


**Overview**

* We aim to classify movie reviews into positive or negative sentiments using a large, popular dataset containing 50,000 instances. 
* To achieve this, we will experiment with three different network architectures: 
    * a Feedforward Neural Network with pre-trained embeddings, 
    * a Convolutional Neural Network (CNN), 
    * and a Gated Recurrent Unit (GRU). 
* The goal is to compare their performance on sentiment classification and identify the most effective model. 
* Additionally, we will develop an application where users can input a review and receive a sentiment prediction.

### Importing External Libraries

In order to preprocess data as well as build, train and test our models - we will require various different essential ML libraries including pandas, sklearn, torch, and keras

---

## 1. Feed-forward Neural Network (FNN) 

In [40]:
import warnings


from importlib import reload
# Suppress specific warnings
warnings.filterwarnings("ignore")

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader # type: ignore
import torch.nn as nn
import torch.nn.functional as F
import models 
import torch.optim as optim

In [18]:
### LOADING DATASET AND PREPROCESSING 
df_FNN = pd.read_csv("data/IMDB Dataset.csv")

vectorizer = CountVectorizer(max_features=5000)
X_FNN = vectorizer.fit_transform(df_FNN['review']).toarray()

label_encoder = LabelEncoder()
y_FNN = label_encoder.fit_transform(df_FNN['sentiment'])

X_train_FNN, X_test_FNN, y_train_FNN, y_test_FNN = train_test_split(X_FNN, y_FNN, test_size=0.2, random_state=42)

In [19]:
class SentimentDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        # self.data = data.clone().detach().float()
        # self.labels = labels.clone().detach().long()


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index] 

In [20]:
dataset_tr_FNN = SentimentDataset(X_train_FNN, y_train_FNN)
dataset_te_FNN = SentimentDataset(X_test_FNN, y_test_FNN)

In [21]:
loader_tr_FNN = DataLoader(dataset_tr_FNN, batch_size=32, shuffle=True)
loader_te_FNN = DataLoader(dataset_te_FNN, batch_size=32, shuffle=False)

In [22]:
input_dim = X_train_FNN.shape[1]
hidden_dim = 500
output_dim = 2 # positive and negative 

reload(models)
model_FNN = models.FeedforwardNeuralNetwork(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()  # Suitable for classification
optimizer = optim.Adam(model_FNN.parameters(), lr=1e-3)

num_epochs = 10
for epoch in range(num_epochs):
    model_FNN.train()
    total_loss = 0
    for data, labels in loader_tr_FNN:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model_FNN(data)
        
        # Compute loss
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(loader_tr_FNN):.4f}")

Epoch [1/10], Loss: 0.3164
Epoch [2/10], Loss: 0.2144
Epoch [3/10], Loss: 0.1315
Epoch [4/10], Loss: 0.0525
Epoch [5/10], Loss: 0.0261
Epoch [6/10], Loss: 0.0191
Epoch [7/10], Loss: 0.0153
Epoch [8/10], Loss: 0.0145
Epoch [9/10], Loss: 0.0105
Epoch [10/10], Loss: 0.0138


In [23]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, labels in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)  # Get class with highest score
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

accuracy = evaluate_model(model_FNN, loader_te_FNN)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 88.50%


In [24]:
def predict_sentiment(model, review, vectorizer):
    model.eval()
    with torch.no_grad():
        bow_vector = vectorizer.transform([review]).toarray()
        bow_tensor = torch.tensor(bow_vector, dtype=torch.float32)
        output = model(bow_tensor)
        _, prediction = torch.max(output, 1)
        return label_encoder.inverse_transform([prediction.item()])[0]

new_review = "The movie was not good! I hated it."
print("Sentiment:", predict_sentiment(model_FNN, new_review, vectorizer))

Sentiment: negative


---

## 2. Convolutional Neural Network (CNN)

In [1]:
import torch
import torchtext
print("PyTorch Version:", torch.__version__)
print("TorchText Version:", torchtext.__version__)
print("Is CUDA available?", torch.cuda.is_available())

PyTorch Version: 2.1.0+cpu
TorchText Version: 0.16.0+cpu
Is CUDA available? False


In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

In [7]:
df_CNN = pd.read_csv("data/IMDB Dataset.csv")

In [8]:
# Tokenize text
max_features = 5000  
max_len = 100  # Maximum sequence length
tokenizer = get_tokenizer("basic_english")  # Use basic English tokenizer

# Build vocabulary from the dataset
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Encode text as sequences of token indices
def encode_text(text):
    tokens = tokenizer(text)
    token_indices = [vocab[token] for token in tokens]
    return token_indices

In [9]:
# Pad sequences to the same length
def pad_sequence_to_max_len(sequences, max_len):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_len:
            seq += [0] * (max_len - len(seq))  # Padding with 0
        else:
            seq = seq[:max_len]  # Truncate if longer than max_len
        padded_sequences.append(seq)
    return torch.tensor(padded_sequences)

In [10]:
# Create a vocabulary
vocab = build_vocab_from_iterator(
    yield_tokens(df_CNN['review']),
    specials=["<unk>"],
    max_tokens=max_features  # Limit vocab size to max_features
)

vocab.set_default_index(vocab["<unk>"])  # Handle out-of-vocabulary tokens

# Encode the dataset
X_CNN = [encode_text(review) for review in df_CNN['review']]

X_CNN = pad_sequence_to_max_len(X_CNN, max_len)

# Encode labels
label_mapping = {"positive": 1, "negative": 0}  # Map sentiments to integers
y_CNN = torch.tensor([label_mapping[label] for label in df_CNN['sentiment']])

# Train-test split
X_train_CNN, X_test_CNN, y_train_CNN, y_test_CNN = train_test_split(X_CNN, y_CNN, test_size=0.2, random_state=42)

X_train_CNN = X_train_CNN.clone().detach()
X_test_CNN = X_test_CNN.clone().detach()

In [11]:
class SentimentDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data.clone().detach().long()  
        self.labels = labels.clone().detach().long()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index] 

In [12]:
dataset_tr_CNN = SentimentDataset(X_train_CNN, y_train_CNN)
dataset_te_CNN = SentimentDataset(X_test_CNN, y_test_CNN)

In [13]:
loader_tr_CNN = DataLoader(dataset_tr_CNN, batch_size=32, shuffle=True)
loader_te_CNN = DataLoader(dataset_te_CNN, batch_size=32, shuffle=False)

In [16]:
vocab_size = max_features  
embed_dim = 100            
kernel_sizes = [3, 4, 5]   
num_filters = 100          
num_classes = 2

reload(models)
model_CNN = models.ConvolutionalNeuralNetwork(vocab_size, embed_dim, num_classes, kernel_sizes, num_filters)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_CNN.parameters(), lr=1e-3)

In [17]:
num_epochs = 10
for epoch in range(num_epochs):
    model_CNN.train()
    total_loss = 0
    for data, labels in loader_tr_CNN:
        optimizer.zero_grad()
        outputs = model_CNN(data)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(loader_tr_CNN):.4f}")

Epoch [1/10], Loss: 0.5213
Epoch [2/10], Loss: 0.3568
Epoch [3/10], Loss: 0.2373
Epoch [4/10], Loss: 0.1236
Epoch [5/10], Loss: 0.0518
Epoch [6/10], Loss: 0.0258
Epoch [7/10], Loss: 0.0234
Epoch [8/10], Loss: 0.0408
Epoch [9/10], Loss: 0.0248
Epoch [10/10], Loss: 0.0163


# 3. Gated Recurrent Network (GRU)

In [27]:
from sklearn.metrics import accuracy_score, classification_report
reload(models)

<module 'models' from 'c:\\Users\\Nirujan\\Documents\\GitHub\\sentiment-analysis-ml\\models.py'>

In [32]:
df_GRU = pd.read_csv("data/IMDB Dataset.csv")

In [28]:
def train_model(model, train_loader, val_loader, epochs, learning_rate, device):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        predictions, targets = [], []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

        train_acc = accuracy_score(targets, predictions)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}, Accuracy: {train_acc:.4f}")
        
        evaluate_model(model, val_loader, device)

In [29]:
def evaluate_model(model, val_loader, device):
    model.eval()
    val_predictions, val_targets = [], []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    val_acc = accuracy_score(val_targets, val_predictions)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(classification_report(val_targets, val_predictions))

In [36]:
class SentimentDatasetGRU(Dataset):
    def __init__(self, data, labels):
        self.data = data.long()  # Ensure inputs are LongTensor
        self.labels = labels.long()  # Ensure labels are LongTensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

In [38]:
X_GRU = [encode_text(review) for review in df_GRU['review']]
X_GRU = pad_sequence_to_max_len(X_GRU, max_len).long()

labels = torch.tensor([label_mapping[label] for label in df_GRU['sentiment']])

In [37]:
X_train, X_val, y_train, y_val = train_test_split(X_GRU, labels, test_size=0.2, random_state=42)

train_dataset_GRU = SentimentDatasetGRU(X_train, y_train)
val_dataset_GRU = SentimentDatasetGRU(X_val, y_val)

# Create DataLoaders
loader_tr_GRU  = DataLoader(train_dataset_GRU, batch_size=32, shuffle=True)
loader_te_GRU  = DataLoader(val_dataset_GRU, batch_size=32, shuffle=False)

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = 10000  # Replace with your dataset's vocabulary size
embed_dim = 100
hidden_dim = 128
num_classes = 2  # Binary classification for sentiment analysis

model_GRU = models.GRUNeuralNetwork(vocab_size, embed_dim, hidden_dim, num_classes)
train_model(model_GRU, loader_tr_GRU, loader_te_GRU, epochs=10, learning_rate=0.001, device=device)



Epoch 1/10, Loss: 0.6436667266368866, Accuracy: 0.6068
Validation Accuracy: 0.7689
              precision    recall  f1-score   support

           0       0.76      0.77      0.77      4961
           1       0.77      0.76      0.77      5039

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

Epoch 2/10, Loss: 0.41464263858795164, Accuracy: 0.8104
Validation Accuracy: 0.8237
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      4961
           1       0.81      0.84      0.83      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

Epoch 3/10, Loss: 0.3286949941933155, Accuracy: 0.8571
Validation Accuracy: 0.8270
              precision    recall  f1-score   support

           0       0.84      0.80      0.