# Machine Learning Final Project: Sentiment Analysis (Fall 2024)

**Group Members**

* Amit Sarvate (100794129)
* Nirujan Velvarathan (100706828)


**Overview**

* We aim to classify movie reviews as expressing positive or negative sentiment, using a large, popular dataset (the IMDB movie-review dataset) containing 50,000 reviews.
* To achieve this, we will experiment with three different network architectures: 
    * a Feedforward Neural Network with pre-trained embeddings, 
    * a Convolutional Neural Network (CNN), 
    * and a Gated Recurrent Unit (GRU). 
* The goal is to compare their performance on sentiment classification and identify the most effective model. 
* Additionally, we will develop an application where users can input a review and receive a sentiment prediction.

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import models
import torch.optim as optim

In [15]:
### LOADING DATASET AND PREPROCESSING
# Read the raw reviews; the columns used below are 'review' and 'sentiment'.
df = pd.read_csv("data/IMDB Dataset.csv")

# Bag-of-words counts restricted to the 5000 most frequent terms, densified
# so the matrix can later be turned into a torch tensor.
# NOTE(review): the vocabulary is fitted on every review before the split
# below, so test-set text influences the feature space -- consider fitting on
# the training split only.
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["review"]).toarray()

# Map the sentiment strings to integer class ids; the fitted encoder is kept
# so predictions can be decoded back to labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Both inputs are copied into tensors once, at construction time:
    ``data`` as float32 and ``labels`` as int64 (``torch.long``).
    """

    def __init__(self, data, labels):
        """Store ``data`` and ``labels`` as tensors on the instance."""
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.data[index]
        target = self.labels[index]
        return features, target

In [22]:
# Wrap each split's features and labels in a SentimentDataset.
dataset_tr, dataset_te = (
    SentimentDataset(X_train, y_train),
    SentimentDataset(X_test, y_test),
)

In [23]:
# Batch both splits in mini-batches of 32. The datasets in this notebook are
# named `dataset_tr` / `dataset_te`; the names `train_dataset` / `test_dataset`
# are never defined, so using them raises NameError on a fresh kernel.
# Only the training batches are shuffled; evaluation order stays fixed.
loader_tr = DataLoader(dataset_tr, batch_size=32, shuffle=True)
loader_te = DataLoader(dataset_te, batch_size=32, shuffle=False)

In [28]:
# Model dimensions: one input unit per feature column of the training matrix.
input_dim = X_train.shape[1]
hidden_dim = 500
output_dim = 2 # positive and negative 
model = models.FeedforwardNeuralNetwork(input_dim, hidden_dim, output_dim)

# CrossEntropyLoss takes per-class scores plus integer class labels.
# NOTE(review): assumes the model's forward pass returns unnormalised scores
# (no softmax applied) -- confirm in models.py.
criterion = nn.CrossEntropyLoss()  # Suitable for classification
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # Iterate over the training DataLoader. It is defined in this notebook as
    # `loader_tr`; `train_loader` is never assigned, so referencing it fails
    # with NameError on Restart & Run All.
    for data, labels in loader_tr:
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(data)
        
        # Compute loss
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
    
    # Report the mean per-batch loss for this epoch.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(loader_tr):.4f}")

Epoch [1/10], Loss: 0.3184
Epoch [2/10], Loss: 0.2162
Epoch [3/10], Loss: 0.1346
Epoch [4/10], Loss: 0.0608
Epoch [5/10], Loss: 0.0274
Epoch [6/10], Loss: 0.0188
Epoch [7/10], Loss: 0.0157
Epoch [8/10], Loss: 0.0120
Epoch [9/10], Loss: 0.0154
Epoch [10/10], Loss: 0.0103


In [29]:
def evaluate_model(model, test_loader):
    """Compute classification accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    For every ``(data, labels)`` batch the predicted class is the index of
    the highest score along dimension 1.

    Args:
        model: Callable module exposing ``eval()``; called on each data batch.
        test_loader: Iterable yielding ``(data, labels)`` batches.

    Returns:
        Fraction of samples whose predicted class equals the label.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            scores = model(batch_x)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            predicted = torch.max(scores, 1).indices
            n_seen += batch_y.size(0)
            n_correct += (predicted == batch_y).sum().item()
    return n_correct / n_seen

# Score the trained model on the held-out batches. The test DataLoader is
# defined in this notebook as `loader_te`; `test_loader` is only the parameter
# name inside evaluate_model and does not exist at notebook level.
accuracy = evaluate_model(model, loader_te)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 88.15%


In [31]:
def predict_sentiment(model, review, vectorizer, encoder=None):
    """Predict the sentiment label for a single raw review string.

    Args:
        model: Classifier exposing ``eval()``; called on a one-row float32
            tensor built from the vectorized review.
        review: Raw review text.
        vectorizer: Fitted vectorizer; ``transform([review])`` must return an
            object with ``toarray()``.
        encoder: Optional fitted label encoder providing
            ``inverse_transform``. When omitted, the notebook-level
            ``label_encoder`` is used, matching the previous behaviour.

    Returns:
        The decoded label of the highest-scoring class.
    """
    if encoder is None:
        # Backward-compatible fallback: earlier callers relied on the global
        # encoder fitted in the preprocessing cell.
        encoder = label_encoder

    model.eval()
    with torch.no_grad():
        bow_vector = vectorizer.transform([review]).toarray()
        bow_tensor = torch.tensor(bow_vector, dtype=torch.float32)
        output = model(bow_tensor)
        # Index of the highest score along the class dimension.
        _, prediction = torch.max(output, 1)
    return encoder.inverse_transform([prediction.item()])[0]

# Try the predictor on a hand-written review.
new_review = "The movie was not good! I hated it."
predicted_label = predict_sentiment(model, new_review, vectorizer)
print("Sentiment:", predicted_label)

Sentiment: negative
