# Machine Learning Final Project: Sentiment Analysis (Fall 2024)

**Group Members**

* Amit Sarvate (100794129)
* Nirujan Velvarathan (100706828)


**Overview**

* We aim to classify movie reviews as expressing positive or negative sentiment, using the IMDB movie review dataset, a large and popular dataset containing 50,000 reviews.
* To achieve this, we will experiment with three different network architectures: 
    * a Feedforward Neural Network with pre-trained embeddings, 
    * a Convolutional Neural Network (CNN), 
    * and a Gated Recurrent Unit (GRU). 
* The goal is to compare their performance on sentiment classification and identify the most effective model. 
* Additionally, we will develop an application where users can input a review and receive a sentiment prediction.



In [25]:
from importlib import reload
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from gensim import corpora
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import models

In [2]:
def load_dataset(path="data/IMDB Dataset.csv"):
    """Load the movie-review dataset from a CSV file.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv``
            accepts). Defaults to the project's ``data/IMDB Dataset.csv``
            so existing ``load_dataset()`` calls behave as before.

    Returns:
        pandas.DataFrame: The parsed CSV contents.
    """
    return pd.read_csv(path)

In [3]:
def pre_process_df(dataframe):
    """Add tokenized and stemmed versions of each review to the dataframe.

    Two columns are written onto ``dataframe`` in place:

    * ``tokenized_text`` - ``simple_preprocess(review, deacc=True)`` per row
    * ``stemmed_tokens`` - the Porter stem of every token in that row

    Args:
        dataframe: DataFrame with a ``review`` column of text.

    Returns:
        The same dataframe object, with the two columns added.
    """
    # Tokenize every review
    token_lists = []
    for raw_review in dataframe['review']:
        token_lists.append(simple_preprocess(raw_review, deacc=True))
    dataframe['tokenized_text'] = token_lists

    # Stem every token, keeping one list per review
    stemmer = PorterStemmer()
    stemmed_lists = []
    for review_tokens in dataframe['tokenized_text']:
        stemmed_lists.append([stemmer.stem(token) for token in review_tokens])
    dataframe['stemmed_tokens'] = stemmed_lists

    return dataframe

In [10]:
def split_train_test(dataframe, test_size=0.3, shuffle_state=True):
    """Split the pre-processed reviews into train and test partitions.

    Args:
        dataframe: DataFrame containing the ``review``, ``tokenized_text``,
            ``stemmed_tokens`` and ``sentiment`` columns.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        shuffle_state: Passed to ``train_test_split`` as ``shuffle``.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``, each re-indexed from 0.
        The label counts of both partitions and a preview of ``X_train`` are
        printed as a side effect.
    """
    feature_cols = ['review', 'tokenized_text', 'stemmed_tokens']
    label_col = 'sentiment'

    # Fixed random_state keeps the partition reproducible between runs
    parts = train_test_split(
        dataframe[feature_cols],
        dataframe[label_col],
        test_size=test_size,
        shuffle=shuffle_state,
        random_state=15,
    )
    train_labels, test_labels = parts[2], parts[3]

    # Report the class balance of each partition
    print("Value counts for Train sentiments")
    print(train_labels.value_counts())
    print("Value counts for Test sentiments")
    print(test_labels.value_counts())

    # Discard the original row labels so every piece is indexed 0..n-1
    X_train, X_test, Y_train, Y_test = [
        piece.reset_index(drop=True) for piece in parts
    ]

    # Debug output
    print(type(X_train))
    print(type(Y_train))
    print(X_train.head())

    return X_train, X_test, Y_train, Y_test

In [19]:
def make_dict(dataframe, padding=True):
    """Build a gensim ``Dictionary`` from the ``stemmed_tokens`` column.

    Args:
        dataframe: DataFrame whose ``stemmed_tokens`` column holds one list
            of tokens per row.
        padding: When true, the dictionary is seeded with a single ``'pad'``
            token before the review tokens are added.

    Returns:
        corpora.Dictionary: The vocabulary. A short message describing which
        variant was built is printed as a side effect.
    """
    if not padding:
        print("Dictionary without padding")
        return corpora.Dictionary(dataframe['stemmed_tokens'])

    print("Dictionary with padded token added")
    # Seed with 'pad' first so it is registered ahead of every review token
    vocabulary = corpora.Dictionary([['pad']])
    vocabulary.add_documents(dataframe['stemmed_tokens'])
    return vocabulary

In [20]:
def make_bow_vector(review_dict, sentence):
    """Build a bag-of-words count vector for one tokenized review.

    Args:
        review_dict: Dictionary exposing ``token2id`` (token -> column index)
            and ``len()``; the vector has one column per dictionary entry.
        sentence: Iterable of tokens.

    Returns:
        torch.Tensor: float32 tensor of shape ``(1, len(review_dict))`` on
        the global ``device``, holding the occurrence count of every token
        of ``sentence`` that is present in ``review_dict``.
    """
    token_to_id = review_dict.token2id
    # Size the vector from the dictionary that was actually passed in rather
    # than the global VOCAB_SIZE, so the two can never drift apart.
    vec = torch.zeros(len(review_dict), dtype=torch.float32, device=device)
    for word in sentence:
        idx = token_to_id.get(word)
        # Tokens missing from the vocabulary (e.g. in a user-supplied review)
        # have no column, so skip them instead of raising KeyError.
        if idx is not None:
            vec[idx] += 1
    return vec.view(1, -1)

In [21]:
def make_target(label):
    """Convert a sentiment label into a class-index tensor.

    Numeric labels keep their original mapping: ``-1 -> 0``, ``0 -> 1`` and
    anything else ``-> 2``. Text labels are folded onto the same scheme
    first (``'negative'`` is treated as ``-1`` and ``'positive'`` as ``1``),
    because the dataset's ``sentiment`` column holds the strings
    ``'positive'`` / ``'negative'``; without this step both strings fell
    through to class 2 and every target was identical.

    Args:
        label: ``-1``, ``0``, another number, or a sentiment string.

    Returns:
        torch.Tensor: shape ``(1,)``, dtype ``torch.long``, on the global
        ``device``.
    """
    if isinstance(label, str):
        normalized = label.strip().lower()
        if normalized == 'negative':
            label = -1
        elif normalized == 'positive':
            label = 1
        # Any other string is left untouched and lands in the final branch,
        # exactly as before.

    if label == -1:
        class_index = 0
    elif label == 0:
        class_index = 1
    else:
        class_index = 2
    return torch.tensor([class_index], dtype=torch.long, device=device)

### Execution of Functions

In [5]:
# Read the raw reviews CSV into a DataFrame
df = load_dataset()

In [6]:
# Inspect which columns the raw dataset provides
df.columns

Index(['review', 'sentiment'], dtype='object')

In [7]:
# Add the 'tokenized_text' and 'stemmed_tokens' columns to df
df = pre_process_df(df)

In [8]:
# Split with the function defaults (test_size=0.3, shuffled); also prints
# the label counts of each partition and a preview of X_train
X_train, X_test, Y_train, Y_test = split_train_test(df)

Value counts for Train sentiments
sentiment
negative    17512
positive    17488
Name: count, dtype: int64
Value counts for Test sentiments
sentiment
positive    7512
negative    7488
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
                                              review  \
0  The sequel is exactly what you will expect it ...   
1  This is a pretty well known one so i won't get...   
2  I made the mistake of buying this since I coll...   
3  This movie is the proverbial 80s flick that sh...   
4  I absolutely LOVED this movie as a child. I ca...   

                                      tokenized_text  \
0  [the, sequel, is, exactly, what, you, will, ex...   
1  [this, is, pretty, well, known, one, so, won, ...   
2  [made, the, mistake, of, buying, this, since, ...   
3  [this, movie, is, the, proverbial, flick, that...   
4  [absolutely, loved, this, movie, as, child, ca...   

                                      stemm

In [28]:
# make_bow_vector, make_target and the .to(device) call below all read a
# global `device`; define it here only if no earlier cell already did, so a
# top-to-bottom run does not fail with NameError.
if "device" not in globals():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary without the 'pad' token.
# NOTE(review): built from the full dataframe, i.e. train and test rows.
review_dict = make_dict(df, padding=False)

# One input feature per vocabulary entry (bag-of-words counts)
VOCAB_SIZE = len(review_dict)

input_dim = VOCAB_SIZE
hidden_dim = 500
# NOTE(review): the dataset has two labels (positive / negative), but
# make_target emits class indices in 0..2, so three outputs are kept here.
output_dim = 3
num_epochs = 100

# Re-import models so edits to models.py are picked up without a restart
reload(models)
ff_nn_bow_model = models.FeedforwardNeuralNetwork(input_dim, hidden_dim, output_dim)
ff_nn_bow_model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(ff_nn_bow_model.parameters(), lr=0.001)

Dictionary without padding
