<a href="https://colab.research.google.com/github/bhoomika-johnpedely/IMDB-Sentiment-Analysis/blob/main/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMDB Sentiment Analysis

In this project, I perform a binary sentiment analysis task: classifying movie reviews as positive or negative based on their content. The dataset is loaded with the Hugging Face `datasets` library. It contains 50k labeled reviews, half positive and half negative, and is already split into train and test sets. I will preprocess the data using nltk, then extract features and train and evaluate several models using sklearn.

In [None]:
import torch
torch.cuda.is_available()  # True when PyTorch can see a CUDA-capable GPU in this runtime

True

In [None]:
#Importing general-purpose dependencies
import os
import csv
import numpy as np
import pandas as pd

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is fetched alongside 'punkt': recent NLTK releases load the
# tokenizer tables from 'punkt_tab' when word_tokenize is called, and raise a
# LookupError if only the older 'punkt' package is present.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


### Loading the Data

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.2-py3-none-a

In [None]:
from datasets import load_dataset
# Fetch the IMDB review dataset; the result is bound to `imdb` and inspected in the next cell
imdb = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
imdb  # show the available splits with their features and row counts

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Work on a fixed-size random subset of each split so preprocessing and training
# stay affordable. The shuffle seed is fixed so the same reviews are selected on
# every run.
SUBSET_SIZE = 3000
SHUFFLE_SEED = 42

# select() accepts any iterable of row indices, so the range is passed directly
small_train_dataset = imdb["train"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))
small_test_dataset = imdb["test"].shuffle(seed=SHUFFLE_SEED).select(range(SUBSET_SIZE))

In [None]:
# Pull the raw review text out of the reduced train/test subsets
# (the subsets are kept small to fit my current processing power)
train_data, test_data = small_train_dataset['text'], small_test_dataset['text']

### Preprocessing the Data

I preprocess every review in the train and test sets by lowercasing and tokenizing it. I also lemmatize the tokens to reduce dimensionality, handle quotes, and remove stopwords, punctuation, digits and special characters.

In [None]:
#preprocessing steps for both train and test data
def preprocess_data(data):
    """Clean raw review texts and return them as space-joined token strings.

    Each text is lowercased and tokenized with ``word_tokenize``. A token is
    dropped when it is an English stopword, when it is found in
    ``string.punctuation``, or when it contains any digit or punctuation
    character. The surviving tokens are lemmatized with
    ``WordNetLemmatizer`` and joined back together with single spaces.

    Args:
        data: Iterable of review strings.

    Returns:
        A list with one cleaned string per input text, in input order.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def has_digit_or_punctuation(token):
        # True as soon as one character is a digit or a punctuation mark
        for char in token:
            if char.isdigit() or char in string.punctuation:
                return True
        return False

    cleaned_texts = []
    for sentence in data:
        kept_tokens = []
        for word in word_tokenize(sentence.lower()):
            # Stopwords and bare punctuation tokens are discarded first
            if word in stop_words or word in string.punctuation:
                continue
            # Then anything that still carries a digit or special character
            if has_digit_or_punctuation(word):
                continue
            kept_tokens.append(lemmatizer.lemmatize(word))
        cleaned_texts.append(' '.join(kept_tokens))

    return cleaned_texts

# Preprocess train and test data.
# NOTE: both names are rebound to the cleaned text, so the raw reviews are no
# longer reachable through train_data/test_data, and re-running this cell feeds
# the already-cleaned strings through preprocess_data again.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# From here on, train_data and test_data hold the preprocessed text


### Feature Extraction

In [None]:
# Create a TF-IDF vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training text only, and
# vectorize the training text in the same step
tfidf_train_matrix = tfidf_vectorizer.fit_transform(train_data)

# Vectorize the test text with the vocabulary learned above (no refitting,
# so nothing from the test set leaks into the features)
tfidf_test_matrix = tfidf_vectorizer.transform(test_data)

# tfidf_train_matrix and tfidf_test_matrix now hold the TF-IDF representation
# of the training and test reviews, one row per review.

# Dense copies of both matrices; these are what the model cells consume.
# NOTE(review): the estimators trained below appear to accept sparse input as
# well, so these dense copies may be avoidable — confirm before removing.
tfidf_train_features = tfidf_train_matrix.toarray()
tfidf_test_features = tfidf_test_matrix.toarray()

# Print the feature names (the vocabulary terms) learned by the vectorizer
print(tfidf_vectorizer.get_feature_names_out())


['aaa' 'aaargh' 'aaaugh' ... 'ángela' 'émigré' 'ísnt']


### Selecting, Training and Evaluating the Model

I experiment with several models and evaluate the performance of each: logistic regression, multinomial naive Bayes, and a linear SVM.

In [None]:
#reassigning train and test sets to new variables for clarity
X_train = tfidf_train_features
y_train = small_train_dataset['label']
X_test = tfidf_test_features
y_test = small_test_dataset['label']

In [None]:
# Candidate classifiers, keyed by the display name used in the report below
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Linear SVM": SVC(kernel="linear"),
}

# Fit each candidate on the training features, predict on the test features,
# and keep its accuracy and per-class report
results = {}

for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        target_names=["negative", "positive"],
    )
    results[model_name] = {"Accuracy": accuracy, "Report": report}

# Print the metrics model by model, in the order the models were trained
for model_name in results:
    metrics = results[model_name]
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print("Classification Report:\n", metrics["Report"])
    print("\n")

Model: Logistic Regression
Accuracy: 0.8377
Classification Report:
               precision    recall  f1-score   support

    negative       0.86      0.81      0.83      1511
    positive       0.82      0.86      0.84      1489

    accuracy                           0.84      3000
   macro avg       0.84      0.84      0.84      3000
weighted avg       0.84      0.84      0.84      3000



Model: Multinomial Naive Bayes
Accuracy: 0.8180
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.89      0.83      1511
    positive       0.87      0.74      0.80      1489

    accuracy                           0.82      3000
   macro avg       0.83      0.82      0.82      3000
weighted avg       0.82      0.82      0.82      3000



Model: Linear SVM
Accuracy: 0.8370
Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.82      0.84      1511
    positive       0.83      0.85 

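The best-performing model is Logistic Regression, with an accuracy of 83.77% on the 3,000-review test subset. Linear SVM is a very close second at 83.70%, and Multinomial Naive Bayes follows at 81.80%.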