# Project 4: Sentiment Analysis of Text

This project involves building and evaluating deep learning models (RNNs or Transformers) for sentiment classification of text, such as movie reviews or product feedback.

## Preliminaries

In [5]:
!pip install wandb --quiet

In [7]:
# Core Python & Data Handling
import os
import pandas as pd
from collections import defaultdict

# PyTorch Core
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Optimizer
import torch.optim as optim

# For Reproducibility (optional but recommended)
import random
import numpy as np

# for progress bars
from tqdm import tqdm

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cpu


# Download datasets

In [9]:
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi

In [10]:
# Setup
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
api = KaggleApi()
api.authenticate()

In [51]:
def show_data(path: str, label: str, text: str, delimiter= ","):
    df = pd.read_csv(path,delimiter=delimiter)

    df = df.rename(columns={text: "text", label: "label"})

    if not df['label'].isin([0, 1, -1]).all():
        df['label'] = df['label'].map({'positive': 1, 'negative': 0})

    num_labels = df['label'].nunique()
    print(f"Number of distinct labels: {num_labels}")
    print(df.head())

    print(f"Number of rows: {df.shape[0]}")

    return df

## IMDb

In [33]:
dataset = 'lakshmi25npathi/imdb-dataset-of-50k-movie-reviews'
api.dataset_download_files(dataset, path= 'IMDB', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews


In [52]:
path = os.path.join("IMDB", "IMDB Dataset.csv")
imdb = show_data(path, "sentiment", "review")

Number of distinct labels: 2
                                                text  label
0  One of the other reviewers has mentioned that ...      1
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      1
3  Basically there's a family where a little boy ...      0
4  Petter Mattei's "Love in the Time of Money" is...      1
Number of rows: 50000


## SST-2

In [24]:
dataset = 'jkhanbk1/sst2-dataset'
api.dataset_download_files(dataset, path='.', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/jkhanbk1/sst2-dataset


In [53]:
path = os.path.join("Finalv SST-2 dataset CSV format", "test.csv")
sst_2 = show_data(path, "label", "sentence")

Number of distinct labels: 2
   label                                               text
0      0        No movement, no yuks, not much of anything.
1      0  A gob of drivel so sickly sweet, even the eage...
2      0  Gangs of New York is an unapologetic mess, who...
3      0  We never really feel involved with the story, ...
4      1              This is one of Polanski's best films.
Number of rows: 1821


## SemEval

In [26]:
dataset = 'azzouza2018/semevaldatadets'
api.dataset_download_files(dataset, path='semEval', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/azzouza2018/semevaldatadets


In [54]:
path = os.path.join("semEval", "semeval-2013-dev.csv")
sem_eval= show_data(path, "label", "text", delimiter='\t')

Number of distinct labels: 3
   label                                               text
0      0  Watching Devil Inside for the 1st time tonight...
1      0  @CMPunk Devil Inside , The exorcisism of Emily...
2      0  Off to do my vlog. Watching Devil Inside and J...
3      1  @raykipo take Silver at the Hib cup. Great day...
4      0  @hollyhippo I'm going to blockbuster tomorrow ...
Number of rows: 1650


In [59]:
frames = [imdb, sst_2, sem_eval]
df = pd.concat(frames)
print(df.shape[0])
df.info()

53471
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53471 entries, 0 to 1649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    53471 non-null  object
 1   label   53471 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.2+ MB


# Text Preprocessing and Tokenization

Tokenize

In [63]:
import re
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aksinia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aksinia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Create vocabulary

In [83]:
from collections import Counter

MAX_VOCAB_SIZE = 10000
MAX_SEQ_LEN = 250

counter = Counter()
for tokens in df['tokenized_sents']:
    counter.update(tokens)
    
vocab = {word: i+2 for i, (word, _) in enumerate(counter.most_common())}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

Convert Text to Sequences of Indices

In [84]:
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

df['input_ids'] = df['tokenized_sents'].apply(lambda x: tokens_to_indices(x, vocab))

In [85]:
print(df.head())

                                                text  label  \
0  One of the other reviewers has mentioned that ...      2   
1  A wonderful little production. <br /><br />The...      2   
2  I thought this was a wonderful way to spend ti...      2   
3  Basically there's a family where a little boy ...      1   
4  Petter Mattei's "Love in the Time of Money" is...      2   

                                     tokenized_sents  \
0  [One, of, the, other, reviewers, has, mentione...   
1  [A, wonderful, little, production, ., <, br, /...   
2  [I, thought, this, was, a, wonderful, way, to,...   
3  [Basically, there, 's, a, family, where, a, li...   
4  [Petter, Mattei, 's, ``, Love, in, the, Time, ...   

                                           input_ids  label_transformed  
0  [297, 7, 2, 100, 2083, 55, 1095, 17, 143, 176,...                  2  
1  [133, 433, 144, 389, 4, 12, 13, 10, 11, 12, 13...                  2  
2  [15, 214, 19, 20, 6, 433, 115, 8, 1152, 72, 33...          

Add pad sequence

In [89]:
def pad(seq):
    if len(seq) < MAX_SEQ_LEN:
        return seq + [vocab['<PAD>']] * (MAX_SEQ_LEN - len(seq))
    else:
        return seq[:MAX_SEQ_LEN]

df['input_ids'] = df['input_ids'].apply(pad)

In [90]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
df['label'] = label_enc.fit_transform(df['label'])

# Data Loader

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor([x for x in X], dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [93]:
X_train, X_val, y_train, y_val = train_test_split(df['input_ids'].tolist(), df['label'].tolist(), test_size=0.2)

In [94]:
train_dataset = SentimentDataset(X_train, y_train)
val_dataset = SentimentDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers = 4, pin_memory  = True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers = 2)

# LSTM Model

In [None]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.dropout(hidden[-1])
        return self.fc(out)