In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from torch.utils.data import random_split, dataloader
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import random_split
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Point the Kaggle CLI at the current directory for its configuration
# (presumably a kaggle.json credentials file placed here — keep it out of version control).
os.environ['KAGGLE_CONFIG_DIR'] = "."

In [None]:
# Download the competition archive with the Kaggle CLI; it is saved in the working directory.
! kaggle competitions download -c sentiment-analysis-on-movie-reviews

Downloading sentiment-analysis-on-movie-reviews.zip to /content
  0% 0.00/1.90M [00:00<?, ?B/s]
100% 1.90M/1.90M [00:00<00:00, 28.0MB/s]


In [None]:
!unzip sentiment-analysis-on-movie-reviews -d data

Archive:  sentiment-analysis-on-movie-reviews.zip
  inflating: data/sampleSubmission.csv  
  inflating: data/test.tsv.zip       
  inflating: data/train.tsv.zip      


In [None]:
# Names of the files extracted into the data directory.
train_fname = "train.tsv.zip"
test_fname = "test.tsv.zip"
# The archive ships the sample submission as a plain CSV (data/sampleSubmission.csv),
# not as a zipped TSV.
sample_subm = "sampleSubmission.csv"

In [None]:
# Directory the archive was extracted into; the trailing slash matters because
# file names are appended to it by plain string concatenation.
data_dir = "./data/"

In [None]:
# Load the tab-separated training file directly from its zip archive.
train_path = os.path.join(data_dir, train_fname)
raw_df = pd.read_csv(train_path, sep="\t")

In [None]:
# (rows, columns) of the raw training frame.
raw_df.shape

(156060, 4)

In [None]:
# Preview the first rows: PhraseId, SentenceId, Phrase and the Sentiment label.
raw_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [None]:
# Show the first two phrases untruncated.
raw_df['Phrase'][:2].values

array(['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'A series of escapades demonstrating the adage that what is good for the goose'],
      dtype=object)

In [None]:
# NOTE(review): scratch cell — its result is not used by any other cell shown in this notebook; candidate for removal.
np.arange(0,10,2)

array([0, 2, 4, 6, 8])

In [None]:
def standardize_text(df, text_field):
    """Normalize the text column ``text_field`` of ``df`` and return ``df``.

    Steps, applied in order:
      1. remove URLs (``http`` followed by non-whitespace),
      2. remove any remaining ``http`` fragment,
      3. remove ``@mentions`` (``@`` followed by non-whitespace),
      4. replace every character outside the allowed set with a space,
      5. spell any remaining ``@`` as ``at``,
      6. lowercase.

    The column is overwritten on ``df`` itself, and that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; modified by this call.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with ``text_field`` cleaned.
    """
    text = df[text_field]
    # regex=True is spelled out on every call: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would silently turn
    # every substitution below into a no-op.
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=True)
    df[text_field] = text.str.lower()
    return df

# standardize_text overwrites the "Phrase" column on raw_df and returns that same frame,
# so `phrase` and `raw_df` refer to one (now cleaned) DataFrame.
phrase = standardize_text(raw_df, "Phrase")

# Persist the cleaned data. The index is written as well and shows up as an
# "Unnamed: 0" column when the file is read back.
phrase.to_csv("clean_data.csv")
phrase.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,2,1,a series of escapades demonstrating the adage ...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2


In [None]:
# Reload the cleaned data from disk; the saved index comes back as the "Unnamed: 0" column.
clean_raw_df = pd.read_csv("clean_data.csv")
clean_raw_df.tail()

Unnamed: 0.1,Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
156055,156055,156056,8544,hearst 's,2
156056,156056,156057,8544,forced avuncular chortles,1
156057,156057,156058,8544,avuncular chortles,3
156058,156058,156059,8544,avuncular,2
156059,156059,156060,8544,chortles,2


In [None]:
# Keep only the text and the label. Rebinding instead of dropping in place, together with
# errors="ignore", makes the cell safe to re-run: a second run is a no-op rather than a KeyError.
clean_raw_df = clean_raw_df.drop(columns=['Unnamed: 0','PhraseId','SentenceId'], errors="ignore")

In [None]:
# Confirm that only the Phrase and Sentiment columns remain.
clean_raw_df.head()

Unnamed: 0,Phrase,Sentiment
0,a series of escapades demonstrating the adage ...,1
1,a series of escapades demonstrating the adage ...,2
2,a series,2
3,a,2
4,series,2


In [None]:
# Number of phrases per sentiment class (shows how imbalanced the labels are).
clean_raw_df.groupby("Sentiment").count()

Unnamed: 0_level_0,Phrase
Sentiment,Unnamed: 1_level_1
0,7072
1,27273
2,79582
3,32927
4,9206


In [None]:
# Tokenizer models used by word_tokenize. Recent NLTK releases (3.9+) look up the
# "punkt_tab" resource instead of "punkt", so both are requested to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def tokenize_phrase(phrase):
    """Split ``phrase`` into word tokens with NLTK's ``word_tokenize``.

    Non-string input (for example the NaN that ``pd.read_csv`` produces for an
    empty field, or ``None``) yields an empty token list instead of raising
    inside the tokenizer.

    Parameters
    ----------
    phrase : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens, or ``[]`` when ``phrase`` is not a string.
    """
    if not isinstance(phrase, str):
        return []
    return word_tokenize(phrase)

In [None]:
# Tokenize every cleaned phrase; the new "tokens" column holds one list of tokens per row.
clean_raw_df["tokens"] = clean_raw_df["Phrase"].apply(tokenize_phrase)

In [None]:
# Inspect the first ten rows together with their token lists.
clean_raw_df.head(10)

Unnamed: 0,Phrase,Sentiment,tokens
0,a series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,a series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
2,a series,2,"[a, series]"
3,a,2,[a]
4,series,2,[series]
5,of escapades demonstrating the adage that what...,2,"[of, escapades, demonstrating, the, adage, tha..."
6,of,2,[of]
7,escapades demonstrating the adage that what is...,2,"[escapades, demonstrating, the, adage, that, w..."
8,escapades,2,[escapades]
9,demonstrating the adage that what is good for ...,2,"[demonstrating, the, adage, that, what, is, go..."


In [None]:
# Column dtypes and non-null counts (checks that no phrase was lost in the CSV round-trip).
clean_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Phrase     156060 non-null  object
 1   Sentiment  156060 non-null  int64 
 2   tokens     156060 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.6+ MB


In [None]:
# Corpus statistics: flat list of all tokens, token count per phrase, sorted vocabulary.
all_words = []
sentence_lengths = []
for phrase_tokens in clean_raw_df["tokens"]:
    all_words.extend(phrase_tokens)
    sentence_lengths.append(len(phrase_tokens))
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

1125137 words total, with a vocabulary size of 16538
Max sentence length is 53


In [None]:
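# Disabled: histogram of tokens per phrase (uses sentence_lengths from the cell above); uncomment to plot.
# fig = plt.figure(figsize=(10, 10))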
# plt.xlabel('Sentence length')
# plt.ylabel('Number of sentences')
# plt.hist(sentence_lengths)
# plt.show()

In [None]:
# Plain Python lists of phrases and their labels, in row order, for scikit-learn.
list_corpus = clean_raw_df["Phrase"].tolist()
list_labels = clean_raw_df["Sentiment"].tolist()

In [None]:
# Hold out 20% of the phrases for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    list_corpus,
    list_labels,
    test_size=0.2,
    random_state=40,
)

In [None]:
def tfidf(data):
    """Fit a default ``TfidfVectorizer`` on ``data``.

    Parameters
    ----------
    data : iterable of str
        Documents to fit on.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the TF-IDF matrix returned by
        ``fit_transform`` and the fitted vectorizer, which can then
        transform other documents into the same feature space.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(data), vectorizer

# Fit the vectorizer on the training phrases only, then reuse it for the validation phrases
# so both matrices share one vocabulary.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

In [None]:
# Shape and sparsity of the training feature matrix.
X_train_tfidf


<124848x15230 sparse matrix of type '<class 'numpy.float64'>'
	with 777838 stored elements in Compressed Sparse Row format>

In [None]:
%%time
clf_tfidf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg',
                         multi_class='multinomial', n_jobs=-1, random_state=40)
clf_tfidf.fit(X_train_tfidf, y_train)

y_predicted_tfidf = clf_tfidf.predict(X_val_tfidf)

CPU times: user 285 ms, sys: 119 ms, total: 404 ms
Wall time: 29.5 s


In [None]:
def get_metrics(y_val, y_predicted):
    """Return the accuracy of ``y_predicted`` against ``y_val``.

    Accuracy is the fraction of predictions that equal the true label.
    """
    return accuracy_score(y_val, y_predicted)



In [None]:
# Validation accuracy of the TF-IDF + logistic-regression baseline.
accuracy_tfidf = get_metrics(y_val, y_predicted_tfidf)
print("accuracy = %.3f" % (accuracy_tfidf))

accuracy = 0.594


## Neural Network using PyTorch

In [None]:
# Number of training rows in the baseline's feature matrix.
X_train_tfidf.shape[0]

124848

In [None]:
# Step 2: Preprocess the text data using TF-IDF
# The vectorizer gets its own name so that it does not rebind `tfidf`, the helper function
# defined earlier in this notebook; rebinding it made the helper's cell fail when re-run.
# NOTE(review): the vectorizer is fitted on every phrase before the train/validation split,
# so validation text influences the vocabulary and IDF weights — consider fitting on the
# training rows only.
nn_tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = nn_tfidf_vectorizer.fit_transform(clean_raw_df['Phrase']).toarray()

In [None]:
# Number of TF-IDF features per phrase; read by SentimentNN as the width of its first layer.
input_size = X.shape[1]

In [None]:
# Encode the Sentiment column as class indices for the network.
# Sentiment is already an int64 column with values 0-4 in this data, so the encoding
# leaves the values unchanged here.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(clean_raw_df['Sentiment'])

The cell above uses `LabelEncoder` to turn the sentiment labels into consecutive integer class indices (0, 1, 2, …) that the neural network's loss function can consume. The sections below explain why this step is included and how it fits into the overall pipeline.

# Explanation
In many machine learning and deep learning tasks, especially those involving categorical labels, it is necessary to convert these labels into numerical values. Neural networks and other algorithms require numerical input to perform mathematical computations. LabelEncoder from sklearn.preprocessing is a convenient tool to perform this conversion.

# Context in Sentiment Analysis
In the Kaggle "Sentiment Analysis on Movie Reviews" challenge, the sentiment labels are categorical: each phrase belongs to one of five ordered classes, which the dataset stores as integer codes:

0: Negative

1: Somewhat Negative

2: Neutral

3: Somewhat Positive

4: Positive

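Given this context, `LabelEncoder` guarantees that the classes are numbered with consecutive integers starting at 0, which is the format a classifier's loss function expects. Because the labels in this dataset are already the integers 0–4, the encoder leaves them unchanged here; it would matter if the labels were strings or non-consecutive codes.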



# Why This Code Was Run
1. Converting Categorical Labels to Numerical Labels: Neural networks require numerical targets. If the sentiment labels were stored as strings (e.g., 'negative', 'neutral', 'positive'), they would have to be converted to integers first. In this dataset they are already integers (0–4), so this step acts as a safeguard rather than a real conversion.

2. Compatibility with Loss Functions: Many loss functions, including the `CrossEntropyLoss` used here, expect each target to be an integer class index between 0 and (number of classes − 1). `LabelEncoder` ensures the labels are in that format.

3. Consistent Mapping: `LabelEncoder` maps each unique label to a fixed integer, assigned in sorted order of the labels. For instance, string labels 'negative', 'neutral', 'positive' would be mapped to 0, 1, 2, respectively, and `label_encoder.inverse_transform` converts predictions back to the original labels.

In [None]:
# First 100 encoded labels.
y[:100]

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2,
       4, 3, 2, 3, 3, 3, 2, 2, 4, 2, 3, 4, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2])

In [None]:
# 80/20 split of the dense TF-IDF features and encoded labels for the neural network.
# NOTE(review): this rebinds X_train and y_train, which earlier held the text split used by the
# TF-IDF / logistic-regression cells; re-running those cells after this one will see these arrays instead.
X_train, X_val_nn, y_train, y_val_nn = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Convert the splits to tensors: float32 features for the linear layers,
# int64 (long) class indices as targets for CrossEntropyLoss.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_nn, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_nn, dtype=torch.long)

In [None]:
# Wrap the tensors in datasets and batch them; only the training batches are shuffled.
BATCH_SIZE = 64

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Step 3: Build and train the neural network model
class SentimentNN(nn.Module):
    """Feed-forward classifier over fixed-size feature vectors.

    Three hidden linear layers (512, 256 and 128 units), each followed by ReLU
    and dropout, then a linear output layer that returns one raw score (logit)
    per class. No softmax is applied in ``forward``.

    Parameters
    ----------
    in_features : int, optional
        Width of the input vectors. When omitted, the module-level
        ``input_size`` is used, so ``SentimentNN()`` keeps working as before.
    num_classes : int, default 5
        Number of output logits.
    dropout : float, default 0.2
        Dropout probability applied after each hidden layer.
    """

    def __init__(self, in_features=None, num_classes=5, dropout=0.2):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: fall back to the notebook-level variable.
            in_features = input_size
        self.fc1 = nn.Linear(in_features, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the class logits for a batch ``x`` whose last dimension is ``in_features``."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden(x)))
        return self.fc4(x)

In [None]:
# Seed PyTorch's global RNG before anything draws from it, so that weight initialisation,
# dropout masks and the training DataLoader's shuffling repeat on a fresh
# Restart & Run All (on the same hardware and library versions).
torch.manual_seed(42)

# Initialize the model, loss function, and optimizer
model = SentimentNN()

# Check if GPU is available and move the model to GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# CrossEntropyLoss takes the raw logits from the model together with integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
%%time
# Training the model
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')


Epoch [1/10], Loss: 1.1315
Epoch [2/10], Loss: 0.9942
Epoch [3/10], Loss: 0.9244
Epoch [4/10], Loss: 0.8725
Epoch [5/10], Loss: 0.8355
Epoch [6/10], Loss: 0.8039
Epoch [7/10], Loss: 0.7792
Epoch [8/10], Loss: 0.7578
Epoch [9/10], Loss: 0.7374
Epoch [10/10], Loss: 0.7204
CPU times: user 5min 54s, sys: 6.48 s, total: 6min
Wall time: 6min 3s


In [None]:
%%time
# Evaluating the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Validation Accuracy: {100 * correct / total:.2f}%')

Validation Accuracy: 62.72%
CPU times: user 1.53 s, sys: 3.87 ms, total: 1.53 s
Wall time: 1.59 s
