In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
import torch.nn.functional as F

!pip install keras
!pip install gensim
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import nltk
import csv
from nltk.corpus import stopwords
nltk.download('stopwords')
import pandas as pd
import numpy as np
import re



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


This cell loads the trainData, skipping any blank lines. The Dataset is converted into a Numpy array which is then
divided into data and labels. 1st column has sentences and the rest of the columns are the labels

In [135]:
trainData2 = pd.read_csv('train.csv',skip_blank_lines = True,encoding='utf8', error_bad_lines=False)
trainData2 = trainData2.to_numpy()
print(trainData2.shape)
trainData1 = trainData2[:,1]
trainLabels1 = trainData2[:,2:]




  trainData2 = pd.read_csv('train.csv',skip_blank_lines = True,encoding='utf8', error_bad_lines=False)


(159571, 8)


This Cell cleans the data by removing all stop words, punctuation, numbers, URLs, and shifts all letters to lowercase

In [136]:
def lowercase(txt):

    return txt.lower()

def remove_punctuation(txt):

    txt= re.sub(r'[^\w\s]', '', txt)
    # list_of_words = txt.split()
    # final_list = [word for word in list_of_words if word not in stop_arr]
    # temp = ' '.join(final_list)

    return txt

def remove_stopwords(txt):

    stop_arr = set(stopwords.words('english'))
    list_of_words = txt.split()
    final_list = [word for word in list_of_words if word not in stop_arr]
    temp = ' '.join(final_list)

    return temp

def remove_numbers(txt):

    txt= re.sub(r'\d', '', txt)
    return txt

def remove_url(txt):

    txt= re.sub(r'http\S+|www\S+|https\S+', '', txt, flags=re.MULTILINE)
    return txt

def normalize_sentence(txt):
    '''
    Aggregates all the above functions to normalize/clean a sentence
    '''
    txt = lowercase(txt)
    txt = remove_punctuation(txt)
    txt = remove_stopwords(txt)
    txt = remove_numbers(txt)
    txt = remove_url(txt)

    return txt
trainData1 = np.array([normalize_sentence(str(sentence)) for sentence in trainData1])

Over here we are using Tokenizer to extract vocabulary from the dataset. Each word in the dataset gets assigned a unique integer.

Moreover, we convert the integers into a sequence that represents the words in the vocab

Furthermore we train a word2vec model to fit to the sequences which is then used to extract an embedding matrix. Each row now represents a vector for a word in the vocab

In [137]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainData1)
sequences = tokenizer.texts_to_sequences(trainData1)
word2vec_model = Word2Vec(sentences=sequences, vector_size=100, window=5, min_count=1, workers=4)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = word2vec_model.wv.vectors
max_sequence_length = 60
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post') #this ensures all the sequences are of the same length.

Converting all the train data and validation data into tensors

In [166]:
X = torch.tensor(padded_sequences[:120000], dtype=torch.long)
y = torch.tensor(trainLabels1[:120000,:].astype('float32'))
X1 = torch.tensor(padded_sequences[120000:], dtype=torch.long)
y1 = torch.tensor(trainLabels1[120000:,:].astype('float32'))

The init function defines an embedding which converts integer encoded words into dense vectors.

LSTM is used to identify the sequential dependencies on the input data

nn.linear outputs the LSTM output into the number of classes

The forward function defines the way the input model is processed through the layers to produce the output. Once again embeddings are used

In [167]:
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embed = self.embedding(x)
        lstm_out, _ = self.lstm(embed)
        output = self.fc(lstm_out[:,-1,:])
        return output

Instance of the sentiment classifier is created using the following variables:

In [168]:
embed_size = 50     #This represents the dimentionality of the embeddings
hidden_size = 60    #This represents the units in the hidden state of the LSTM layer
num_classes = 6     #Total possible classes from which we can get an output

model = SentimentClassifier(vocab_size, embed_size, hidden_size, num_classes)   

This cell has the definition of the loss function and the optimizer of type adam with a learning rate of 0.0001

DataLoader is used to handle batching, shuffling, and loading the training data efficiently.

In [169]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 200
batch_size = 40

X = X.to(torch.device("cuda"))
y = y.to(torch.device("cuda"))
train_dataset = TensorDataset(X,y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

The outer loop goes on for the number of epochs defined (200)

The inner loop goes for the batches of data that are provided by the dataLoader

Zero grad is done to zero out the gradients for each iteration

The loss function calculates the loss whereas the backward part is used to compute the gradients with respect to the loss values

The optimizer.step() updates the model parameters using the gradients and the optimizer.

In [170]:
model =model.to(torch.device("cuda"))
for epoch in range(num_epochs):
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = loss_func(output, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/200, Loss: 0.06087476387619972
Epoch 2/200, Loss: 0.20868292450904846
Epoch 3/200, Loss: 0.06210465729236603
Epoch 4/200, Loss: 0.06646235287189484
Epoch 5/200, Loss: 0.05151171237230301
Epoch 6/200, Loss: 0.06692270934581757
Epoch 7/200, Loss: 0.154835507273674
Epoch 8/200, Loss: 0.10982824116945267
Epoch 9/200, Loss: 0.15515229105949402
Epoch 10/200, Loss: 0.01880861259996891
Epoch 11/200, Loss: 0.04806936904788017
Epoch 12/200, Loss: 0.04943617060780525
Epoch 13/200, Loss: 0.04432258382439613
Epoch 14/200, Loss: 0.06823522597551346
Epoch 15/200, Loss: 0.1135869100689888
Epoch 16/200, Loss: 0.05505584552884102
Epoch 17/200, Loss: 0.10799290984869003
Epoch 18/200, Loss: 0.028679411858320236
Epoch 19/200, Loss: 0.030765462666749954
Epoch 20/200, Loss: 0.009288051165640354
Epoch 21/200, Loss: 0.0723707526922226
Epoch 22/200, Loss: 0.018867310136556625
Epoch 23/200, Loss: 0.0847838819026947
Epoch 24/200, Loss: 0.028209496289491653
Epoch 25/200, Loss: 0.017687765881419182
Epoch 26

Here a dataLoader is created on the test dataset using a batch size of 32

The test data is then passed into the model to obtain outputs which are then appended to the lists representing the predictions and the true labels. This heppens for each batch

Lastly, using the 2 lists, we calculate the accuracy of our predictions using the true labels which come out to be around 86.6%

In [180]:
batch_size = 32

dataset = torch.utils.data.TensorDataset(X1, y1)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

predictions = []
true_labels = []

with torch.no_grad():
    model.eval()

    for batch_X, batch_y in data_loader:
        batch_X = batch_X.to(torch.device("cuda"))
        batch_y = batch_y.to(torch.device("cuda"))

        batch_output = model(batch_X)
        predictions.append(batch_output.cpu().numpy())
        true_labels.append(batch_y.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

final_predictions = (predictions > 0.5).astype(int)

acc = accuracy_score(true_labels, final_predictions)
print(f'Test Accuracy: {acc}')

Test Accuracy: 0.8665436809784943
