In [1]:
# ! pip install bs4
# ! pip install contractions
# ! pip install gensim

In [2]:
# pip install nltk

In [3]:
import pandas as pd
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore")
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
import gensim.models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import gc
from sys import getsizeof

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ayanpatel_69/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ayanpatel_69/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/ayanpatel_69/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Task 1: Dataset Generation

In [4]:
df = pd.read_csv('./data.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False)
df = df[['star_rating', 'review_body']]
class_one = df[(df['star_rating']==1) | (df['star_rating']==2)]
class_two = df[df['star_rating']==3]
class_three = df[(df['star_rating']==4) | (df['star_rating']==5)]

class_one.loc[:, "label"] =1
class_two.loc[:, "label"] =2
class_three.loc[:, "label"] =3

class_one = class_one.sample(n=20000, random_state=100)
class_two = class_two.sample(n=20000, random_state=100)
class_three = class_three.sample(n=20000, random_state=100)
# dataset = pd.concat([class_one, class_two, class_three])
df = pd.concat([class_one, class_two, class_three])

df.reset_index(drop=True)
train = df.sample(frac=0.8, random_state=100)
test = df.drop(train.index)

train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)

del globals()['class_one'], globals()['class_two'], globals()['class_three'], globals()['df']
train = test = df = dataset = [[99999, 99999]]
del df, train, test, dataset
gc.collect()

0

In [5]:
# Pretrained Word2Vec model:
pretrained_w2v = api.load('word2vec-google-news-300')

In [4]:
gc.collect()

14

In [6]:
print('Check semantic similarities of the generated vectors:')
print(pretrained_w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1))
print('Excellent ~ Outstanding:', pretrained_w2v.similarity('excellent', 'outstanding'))
print('time ~ schedule:', pretrained_w2v.similarity('time', 'schedule'))

Check semantic similarities of the generated vectors:
[('queen', 0.7118193507194519)]
Excellent ~ Outstanding: 0.5567486
time ~ schedule: 0.26993576


In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [8]:
# Covert all reviews to lower case
train['review_body'] = train['review_body'].str.lower()
test['review_body'] = test['review_body'].str.lower()

'''
URL Remover code
'''
train['review_body'] = train['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
test['review_body'] = test['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

def html_tag_remover(review):
    soup = BeautifulSoup(review, 'html.parser')
    review = soup.get_text()
    return review  

train['review_body'] = train['review_body'].apply(lambda review: html_tag_remover(review))
test['review_body'] = test['review_body'].apply(lambda review: html_tag_remover(review))

'''
remove non-alphabetical characters
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))

'''
remove extra spaces
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub(' +', ' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub(' +', ' ', review))

'''
perform contractions on the reviews
'''
def expand_contractions(review):
    review = contractions.fix(review)
    return review

train['review_body'] = train['review_body'].apply(lambda review: expand_contractions(review))
test['review_body'] = test['review_body'].apply(lambda review: expand_contractions(review))

In [9]:
'''
remove the stop words AND perform lemmatization

'''
avg_len_before_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

def remove_stopwords(review):
    stop_words_english = set(stopwords.words('english'))
    review_word_tokens = word_tokenize(review)
    filtered_review = [word for word in review_word_tokens if not word in stop_words_english]
    return filtered_review

train['review_body'] = train['review_body'].apply(lambda review: remove_stopwords(review))
test['review_body'] = test['review_body'].apply(lambda review: remove_stopwords(review))

def review_lemmatize(review):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = [lemmatizer.lemmatize(word) for word in review]
    return ' '.join(lemmatized_review)    

train['review_body'] = train['review_body'].apply(lambda review: review_lemmatize(review))
test['review_body'] = test['review_body'].apply(lambda review: review_lemmatize(review))

avg_len_after_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

# Task 2: Word Embedding

In [10]:
all_Sentences = [sentence.split(' ') for sentence in train['review_body'].to_list()]

In [11]:
# Custom Word2Vec
custom_model = gensim.models.Word2Vec(all_Sentences, vector_size = 300, min_count=9, window=13)

In [12]:
print('Check semantic similarities of the generated vectors:')
print(custom_model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1)[0])
print('Excellent ~ Outstanding:', custom_model.wv.similarity('excellent', 'outstanding'))
print('time ~ schedule:', custom_model.wv.similarity('time', 'schedule'))

Check semantic similarities of the generated vectors:
('touted', 0.7953194379806519)
Excellent ~ Outstanding: 0.7759805
time ~ schedule: 0.20508367


In [13]:
del all_Sentences, custom_model
gc.collect()
all_Sentences = [1]
custom_model = [1]

# Task 3: Simple Models

In [17]:
# Average word2Vec vectors
def average_vectors(review, label):
    temp_review = review.split(' ')
    review_vector = np.array([pretrained_w2v[word] for word in temp_review if word in pretrained_w2v])
    if len(review_vector) >=1:
#         review_vector = []
#         for word in words:
#             review_vector.append(pretrained_w2v[word])
        return review_vector, label

def average_vectors_concat(review, label):
    temp_review = review.split(' ')
    review_vector = np.array([pretrained_w2v[word] for word in temp_review[:10] if word in pretrained_w2v])
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(review_vector)==0:
        review_vector = np.zeros((1, 300))
    review_vector = np.concatenate(review_vector, axis=0)
    
    # In the case where the total dim of the feature vector is <3000 add the padding with zeros
    if len(review_vector)<3000:
        review_vector = np.concatenate([review_vector, np.zeros(3000-len(data))])
    return review_vector/10, label

#     review_vector = np.array([pretrained_w2v[word] for word in temp_review[:20] if word in pretrained_w2v])
    
#     # can be the case where the words in the review are not found in the W2V vocabulary
#     if len(review_vector)==0:
#         review_vector = np.zeros((20, 300))
    
#     # In the case where the total dim of the feature vector is <20 add the padding with zeros
#     elif len(review_vector)<20:
#         review_vector = np.concatenate([review_vector, np.zeros((20-len(review_vector), 300))])
        
#     return review_vector, label
    
    
def featurization(dataset, concat = False):
    features = []
    y_labels = []
    concat = concat
    
    for review, label in zip(dataset['review_body'], dataset['label']):
        try:
            if not concat:
                x, y = average_vectors(review, label)
                features.append(np.mean(x, axis=0))
            else:
                x, y = average_vectors_concat(review, label)
                features.append(x)
                
            y_labels.append(y)
        
        except:
            pass
    return features, y_labels

# Vectors without concatenation
w2v_pretrain_train_x, w2v_pretrain_train_y = featurization(train)
w2v_pretrain_test_x, w2v_pretrain_test_y = featurization(test)

# Vectors with concatenation
w2v_pretrain_train_concat_x, w2v_pretrain_train_concat_y = featurization(train, True)
w2v_pretrain_test_concat_x, w2v_pretrain_test_concat_y = featurization(test, True)

In [18]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(min_df = 0.001)
tfidf_X_train = tfidf_vectorizer.fit_transform(list(train['review_body']))
tfidf_X_train = pd.DataFrame(tfidf_X_train.toarray())

tfidf_X_test = tfidf_vectorizer.transform(list(test['review_body']))
tfidf_X_test = pd.DataFrame(tfidf_X_test.toarray())

tfidf_Y_train = train['label']
tfidf_Y_test = test['label']

tfidf_Y_train = tfidf_Y_train.astype('int')
tfidf_Y_test = tfidf_Y_test.astype('int')

In [19]:
# Training Perceptron Model on Average Word2Vec Features
perceptr_w2v = Perceptron(random_state = 100, eta0=0.1)
perceptr_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_test = perceptr_w2v.predict(w2v_pretrain_test_x)

# Training Perceptron Model on TF-IDF Features
perceptr_tfidf = Perceptron(random_state = 100, eta0=0.1)
perceptr_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_test = perceptr_tfidf.predict(tfidf_X_test)

target_names = ['class 1', 'class 2', 'class 3']
report_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_test, target_names=target_names, output_dict=True)
report_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_test, target_names=target_names, output_dict=True)

In [20]:
print('Accuracy values PERCEPTRON for w2v and tfidf features:')
print(report_w2v['accuracy'], report_tfidf['accuracy'])

Accuracy values PERCEPTRON for w2v and tfidf features:
0.5805374728759807 0.6170833333333333


In [21]:
# Training SVM Model on Average Word2Vec Features
svm_w2v = LinearSVC(random_state=100, max_iter=1000)
svm_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_svm_test = svm_w2v.predict(w2v_pretrain_test_x)

# Training SVM Model on TFIDF Features
svm_tfidf = LinearSVC(random_state=100, max_iter=1000)
svm_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_svm_test = svm_tfidf.predict(tfidf_X_test)

report_svm_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_svm_test, target_names=target_names, output_dict=True)
report_svm_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_svm_test, target_names=target_names, output_dict=True)

In [22]:
print('Accuracy values SVM for w2v and tfidf features:')
print(report_svm_w2v['accuracy'], report_svm_tfidf['accuracy'])

Accuracy values SVM for w2v and tfidf features:
0.627691537305959 0.6685


In [23]:
# del globals()['pretrained_w2v']

# Task 4: Feedforward Neural Networks

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset

In [27]:
device = torch.device('cpu')

In [28]:

class MLP(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 300):
        super(MLP, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class MLP_concat(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 3000):
        super(MLP_concat, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = MLP()
model_concat = MLP_concat()
model = model
model_concat = model_concat
print(model)
print(model_concat)

MLP(
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
)
MLP_concat(
  (fc1): Linear(in_features=3000, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
)


-- Task 4(a) using the average Word2Vec vectors

In [29]:
train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_x), torch.LongTensor(w2v_pretrain_train_y))
test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_x), torch.LongTensor(w2v_pretrain_test_y))

# Data Loader
train_batch_size=16
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=16
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# number of epochs to train the model
n_epochs = 20

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == (target-1)).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 1.094978 	Validation Loss: 1.088335 	Epoch Accuracy: 0.429311
Epoch: 2 	Training Loss: 1.051001 	Validation Loss: 1.003671 	Epoch Accuracy: 0.502504
Epoch: 3 	Training Loss: 0.954951 	Validation Loss: 0.933668 	Epoch Accuracy: 0.549407
Epoch: 4 	Training Loss: 0.907306 	Validation Loss: 0.918480 	Epoch Accuracy: 0.562093
Epoch: 5 	Training Loss: 0.883382 	Validation Loss: 0.882331 	Epoch Accuracy: 0.592472
Epoch: 6 	Training Loss: 0.861866 	Validation Loss: 0.866874 	Epoch Accuracy: 0.604740
Epoch: 7 	Training Loss: 0.844556 	Validation Loss: 0.853242 	Epoch Accuracy: 0.615173
Epoch: 8 	Training Loss: 0.832535 	Validation Loss: 0.839693 	Epoch Accuracy: 0.621015
Epoch: 9 	Training Loss: 0.824333 	Validation Loss: 0.840188 	Epoch Accuracy: 0.624019
Epoch: 10 	Training Loss: 0.818082 	Validation Loss: 0.829889 	Epoch Accuracy: 0.629027
Epoch: 11 	Training Loss: 0.813533 	Validation Loss: 0.825328 	Epoch Accuracy: 0.629277
Epoch: 12 	Training Loss: 0.809082 	Valid

-- Test Dataset Accuracy classwise

In [30]:
# test_batch_size= len(w2v_pretrain_test_x)
# test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
        
#         mat = confusion_matrix((target-1),ypred)
#         ans = mat.diagonal()/mat.sum(axis=1)
        
# print("Accuracy Values for each class")

# for i,acc in enumerate(ans):
#     print(f"Class {i+1} : {acc: .6f}")
print(classification_report(main_tar, predss))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67      4085
           1       0.58      0.47      0.52      3979
           2       0.64      0.77      0.70      3918

    accuracy                           0.63     11982
   macro avg       0.63      0.64      0.63     11982
weighted avg       0.63      0.63      0.63     11982



Task 4(b) 10 word vectors concatenated

In [31]:

train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_concat_x), torch.LongTensor(w2v_pretrain_train_concat_y))

test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_concat_x), torch.LongTensor(w2v_pretrain_test_concat_y))

# Data Loader
train_batch_size=16
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=16
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model_concat.parameters(), lr=0.002)

# number of epochs to train the model
n_epochs = 10

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_concat.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_concat.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 0.976190 	Validation Loss: 0.952041 	Epoch Accuracy: 0.537061
Epoch: 2 	Training Loss: 0.898768 	Validation Loss: 0.924447 	Epoch Accuracy: 0.553955
Epoch: 3 	Training Loss: 0.849942 	Validation Loss: 0.932289 	Epoch Accuracy: 0.558015
Epoch: 4 	Training Loss: 0.782588 	Validation Loss: 0.964667 	Epoch Accuracy: 0.541776
Epoch: 5 	Training Loss: 0.667988 	Validation Loss: 1.040065 	Epoch Accuracy: 0.529597
Epoch: 6 	Training Loss: 0.510079 	Validation Loss: 1.256078 	Epoch Accuracy: 0.530251
Epoch: 7 	Training Loss: 0.352025 	Validation Loss: 1.622535 	Epoch Accuracy: 0.524358
Epoch: 8 	Training Loss: 0.227284 	Validation Loss: 1.925074 	Epoch Accuracy: 0.514536
Epoch: 9 	Training Loss: 0.149157 	Validation Loss: 2.293204 	Epoch Accuracy: 0.510084
Epoch: 10 	Training Loss: 0.116174 	Validation Loss: 2.711413 	Epoch Accuracy: 0.498298


In [33]:
# test_batch_size= len(w2v_pretrain_test_concat_x)
# test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model_concat.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
#         mat = confusion_matrix(target-1,ypred)
#         ans = mat.diagonal()/mat.sum(axis=1)
        
print("Accuracy Values for each class")

# for i,acc in enumerate(ans):
#     print(f"Class {i+1} : {acc: .6f}")
print(classification_report(main_tar, predss))

Accuracy Values for each class
              precision    recall  f1-score   support

           0       0.52      0.54      0.53      2600
           1       0.45      0.42      0.43      2602
           2       0.53      0.54      0.53      2434

    accuracy                           0.50      7636
   macro avg       0.50      0.50      0.50      7636
weighted avg       0.50      0.50      0.50      7636



In [35]:
del globals()['tfidf_X_train'], globals()['tfidf_X_test'], globals()['tfidf_Y_train'], globals()['tfidf_Y_test']

del globals()['w2v_pretrain_train_x'], globals()['w2v_pretrain_train_y']
del globals()['w2v_pretrain_test_x'], globals()['w2v_pretrain_test_y']

del globals()['w2v_pretrain_train_concat_x'], globals()['w2v_pretrain_train_concat_y']
del globals()['w2v_pretrain_test_concat_x'], globals()['w2v_pretrain_test_concat_y']

del globals()['model'], globals()['model_concat'], globals()['train_data'], globals()['test_data']
del globals()['Y_pred_w2v_test'], globals()['Y_pred_tfidf_test'], globals()['Y_pred_w2v_svm_test'], globals()['Y_pred_tfidf_svm_test']

del globals()['train_loader'], globals()['test_loader']
del globals()['main_tar'], globals()['predss']



gc.collect()

84

# Task 5 Recurrent Neural Networks

In [34]:
# Pretrained Word2Vec model:
# pretrained_w2v = api.load('word2vec-google-news-300')


In [50]:
'''
limiting the maximum review length to 20 by truncating longer reviews and padding
shorter reviews with a null value (0)
'''
# Average word2Vec vectors
def average_vectors_rnn(review):
    temp_review = review.split(' ')
        
#     review_vector = []
#     for word in words:
#         review_vector.append()
#     review_vector = np.array(review_vector)

    review_vector = np.array([pretrained_w2v[word] for word in temp_review[:20] if word in pretrained_w2v])
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(review_vector)==0:
        review_vector = np.zeros((20, 300))
    
    # In the case where the total dim of the feature vector is <20 add the padding with zeros
    elif len(review_vector)<20:
        review_vector = np.concatenate([review_vector, np.zeros((20-len(review_vector), 300))])
        
    return review_vector
    
def featurization_rnn(dataset):
    features = []

    for review in dataset['review_body']:
        x = average_vectors_rnn(review)
        features.append(x)
        
    return features

# Review Vectors for  first 20 words each
w2v_pretrain_train_x = featurization_rnn(train)
w2v_pretrain_train_y = train['label']
w2v_pretrain_test_x = featurization_rnn(test)
w2v_pretrain_test_y = test['label']



- Task 5(a): Train a simple RNN for sentiment analysis.


In [61]:
class rnn_model(nn.Module):
    def __init__(self):
        super(rnn_model, self).__init__()

        # self.hidden_size = hidden_size

        # self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)
        self.rnn_layer = nn.RNN(300, 20, batch_first = True)
        
        self.fc = nn.Linear(20,3)

    def forward(self, input):
        output = input.view(-1,20,300)
        output, hidden = self.rnn_layer(output)
        output=self.fc(output[:,-1,:])
                
        # hidden = self.i2h(combined)
        # output = self.i2o(combined)
        # output = self.softmax(output)
        return output

model = rnn_model()
print(model)

rnn_model(
  (rnn_layer): RNN(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=3, bias=True)
)


In [51]:
train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_x), torch.LongTensor(w2v_pretrain_train_y))
test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_x), torch.LongTensor(w2v_pretrain_test_y))

# Data Loader
train_batch_size=256
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=256
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

In [59]:
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# number of epochs to train the model
n_epochs = 20

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))
    
model.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target-1)
        ypred = output.argmax(dim = 1)
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
        
print("Accuracy Values for each class")

print(classification_report(main_tar, predss, digits=6))

Epoch: 1 	Training Loss: 0.979274 	Validation Loss: 0.980452 	Epoch Accuracy: 0.511667
Epoch: 2 	Training Loss: 0.972369 	Validation Loss: 0.978157 	Epoch Accuracy: 0.514167
Epoch: 3 	Training Loss: 0.972523 	Validation Loss: 0.977714 	Epoch Accuracy: 0.509667
Epoch: 4 	Training Loss: 0.972811 	Validation Loss: 0.974993 	Epoch Accuracy: 0.511833
Epoch: 5 	Training Loss: 0.970838 	Validation Loss: 0.973990 	Epoch Accuracy: 0.509417
Epoch: 6 	Training Loss: 0.968697 	Validation Loss: 0.971937 	Epoch Accuracy: 0.514000
Epoch: 7 	Training Loss: 0.973769 	Validation Loss: 0.978835 	Epoch Accuracy: 0.507667
Epoch: 8 	Training Loss: 0.971384 	Validation Loss: 0.974247 	Epoch Accuracy: 0.510000
Epoch: 9 	Training Loss: 0.969083 	Validation Loss: 0.973396 	Epoch Accuracy: 0.509167
Epoch: 10 	Training Loss: 0.969155 	Validation Loss: 0.974671 	Epoch Accuracy: 0.511167
Epoch: 11 	Training Loss: 0.969270 	Validation Loss: 0.978291 	Epoch Accuracy: 0.514333
Epoch: 12 	Training Loss: 0.972533 	Valid

Task 5(b): Considering a gated recurrent unit cell.

In [64]:
class gru_model(nn.Module):
    def __init__(self):
        super(gru_model, self).__init__()

        # self.hidden_size = hidden_size

        # self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)
        self.gru_layer = nn.GRU(300, 20, batch_first = True)
        
        self.fc = nn.Linear(20,3)

    def forward(self, input):
        output = input.view(-1,20,300)
        output, hidden = self.gru_layer(output)
        output=self.fc(output[:,-1,:])
                
        # hidden = self.i2h(combined)
        # output = self.i2o(combined)
        # output = self.softmax(output)
        return output

model_gru = gru_model()
print(model_gru)

gru_model(
  (gru_layer): GRU(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=3, bias=True)
)


In [65]:
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model_gru.parameters(), lr=0.001)

# number of epochs to train the model
n_epochs = 20

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_gru.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_gru(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_gru.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_gru(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))
    
model_gru.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_gru(data)
        # calculate the loss
        loss = criterion(output, target-1)
        ypred = output.argmax(dim = 1)
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
        
print("Accuracy Values for each class")

print(classification_report(main_tar, predss, digits=6))

Epoch: 1 	Training Loss: 1.046855 	Validation Loss: 0.914843 	Epoch Accuracy: 0.547333
Epoch: 2 	Training Loss: 0.874030 	Validation Loss: 0.859234 	Epoch Accuracy: 0.601833
Epoch: 3 	Training Loss: 0.825780 	Validation Loss: 0.819515 	Epoch Accuracy: 0.629833
Epoch: 4 	Training Loss: 0.799137 	Validation Loss: 0.795593 	Epoch Accuracy: 0.639417
Epoch: 5 	Training Loss: 0.785343 	Validation Loss: 0.796183 	Epoch Accuracy: 0.636333
Epoch: 6 	Training Loss: 0.772140 	Validation Loss: 0.779161 	Epoch Accuracy: 0.649917
Epoch: 7 	Training Loss: 0.762275 	Validation Loss: 0.772579 	Epoch Accuracy: 0.651083
Epoch: 8 	Training Loss: 0.753916 	Validation Loss: 0.774624 	Epoch Accuracy: 0.652333
Epoch: 9 	Training Loss: 0.746189 	Validation Loss: 0.767008 	Epoch Accuracy: 0.660583
Epoch: 10 	Training Loss: 0.739877 	Validation Loss: 0.766179 	Epoch Accuracy: 0.658667
Epoch: 11 	Training Loss: 0.734509 	Validation Loss: 0.765252 	Epoch Accuracy: 0.663250
Epoch: 12 	Training Loss: 0.727795 	Valid

- Task 5(c): Considering a  LSTM unit cell.

In [68]:
class lstm_model(nn.Module):
    def __init__(self):
        super(lstm_model, self).__init__()

        # self.hidden_size = hidden_size

        # self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)
        self.lstm_layer = nn.LSTM(300, 20, batch_first = True)
        
        self.fc = nn.Linear(20,3)

    def forward(self, input):
        output = input.view(-1,20,300)
        output, hidden = self.lstm_layer(output)
        output=self.fc(output[:,-1,:])
                
        # hidden = self.i2h(combined)
        # output = self.i2o(combined)
        # output = self.softmax(output)
        return output

model_lstm = lstm_model()
print(model_lstm)

lstm_model(
  (lstm_layer): LSTM(300, 20, batch_first=True)
  (fc): Linear(in_features=20, out_features=3, bias=True)
)


In [69]:
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=0.001)

# number of epochs to train the model
n_epochs = 20

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_lstm.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_lstm(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_lstm.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_lstm(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))
    
model_lstm.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_lstm(data)
        # calculate the loss
        loss = criterion(output, target-1)
        ypred = output.argmax(dim = 1)
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
        
print("Accuracy Values for each class")

print(classification_report(main_tar, predss, digits=6))

Epoch: 1 	Training Loss: 1.047219 	Validation Loss: 0.926294 	Epoch Accuracy: 0.546333
Epoch: 2 	Training Loss: 0.883900 	Validation Loss: 0.875389 	Epoch Accuracy: 0.587417
Epoch: 3 	Training Loss: 0.842375 	Validation Loss: 0.839077 	Epoch Accuracy: 0.612833
Epoch: 4 	Training Loss: 0.820837 	Validation Loss: 0.837278 	Epoch Accuracy: 0.609333
Epoch: 5 	Training Loss: 0.805212 	Validation Loss: 0.817233 	Epoch Accuracy: 0.626500
Epoch: 6 	Training Loss: 0.791845 	Validation Loss: 0.807118 	Epoch Accuracy: 0.632083
Epoch: 7 	Training Loss: 0.777682 	Validation Loss: 0.798242 	Epoch Accuracy: 0.639583
Epoch: 8 	Training Loss: 0.768709 	Validation Loss: 0.790986 	Epoch Accuracy: 0.641417
Epoch: 9 	Training Loss: 0.759653 	Validation Loss: 0.785584 	Epoch Accuracy: 0.648583
Epoch: 10 	Training Loss: 0.750460 	Validation Loss: 0.786421 	Epoch Accuracy: 0.648917
Epoch: 11 	Training Loss: 0.744512 	Validation Loss: 0.780563 	Epoch Accuracy: 0.649917
Epoch: 12 	Training Loss: 0.738970 	Valid