In [1]:
import pandas as pd
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore")
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
import gensim.models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import tensorflow
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset
from torchvision import transforms, utils

import gc
from sys import getsizeof

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Task 1: Dataset Generation

In [2]:
# Pretrained Word2Vec model:
pretrained_w2v = api.load('word2vec-google-news-300')

print('Check semantic similarities of the generated vectors:')
print(pretrained_w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1)[0])
print('Excellent ~ Outstanding:', pretrained_w2v.similarity('excellent', 'outstanding'))
print('time ~ schedule:', pretrained_w2v.similarity('time', 'schedule'))

In [3]:
df = pd.read_csv('./data.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False)
df = df[['star_rating', 'review_body']]
class_one = df[(df['star_rating']==1) | (df['star_rating']==2)]
class_two = df[df['star_rating']==3]
class_three = df[(df['star_rating']==4) | (df['star_rating']==5)]

# del df
# gc.collect()
# df = [1]

class_one.loc[:, "label"] =1
class_two.loc[:, "label"] =2
class_three.loc[:, "label"] =3

class_one = class_one.sample(n=20000, random_state=100)
class_two = class_two.sample(n=20000, random_state=100)
class_three = class_three.sample(n=20000, random_state=100)
dataset = pd.concat([class_one, class_two, class_three])
# del class_one, class_two, class_three
# gc.collect()
# class_one = [1]
# class_two = [1]
# class_three = [1]

dataset.reset_index(drop=True)
train = dataset.sample(frac=0.8, random_state=100)
test = dataset.drop(train.index)
# del dataset
# gc.collect()
# dataset = [1]

train = train.reset_index(drop = True)
test = test.reset_index(drop = True)


del globals()['class_one'], globals()['class_two'], globals()['class_three'], globals()['dataset'], globals()['df']
gc.collect()

In [4]:


# Covert all reviews to lower case
train['review_body'] = train['review_body'].str.lower()
test['review_body'] = test['review_body'].str.lower()

'''
URL Remover code
'''
train['review_body'] = train['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
test['review_body'] = test['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

def html_tag_remover(review):
    soup = BeautifulSoup(review, 'html.parser')
    review = soup.get_text()
    return review  

train['review_body'] = train['review_body'].apply(lambda review: html_tag_remover(review))
test['review_body'] = test['review_body'].apply(lambda review: html_tag_remover(review))

'''
remove non-alphabetical characters
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))

'''
remove extra spaces
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub(' +', ' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub(' +', ' ', review))

'''
perform contractions on the reviews
'''
def expand_contractions(review):
    review = contractions.fix(review)
    return review

train['review_body'] = train['review_body'].apply(lambda review: expand_contractions(review))
test['review_body'] = test['review_body'].apply(lambda review: expand_contractions(review))

In [5]:
'''
remove the stop words AND perform lemmatization

'''
avg_len_before_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

def remove_stopwords(review):
    stop_words_english = set(stopwords.words('english'))
    review_word_tokens = word_tokenize(review)
    filtered_review = [word for word in review_word_tokens if not word in stop_words_english]
    return filtered_review

train['review_body'] = train['review_body'].apply(lambda review: remove_stopwords(review))
test['review_body'] = test['review_body'].apply(lambda review: remove_stopwords(review))

def review_lemmatize(review):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = [lemmatizer.lemmatize(word) for word in review]
    return ' '.join(lemmatized_review)    

train['review_body'] = train['review_body'].apply(lambda review: review_lemmatize(review))
test['review_body'] = test['review_body'].apply(lambda review: review_lemmatize(review))

avg_len_after_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

# Task 2: Word Embedding

In [6]:
# class MyCorpus:
#     def __init__(self):
#         self.sentences = train['review_body']
#     def __iter__(self):
#         for sentence in self.sentences:
#             yield sentence

# all_Sentences = MyCorpus()
all_Sentences = [sentence.split(' ') for sentence in train['review_body'].to_list()]

In [7]:
# Custom Word2Vec
custom_model = gensim.models.Word2Vec(all_Sentences, vector_size = 300, min_count=9, window=13)

In [8]:
print('Check semantic similarities of the generated vectors:')
print(custom_model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1)[0])
print('Excellent ~ Outstanding:', custom_model.wv.similarity('excellent', 'outstanding'))
print('time ~ schedule:', custom_model.wv.similarity('time', 'schedule'))

Check semantic similarities of the generated vectors:
('majority', 0.7271841764450073)
Excellent ~ Outstanding: 0.7602734
time ~ schedule: 0.22599068


In [9]:
del all_Sentences, custom_model
gc.collect()
all_Sentences = [1]
custom_model = [1]

In [10]:
a = dir()
a = sorted(a, key = lambda x: -getsizeof(x))
a

['contractions',
 'tensorflow',
 'avg_len_before_prepro',
 'classification_report',
 'avg_len_after_prepro',
 'expand_contractions',
 'WordNetLemmatizer',
 'torch',
 'confusion_matrix',
 'html_tag_remover',
 'remove_stopwords',
 'review_lemmatize',
 'TfidfVectorizer',
 'accuracy_score',
 'nltk',
 'pretrained_w2v',
 'BeautifulSoup',
 'TensorDataset',
 'all_Sentences',
 'word_tokenize',
 '__builtins__',
 'custom_model',
 '__builtin__',
 '__package__',
 'class_three',
 'get_ipython',
 'DataLoader',
 'Perceptron',
 '__loader__',
 'transforms',
 'LinearSVC',
 'class_one',
 'class_two',
 'getsizeof',
 'stopwords',
 '__name__',
 '__spec__',
 'Dataset',
 '__doc__',
 'dataset',
 'gensim',
 'optim',
 'train',
 'utils',
 '_i10',
 '_iii',
 'exit',
 'quit',
 'test',
 'Out',
 'SVC',
 '___',
 '_dh',
 '_i1',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_oh',
 'api',
 'In',
 '__',
 '_i',
 'df',
 'gc',
 'nn',
 'np',
 'pd',
 're',
 'F',
 '_']

In [31]:
getsizeof(class_three)
# del globals()['df']

64

# Task 3: Simple Models

In [12]:
# Average word2Vec vectors
# all_sentence_vector = pretrained_w2v
# del pretrained_w2v
# gc.collect()
# pretrained_w2v = [1]
def average_vectors(review, label):
    temp_review = review.split(' ')
    words = [word for word in temp_review if word in pretrained_w2v]
    if len(words) >=1:
        review_vector = []
        for word in words:
            review_vector.append(pretrained_w2v[word])
        return review_vector, label

def average_vectors_concat(review, label):
    temp_review = review.split(' ')
    words = [word for word in temp_review[:10] if word in pretrained_w2v]
        
    review_vector = []
    for word in words:
        review_vector.append(pretrained_w2v[word])
    review_vector = np.array(review_vector)
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(words)==0:
        review_vector = np.zeros((1, 300))
    data_vector = np.concatenate(review_vector, axis=0)
    
    # In the case where the total dim of the feature vector is <3000 add the padding with zeros
    if len(data_vector)<3000:
        data_vector = np.concatenate([data_vector, np.zeros(3000-len(data))])
    return data_vector/10, label
    
    
def featurization(dataset, concat = False):
    features = []
    y_labels = []
    concat = concat
    
    for review, label in zip(dataset['review_body'], dataset['label']):
        try:
            if not concat:
                x, y = average_vectors(review, label)
                features.append(np.mean(x, axis=0))
            else:
                x, y = average_vectors_concat(review, label)
                features.append(x)
                
            y_labels.append(y)
        
        except:
            pass
    return features, y_labels

# Vectors without concatenation
w2v_pretrain_train_x, w2v_pretrain_train_y = featurization(train)
w2v_pretrain_test_x, w2v_pretrain_test_y = featurization(test)

# Vectors with concatenation
w2v_pretrain_train_concat_x, w2v_pretrain_train_concat_y = featurization(train, True)
w2v_pretrain_test_concat_x, w2v_pretrain_test_concat_y = featurization(test, True)

In [13]:
# Temporarily Delete google vectors to free memory
del globals()['pretrained_w2v']

In [14]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(min_df = 0.001)
tfidf_X_train = tfidf_vectorizer.fit_transform(list(train['review_body']))
tfidf_X_train = pd.DataFrame(tfidf_X_train.toarray())

tfidf_X_test = tfidf_vectorizer.transform(list(test['review_body']))
tfidf_X_test = pd.DataFrame(tfidf_X_test.toarray())

tfidf_Y_train = train['label']
tfidf_Y_test = test['label']

tfidf_Y_train = tfidf_Y_train.astype('int')
tfidf_Y_test = tfidf_Y_test.astype('int')

In [15]:
# Training Perceptron Model on Average Word2Vec Features
perceptr_w2v = Perceptron(random_state = 100, eta0=0.1)
perceptr_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_test = perceptr_w2v.predict(w2v_pretrain_test_x)

# Training Perceptron Model on TF-IDF Features
perceptr_tfidf = Perceptron(random_state = 100, eta0=0.1)
perceptr_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_test = perceptr_tfidf.predict(tfidf_X_test)

target_names = ['class 1', 'class 2', 'class 3']
report_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_test, target_names=target_names, output_dict=True)
report_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_test, target_names=target_names, output_dict=True)

In [16]:
print('Accuracy values PERCEPTRON for w2v and tfidf features:')
print(report_w2v['accuracy'], report_tfidf['accuracy'])

Accuracy values PERCEPTRON for w2v and tfidf features:
0.5805374728759807 0.6170833333333333


In [17]:
# Training SVM Model on Average Word2Vec Features
svm_w2v = LinearSVC(random_state=100, max_iter=1000)
svm_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_svm_test = svm_w2v.predict(w2v_pretrain_test_x)

# Training SVM Model on TFIDF Features
svm_tfidf = LinearSVC(random_state=100, max_iter=1000)
svm_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_svm_test = svm_tfidf.predict(tfidf_X_test)

report_svm_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_svm_test, target_names=target_names, output_dict=True)
report_svm_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_svm_test, target_names=target_names, output_dict=True)

In [18]:
print('Accuracy values SVM for w2v and tfidf features:')
print(report_svm_w2v['accuracy'], report_svm_tfidf['accuracy'])

Accuracy values SVM for w2v and tfidf features:
0.627691537305959 0.6685


# Task 4: Feedforward Neural Networks

In [19]:
device = torch.device('cpu')
class MLP(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 300):
        super(MLP, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class MLP_concat(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 3000):
        super(MLP_concat, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = MLP()
model_concat = MLP_concat()
model = model
model_concat = model_concat
print(model)
print(model_concat)

MLP(
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
)
MLP_concat(
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (fc1): Linear(in_features=3000, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
)


-- Task 4(a) using the average Word2Vec vectors

In [20]:
train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_x), torch.LongTensor(w2v_pretrain_train_y))
test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_x), torch.LongTensor(w2v_pretrain_test_y))

# Data Loader
train_batch_size=256
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=256
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# number of epochs to train the model
n_epochs = 10

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == (target-1)).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 1.101030 	Validation Loss: 1.098515 	Epoch Accuracy: 0.340845
Epoch: 2 	Training Loss: 1.098866 	Validation Loss: 1.098009 	Epoch Accuracy: 0.340594
Epoch: 3 	Training Loss: 1.098262 	Validation Loss: 1.097926 	Epoch Accuracy: 0.354115
Epoch: 4 	Training Loss: 1.098007 	Validation Loss: 1.097874 	Epoch Accuracy: 0.392005
Epoch: 5 	Training Loss: 1.097834 	Validation Loss: 1.097762 	Epoch Accuracy: 0.413370
Epoch: 6 	Training Loss: 1.097672 	Validation Loss: 1.097625 	Epoch Accuracy: 0.393257
Epoch: 7 	Training Loss: 1.097471 	Validation Loss: 1.097428 	Epoch Accuracy: 0.396845
Epoch: 8 	Training Loss: 1.097252 	Validation Loss: 1.097228 	Epoch Accuracy: 0.400768
Epoch: 9 	Training Loss: 1.097035 	Validation Loss: 1.097032 	Epoch Accuracy: 0.399099
Epoch: 10 	Training Loss: 1.096799 	Validation Loss: 1.096790 	Epoch Accuracy: 0.393257


-- Test Dataset Accuracy classwise

In [21]:
test_batch_size= len(w2v_pretrain_test_x)
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model.eval() # prep model for evaluation

with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        
        mat = confusion_matrix((target-1),ypred)
        ans = mat.diagonal()/mat.sum(axis=1)
        
print("Accuracy Values for each class")

for i,acc in enumerate(ans):
    print(f"Class {i+1} : {acc: .6f}")

Accuracy Values for each class
Class 1 :  0.053611
Class 2 :  0.471475
Class 3 :  0.667943


Task 4(b) 10 word vectors concatenated

In [22]:
# del globals()['w2v_pretrain_test_x']
# del w2v_pretrain_train_x, w2v_pretrain_train_y, w2v_pretrain_test_x, w2v_pretrain_test_y

# gc.collect()
# # del df
# df = [1]
# model = [1]
# model_concat = [1]
# train_data = [1]
# test_data = [1]
# w2v_pretrain_train_x = [1]
# w2v_pretrain_train_y = [1]
# w2v_pretrain_test_x = [1]
# w2v_pretrain_test_y = [1]


In [23]:

train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_concat_x), torch.LongTensor(w2v_pretrain_train_concat_y))

test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_concat_x), torch.LongTensor(w2v_pretrain_test_concat_y))

# Data Loader
train_batch_size=256
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=256
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model_concat.parameters(), lr=0.01)

# number of epochs to train the model
n_epochs = 10

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_concat.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_concat.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 1.101299 	Validation Loss: 1.099623 	Epoch Accuracy: 0.340492
Epoch: 2 	Training Loss: 1.099337 	Validation Loss: 1.098611 	Epoch Accuracy: 0.340492
Epoch: 3 	Training Loss: 1.098579 	Validation Loss: 1.098245 	Epoch Accuracy: 0.340492
Epoch: 4 	Training Loss: 1.098271 	Validation Loss: 1.098116 	Epoch Accuracy: 0.340492
Epoch: 5 	Training Loss: 1.098149 	Validation Loss: 1.098080 	Epoch Accuracy: 0.342588
Epoch: 6 	Training Loss: 1.098097 	Validation Loss: 1.098070 	Epoch Accuracy: 0.340885
Epoch: 7 	Training Loss: 1.098076 	Validation Loss: 1.098068 	Epoch Accuracy: 0.340754
Epoch: 8 	Training Loss: 1.098062 	Validation Loss: 1.098063 	Epoch Accuracy: 0.340754
Epoch: 9 	Training Loss: 1.098053 	Validation Loss: 1.098060 	Epoch Accuracy: 0.340754
Epoch: 10 	Training Loss: 1.098042 	Validation Loss: 1.098052 	Epoch Accuracy: 0.340754


In [24]:
test_batch_size= len(w2v_pretrain_test_concat_x)
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model_concat.eval() # prep model for evaluation

with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        
        mat = confusion_matrix(target-1,ypred)
        ans = mat.diagonal()/mat.sum(axis=1)
        
print("Accuracy Values for each class")

for i,acc in enumerate(ans):
    print(f"Class {i+1} : {acc: .6f}")

Accuracy Values for each class
Class 1 :  0.000000
Class 2 :  1.000000
Class 3 :  0.000000


In [34]:
del globals()['tfidf_X_train'], globals()['tfidf_X_test'], globals()['tfidf_Y_train'], globals()['tfidf_Y_test']

del globals()['w2v_pretrain_train_x'], globals()['w2v_pretrain_train_y']
del globals()['w2v_pretrain_test_x'], globals()['w2v_pretrain_test_y']

del globals()['w2v_pretrain_train_concat_x'], globals()['w2v_pretrain_train_concat_y']
del globals()['w2v_pretrain_test_concat_x'], globals()['w2v_pretrain_test_concat_y']

del globals()['model'], globals()['model_concat'], globals()['train_data'], globals()['test_data']
del globals()['Y_pred_w2v_test'], globals()['Y_pred_tfidf_test'], globals()['Y_pred_w2v_svm_test'], globals()['Y_pred_tfidf_svm_test']

gc.collect()








530

# Task 5 Recurrent Neural Networks

In [26]:
# del model, model_concat, train_data, test_data, w2v_pretrain_train_x, w2v_pretrain_train_y, w2v_pretrain_test_x, w2v_pretrain_test_y, w2v_pretrain_train_concat_x, w2v_pretrain_train_concat_y, w2v_pretrain_test_concat_x, w2v_pretrain_test_concat_y 

# del df
# df = [1]
# model = [1]
# model_concat = [1]
# train_data = [1]
# test_data = [1]
# w2v_pretrain_train_x = [1]
# w2v_pretrain_train_y = [1]
# w2v_pretrain_test_x = [1]
# w2v_pretrain_test_y = [1]
# w2v_pretrain_train_concat_x = [1]
# w2v_pretrain_train_concat_y = [1]
# w2v_pretrain_test_concat_x = [1]
# w2v_pretrain_test_concat_y = [1]


0

In [51]:
'''
limiting the maximum review length to 20 by truncating longer reviews and padding
shorter reviews with a null value (0)
'''
# Average word2Vec vectors
def average_vectors_rnn(review, label):
    temp_review = review.split(' ')
    words = [word for word in temp_review[:20] if word in pretrained_w2v]
        
    review_vector = []
    for word in words:
        review_vector.append(pretrained_w2v[word])
    review_vector = np.array(review_vector)
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(review_vector)==0:
        review_vector = np.zeros((20, 300))
    
    # In the case where the total dim of the feature vector is <20 add the padding with zeros
    if len(review_vector)<20:
        review_vector = np.concatenate([review_vector, np.zeros((20-len(review_vector), 300))])
    return review_vector/20, label
    
    
def featurization_rnn(dataset):
    features = []
    y_labels = []
    
    for review, label in zip(dataset['review_body'], dataset['label']):
#         try:
        x, y = average_vectors_rnn(review, label)
        features.append(x)                
        y_labels.append(y)
        
#         except:
#             pass
    return features, y_labels

# Vectors without concatenation
# w2v_pretrain_train_x, w2v_pretrain_train_y = featurization_rnn(train)
w2v_pretrain_test_x, w2v_pretrain_test_y = featurization_rnn(test)


In [21]:
del w2v_pretrain_train_x, w2v_pretrain_train_y, w2v_pretrain_test_x, w2v_pretrain_test_y
gc.collect()
w2v_pretrain_train_x = [1]
w2v_pretrain_train_y = [1]
w2v_pretrain_test_x = [1]
w2v_pretrain_test_y = [1]

In [54]:
w2v_pretrain_test_x[2]

array([[-0.00522461,  0.00014877, -0.00505371, ..., -0.003125  ,
         0.00296631, -0.01054688],
       [ 0.00358887,  0.01040039, -0.00142212, ..., -0.00839844,
        -0.0010437 , -0.00712891],
       [ 0.00576172, -0.00402832, -0.00390625, ..., -0.00339355,
         0.00123901, -0.00600586],
       ...,
       [ 0.00480957, -0.00143433, -0.00541992, ...,  0.0034668 ,
         0.00522461, -0.00820313],
       [-0.00378418,  0.00168457, -0.00324707, ...,  0.0006012 ,
         0.00678711, -0.00456543],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])