In [1]:
import pandas as pd
import numpy as np
import nltk
import warnings
warnings.filterwarnings("ignore")
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
import gensim.models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import gc
from sys import getsizeof

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Task 1: Dataset Generation

In [2]:
df = pd.read_csv('./data.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False)
df = df[['star_rating', 'review_body']]
class_one = df[(df['star_rating']==1) | (df['star_rating']==2)]
class_two = df[df['star_rating']==3]
class_three = df[(df['star_rating']==4) | (df['star_rating']==5)]

class_one.loc[:, "label"] =1
class_two.loc[:, "label"] =2
class_three.loc[:, "label"] =3

class_one = class_one.sample(n=20000, random_state=100)
class_two = class_two.sample(n=20000, random_state=100)
class_three = class_three.sample(n=20000, random_state=100)
dataset = pd.concat([class_one, class_two, class_three])

dataset.reset_index(drop=True)
train = dataset.sample(frac=0.8, random_state=100)
test = dataset.drop(train.index)

train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

train.to_csv('train.csv', index = False)
test.to_csv('test.csv', index = False)

del globals()['class_one'], globals()['class_two'], globals()['class_three'], globals()['dataset'], globals()['df']
train = test = df = dataset = [[99999, 99999]]
del df, train, test, dataset
gc.collect()

0

In [3]:
# Pretrained Word2Vec model:
pretrained_w2v = api.load('word2vec-google-news-300')

In [4]:
print('Check semantic similarities of the generated vectors:')
print(pretrained_w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1))
print('Excellent ~ Outstanding:', pretrained_w2v.similarity('excellent', 'outstanding'))
print('time ~ schedule:', pretrained_w2v.similarity('time', 'schedule'))

Check semantic similarities of the generated vectors:
[('queen', 0.7118193507194519)]
Excellent ~ Outstanding: 0.55674857
time ~ schedule: 0.26993576


In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
# Covert all reviews to lower case
train['review_body'] = train['review_body'].str.lower()
test['review_body'] = test['review_body'].str.lower()

'''
URL Remover code
'''
train['review_body'] = train['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
test['review_body'] = test['review_body'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])

def html_tag_remover(review):
    soup = BeautifulSoup(review, 'html.parser')
    review = soup.get_text()
    return review  

train['review_body'] = train['review_body'].apply(lambda review: html_tag_remover(review))
test['review_body'] = test['review_body'].apply(lambda review: html_tag_remover(review))

'''
remove non-alphabetical characters
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub('[^a-zA-Z]+',' ', review))

'''
remove extra spaces
'''
train['review_body'] = train['review_body'].apply(lambda review: re.sub(' +', ' ', review))
test['review_body'] = test['review_body'].apply(lambda review: re.sub(' +', ' ', review))

'''
perform contractions on the reviews
'''
def expand_contractions(review):
    review = contractions.fix(review)
    return review

train['review_body'] = train['review_body'].apply(lambda review: expand_contractions(review))
test['review_body'] = test['review_body'].apply(lambda review: expand_contractions(review))

In [7]:
'''
remove the stop words AND perform lemmatization

'''
avg_len_before_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

def remove_stopwords(review):
    stop_words_english = set(stopwords.words('english'))
    review_word_tokens = word_tokenize(review)
    filtered_review = [word for word in review_word_tokens if not word in stop_words_english]
    return filtered_review

train['review_body'] = train['review_body'].apply(lambda review: remove_stopwords(review))
test['review_body'] = test['review_body'].apply(lambda review: remove_stopwords(review))

def review_lemmatize(review):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = [lemmatizer.lemmatize(word) for word in review]
    return ' '.join(lemmatized_review)    

train['review_body'] = train['review_body'].apply(lambda review: review_lemmatize(review))
test['review_body'] = test['review_body'].apply(lambda review: review_lemmatize(review))

avg_len_after_prepro = (train['review_body'].str.len().sum() + test['review_body'].str.len().sum())/60000

# Task 2: Word Embedding

In [8]:
all_Sentences = [sentence.split(' ') for sentence in train['review_body'].to_list()]

In [9]:
# Custom Word2Vec
custom_model = gensim.models.Word2Vec(all_Sentences, vector_size = 300, min_count=9, window=13)

In [10]:
print('Check semantic similarities of the generated vectors:')
print(custom_model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn = 1)[0])
print('Excellent ~ Outstanding:', custom_model.wv.similarity('excellent', 'outstanding'))
print('time ~ schedule:', custom_model.wv.similarity('time', 'schedule'))

Check semantic similarities of the generated vectors:
('touted', 0.7799292802810669)
Excellent ~ Outstanding: 0.70792377
time ~ schedule: 0.17524442


In [11]:
del all_Sentences, custom_model
gc.collect()
all_Sentences = [1]
custom_model = [1]

In [12]:
# a = dir()
# a = sorted(a, key = lambda x: -getsizeof(x))
# a

# Task 3: Simple Models

In [14]:
# Average word2Vec vectors
# all_sentence_vector = pretrained_w2v
# del pretrained_w2v
# gc.collect()
# pretrained_w2v = [1]
def average_vectors(review, label):
    temp_review = review.split(' ')
    review_vector = np.array([pretrained_w2v[word] for word in temp_review if word in pretrained_w2v])
    if len(review_vector) >=1:
#         review_vector = []
#         for word in words:
#             review_vector.append(pretrained_w2v[word])
        return review_vector, label

def average_vectors_concat(review, label):
    temp_review = review.split(' ')
    review_vector = np.array([pretrained_w2v[word] for word in temp_review[:10] if word in pretrained_w2v])
        
#     review_vector = []
#     for word in words:
#         review_vector.append(pretrained_w2v[word])
#     review_vector = np.array(review_vector)
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(review_vector)==0:
        review_vector = np.zeros((1, 300))
    review_vector = np.concatenate(review_vector, axis=0)
    
    # In the case where the total dim of the feature vector is <3000 add the padding with zeros
    if len(review_vector)<3000:
        review_vector = np.concatenate([review_vector, np.zeros(3000-len(data))])
    return review_vector/10, label
    
    
def featurization(dataset, concat = False):
    features = []
    y_labels = []
    concat = concat
    
    for review, label in zip(dataset['review_body'], dataset['label']):
        try:
            if not concat:
                x, y = average_vectors(review, label)
                features.append(np.mean(x, axis=0))
            else:
                x, y = average_vectors_concat(review, label)
                features.append(x)
                
            y_labels.append(y)
        
        except:
            pass
    return features, y_labels

# Vectors without concatenation
w2v_pretrain_train_x, w2v_pretrain_train_y = featurization(train)
w2v_pretrain_test_x, w2v_pretrain_test_y = featurization(test)

# Vectors with concatenation
w2v_pretrain_train_concat_x, w2v_pretrain_train_concat_y = featurization(train, True)
w2v_pretrain_test_concat_x, w2v_pretrain_test_concat_y = featurization(test, True)

In [15]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(min_df = 0.001)
tfidf_X_train = tfidf_vectorizer.fit_transform(list(train['review_body']))
tfidf_X_train = pd.DataFrame(tfidf_X_train.toarray())

tfidf_X_test = tfidf_vectorizer.transform(list(test['review_body']))
tfidf_X_test = pd.DataFrame(tfidf_X_test.toarray())

tfidf_Y_train = train['label']
tfidf_Y_test = test['label']

tfidf_Y_train = tfidf_Y_train.astype('int')
tfidf_Y_test = tfidf_Y_test.astype('int')

In [16]:
# Training Perceptron Model on Average Word2Vec Features
perceptr_w2v = Perceptron(random_state = 100, eta0=0.1)
perceptr_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_test = perceptr_w2v.predict(w2v_pretrain_test_x)

# Training Perceptron Model on TF-IDF Features
perceptr_tfidf = Perceptron(random_state = 100, eta0=0.1)
perceptr_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_test = perceptr_tfidf.predict(tfidf_X_test)

target_names = ['class 1', 'class 2', 'class 3']
report_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_test, target_names=target_names, output_dict=True)
report_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_test, target_names=target_names, output_dict=True)

In [17]:
print('Accuracy values PERCEPTRON for w2v and tfidf features:')
print(report_w2v['accuracy'], report_tfidf['accuracy'])

Accuracy values PERCEPTRON for w2v and tfidf features:
0.5805374728759807 0.6170833333333333


In [18]:
# Training SVM Model on Average Word2Vec Features
svm_w2v = LinearSVC(random_state=100, max_iter=1000)
svm_w2v.fit(w2v_pretrain_train_x, w2v_pretrain_train_y)
Y_pred_w2v_svm_test = svm_w2v.predict(w2v_pretrain_test_x)

# Training SVM Model on TFIDF Features
svm_tfidf = LinearSVC(random_state=100, max_iter=1000)
svm_tfidf.fit(tfidf_X_train, tfidf_Y_train)
Y_pred_tfidf_svm_test = svm_tfidf.predict(tfidf_X_test)

report_svm_w2v = classification_report(w2v_pretrain_test_y, Y_pred_w2v_svm_test, target_names=target_names, output_dict=True)
report_svm_tfidf = classification_report(tfidf_Y_test, Y_pred_tfidf_svm_test, target_names=target_names, output_dict=True)

In [19]:
print('Accuracy values SVM for w2v and tfidf features:')
print(report_svm_w2v['accuracy'], report_svm_tfidf['accuracy'])

Accuracy values SVM for w2v and tfidf features:
0.627691537305959 0.6685


In [20]:
del globals()['pretrained_w2v']

# Task 4: Feedforward Neural Networks

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset

In [22]:
device = torch.device('cpu')

In [23]:

class MLP(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 300):
        super(MLP, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
class MLP_concat(nn.Module):
    def __init__(self, classification = "binary", vocab_size = 3000):
        super(MLP_concat, self).__init__()
        hidden_1 = 100
        hidden_2 = 10
        self.fc1 = nn.Linear(vocab_size, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        if classification == "binary":
            self.fc3 = nn.Linear(hidden_2, 3)
        else:
            # For multi-classification
            self.fc3 = nn.Linear(hidden_2, 4)  
        
#         self.sig = nn.Sigmoid()
#         self.soft = nn.Softmax(dim = 1)
    
    
    def forward(self, x):
        x = x.view(-1, x.shape[1])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = MLP()
model_concat = MLP_concat()
model = model
model_concat = model_concat
print(model)
print(model_concat)

MLP(
  (fc1): Linear(in_features=300, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
)
MLP_concat(
  (fc1): Linear(in_features=3000, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
)


-- Task 4(a) using the average Word2Vec vectors

In [24]:
train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_x), torch.LongTensor(w2v_pretrain_train_y))
test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_x), torch.LongTensor(w2v_pretrain_test_y))

# Data Loader
train_batch_size=16
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=16
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# number of epochs to train the model
n_epochs = 1

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == (target-1)).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 1.098787 	Validation Loss: 1.096163 	Epoch Accuracy: 0.402020


-- Test Dataset Accuracy classwise

In [25]:
# test_batch_size= len(w2v_pretrain_test_x)
# test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, (target-1))
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
        
#         mat = confusion_matrix((target-1),ypred)
#         ans = mat.diagonal()/mat.sum(axis=1)
        
# print("Accuracy Values for each class")

# for i,acc in enumerate(ans):
#     print(f"Class {i+1} : {acc: .6f}")
print(classification_report(main_tar, predss))

              precision    recall  f1-score   support

           0       0.58      0.08      0.14      4085
           1       0.35      0.75      0.48      3979
           2       0.51      0.39      0.44      3918

    accuracy                           0.40     11982
   macro avg       0.48      0.40      0.35     11982
weighted avg       0.48      0.40      0.35     11982



Task 4(b) 10 word vectors concatenated

In [26]:
        # del globals()['w2v_pretrain_test_x']
# del w2v_pretrain_train_x, w2v_pretrain_train_y, w2v_pretrain_test_x, w2v_pretrain_test_y

# gc.collect()
# # del df
# df = [1]
# model = [1]
# model_concat = [1]
# train_data = [1]
# test_data = [1]
# w2v_pretrain_train_x = [1]
# w2v_pretrain_train_y = [1]
# w2v_pretrain_test_x = [1]
# w2v_pretrain_test_y = [1]


In [27]:

train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_concat_x), torch.LongTensor(w2v_pretrain_train_concat_y))

test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_concat_x), torch.LongTensor(w2v_pretrain_test_concat_y))

# Data Loader
train_batch_size=16
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=16
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model_concat.parameters(), lr=0.002)

# number of epochs to train the model
n_epochs = 1

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_concat.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_concat.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))

Epoch: 1 	Training Loss: 0.978633 	Validation Loss: 0.925617 	Epoch Accuracy: 0.551729


In [28]:
# test_batch_size= len(w2v_pretrain_test_concat_x)
# test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)
model_concat.eval() # prep model for evaluation
main_tar = []
predss = []
with torch.no_grad():
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        #valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        #correct += (ypred == target-1).float().sum()
        for i in np.array(target-1):
            main_tar.append(i)
        for j in np.array(ypred):
            predss.append(j)
#         mat = confusion_matrix(target-1,ypred)
#         ans = mat.diagonal()/mat.sum(axis=1)
        
print("Accuracy Values for each class")

# for i,acc in enumerate(ans):
#     print(f"Class {i+1} : {acc: .6f}")
print(classification_report(main_tar, predss))

Accuracy Values for each class
              precision    recall  f1-score   support

           0       0.55      0.61      0.58      2600
           1       0.51      0.43      0.47      2602
           2       0.59      0.61      0.60      2434

    accuracy                           0.55      7636
   macro avg       0.55      0.55      0.55      7636
weighted avg       0.55      0.55      0.55      7636



In [29]:
del globals()['tfidf_X_train'], globals()['tfidf_X_test'], globals()['tfidf_Y_train'], globals()['tfidf_Y_test']

del globals()['w2v_pretrain_train_x'], globals()['w2v_pretrain_train_y']
del globals()['w2v_pretrain_test_x'], globals()['w2v_pretrain_test_y']

del globals()['w2v_pretrain_train_concat_x'], globals()['w2v_pretrain_train_concat_y']
del globals()['w2v_pretrain_test_concat_x'], globals()['w2v_pretrain_test_concat_y']

del globals()['model'], globals()['model_concat'], globals()['train_data'], globals()['test_data']
del globals()['Y_pred_w2v_test'], globals()['Y_pred_tfidf_test'], globals()['Y_pred_w2v_svm_test'], globals()['Y_pred_tfidf_svm_test']

del globals()['train_loader'], globals()['test_loader']
del globals()['main_tar'], globals()['predss']



gc.collect()

0

# Task 5 Recurrent Neural Networks

In [30]:
# Pretrained Word2Vec model:
pretrained_w2v = api.load('word2vec-google-news-300')


In [32]:
'''
limiting the maximum review length to 20 by truncating longer reviews and padding
shorter reviews with a null value (0)
'''
# Average word2Vec vectors
def average_vectors_rnn(review):
    temp_review = review.split(' ')
   
        
#     review_vector = []
#     for word in words:
#         review_vector.append()
#     review_vector = np.array(review_vector)

    review_vector = np.array([pretrained_w2v[word] for word in temp_review[:20] if word in pretrained_w2v])
    
    # can be the case where the words in the review are not found in the W2V vocabulary
    if len(review_vector)==0:
        review_vector = np.zeros((20, 300))
    
    # In the case where the total dim of the feature vector is <20 add the padding with zeros
    elif len(review_vector)<20:
        review_vector = np.concatenate([review_vector, np.zeros((20-len(review_vector), 300))])
        
    return review_vector
    
    
def featurization_rnn(dataset):
    features = []

    for review in dataset['review_body']:
#         try:
        x = average_vectors_rnn(review)
        features.append(x)
        
#         except:
#             pass
    return features

# Vectors without concatenation
w2v_pretrain_train_x = featurization_rnn(train)
w2v_pretrain_train_y = train['label']
w2v_pretrain_test_x, w2v_pretrain_test_y = featurization_rnn(test)


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Ayan\AppData\Local\Programs\Python\Python310\lib\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Ayan\AppData\Local\Temp\ipykernel_13376\3504712409.py", line 43, in <cell line: 43>
    w2v_pretrain_test_x, w2v_pretrain_test_y = featurization_rnn(test)
  File "C:\Users\Ayan\AppData\Local\Temp\ipykernel_13376\3504712409.py", line 33, in featurization_rnn
    x = average_vectors_rnn(review)
  File "C:\Users\Ayan\AppData\Local\Temp\ipykernel_13376\3504712409.py", line 23, in average_vectors_rnn
    review_vector = np.concatenate([review_vector, np.zeros((20-len(review_vector), 300))])
  File "<__array_function__ internals>", line 200, in concatenate
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 46.9 KiB for an array with shape (20, 300) and data type float64

During handling of the above exception, another exception occurred:

Tra

In [33]:

# del globals()['w2v_pretrain_train_x'], globals()['w2v_pretrain_train_y']
# del globals()['w2v_pretrain_test_x'], globals()['w2v_pretrain_test_y']
# del globals()['perceptr_w2v'], globals()['perceptr_tfidf'], globals()['svm_w2v'], globals()['svm_tfidf']




gc.collect()

360

In [None]:
class rnn_model(nn.Module):
    def __init__(self):
        super(modelRNN, self).__init__()

        # self.hidden_size = hidden_size

        # self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)
        self.rnn = nn.RNN(300, 20, batch_first = True, nonlinearity='relu')
        
        self.fc = nn.Linear(20,5)

    def forward(self, input):
        output = input.view(-1,20,300)
        output, hidden = self.rnn(output)
        output=self.fc(output[:,-1,:])
                
        # hidden = self.i2h(combined)
        # output = self.i2o(combined)
        # output = self.softmax(output)
        return output

model = modelRNN()
print(model)

In [None]:
train_data=TensorDataset(torch.FloatTensor(w2v_pretrain_train_x), torch.LongTensor(w2v_pretrain_train_y))
test_data=TensorDataset(torch.FloatTensor(w2v_pretrain_test_x), torch.LongTensor(w2v_pretrain_test_y))

# Data Loader
train_batch_size=16
train_loader=DataLoader(train_data, batch_size=train_batch_size, shuffle=True)

test_batch_size=16
test_loader=DataLoader(test_data, batch_size=test_batch_size, shuffle=True)

# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
criterion = criterion
# specify optimizer (stochastic gradient descent) and learning rate = 0.01
optimizer = torch.optim.Adam(model_concat.parameters(), lr=0.002)

# number of epochs to train the model
n_epochs = 1

# initialize tracker for minimum validation loss
valid_loss_min = np.Inf # set initial "min" to infinity
best_acc = 0

for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    
    # train the model #
    model_concat.train() # prep model for training
    for data, target in train_loader: # iterates upto number of batch size
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*data.size(0)
          
    # validate the model #
    model_concat.eval() # prep model for evaluation
    correct = 0
    for data, target in test_loader:
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model_concat(data)
        # calculate the loss
        loss = criterion(output, target-1)
        # update running validation loss 
        valid_loss += loss.item()*data.size(0)
        ypred = output.argmax(dim = 1)
        correct += (ypred == target-1).float().sum()
        
    # print training/validation statistics 
    # calculate average loss over an epoch
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(test_loader.dataset)
    
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tEpoch Accuracy: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss,
        correct/len(test_loader.dataset)
        ))