import numpy as np
# ^^^ pyforest auto-imports - don't write above this line
# HW5 - Rating prediction using Amazon's Reviews
    
In this exercise, you'll train a text classification on a **subset** of the the Amazon's Reviews dataset. 

The Amazon's Reviews dataset  contains product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014.


We will focus on the Home and Kitchen segment which contains ~550k reviews and can be downloaded here: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Home_and_Kitchen_5.json.gz

You will predict the rating that was given to a product from the review.

The dataset contains the following fields for each review, in JSON format:
1. "reviewerID": "A11N155CW1UV02",
1. "asin": "B000H00VBQ",
1. "reviewerName": "AdrianaM"
1. "helpful": [0, 0]
1. "reviewText": "I had big expectations because I love English TV, in particular Investigative and detective stuff but this guy is really boring. It didn't appeal to me at all."
1. "overall": 2.0
1. "summary": "A little bit boring for me"
1. "unixReviewTime": 1399075200
1. "reviewTime": "05 3, 2014"




Please note that the **only** two fields that you are allowed to use in this exercise are "reviewText" which contains the review and "overall" which contains the rating. Other than that you have the **option** to use the "asin" field which is a unique product identifier. You may (or may not :) ) find this field useful.

In [None]:
!wget -nc http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Home_and_Kitchen_5.json.gz

## General guidelines

1. You are required to implement at least two models.
1. The first should be a CNN or an RNN (or a combination) and should include the use of Glove embeddings.
1. The second model should be implemented using the transformers package and include Transfer learning concepts that were mentioned in the Lecture.
1. Pay attention to any preprocessing steps that are needed.
1. Feel free to be creative and use any method which was mentioned in the lectures (e.g., tf-idf, pos,...) extra points will be given to creativity.
1. The main criteria for evaluation is not the over-all score but rather the entire process (preprocessing, efficient training ...)





In [1]:
import pandas as pd
import zipfile
import json
import gzip
from tqdm import tqdm

In [3]:
z = zipfile.ZipFile("/Users/Adam/workspace/yandex/Y-Data/2nd Semester/NLP/Assignment 4/glove.840B.300d.zip")
glove_pd = pd.read_csv(z.open('glove.840B.300d.txt'), sep=" ", quoting=3, header=None, index_col=0)
glove_pd.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",",-0.082752,0.67204,-0.14987,-0.064983,0.056491,0.40228,0.002775,-0.3311,-0.30691,2.0817,...,-0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918,-0.37846,-0.06589
.,0.012001,0.20751,-0.12578,-0.59325,0.12525,0.15975,0.13748,-0.33157,-0.13694,1.7893,...,0.16165,-0.066737,-0.29556,0.022612,-0.28135,0.0635,0.14019,0.13871,-0.36049,-0.035
the,0.27204,-0.06203,-0.1884,0.023225,-0.018158,0.006719,-0.13877,0.17708,0.17709,2.5882,...,-0.4281,0.16899,0.22511,-0.28557,-0.1028,-0.018168,0.11407,0.13015,-0.18317,0.1323
and,-0.18567,0.066008,-0.25209,-0.11725,0.26513,0.064908,0.12291,-0.093979,0.024321,2.4926,...,-0.59396,-0.097729,0.20072,0.17055,-0.004736,-0.039709,0.32498,-0.023452,0.12302,0.3312
to,0.31924,0.06316,-0.27858,0.2612,0.079248,-0.21462,-0.10495,0.15495,-0.03353,2.4834,...,-0.12977,0.3713,0.18888,-0.004274,-0.10645,-0.2581,-0.044629,0.082745,0.097801,0.25045


In [4]:
vec_string = '0.22418134 -0.28881392 0.13854356 0.00365387 -0.12870757 0.10243822 0.061626635 0.07318011 -0.061350107 -1.3477012 0.42037755 -0.063593924 -0.09683349 0.18086134 0.23704372 0.014126852 0.170096 -1.1491593 0.31497982 0.06622181 0.024687296 0.076693475 0.13851812 0.021302193 -0.06640582 -0.010336159 0.13523154 -0.042144544 -0.11938788 0.006948221 0.13333307 -0.18276379 0.052385733 0.008943111 -0.23957317 0.08500333 -0.006894406 0.0015864656 0.063391194 0.19177166 -0.13113557 -0.11295479 -0.14276934 0.03413971 -0.034278486 -0.051366422 0.18891625 -0.16673574 -0.057783455 0.036823478 0.08078679 0.022949161 0.033298038 0.011784158 0.05643189 -0.042776518 0.011959623 0.011552498 -0.0007971594 0.11300405 -0.031369694 -0.0061559738 -0.009043574 -0.415336 -0.18870236 0.13708843 0.005911723 -0.113035575 -0.030096142 -0.23908928 -0.05354085 -0.044904727 -0.20228513 0.0065645403 -0.09578946 -0.07391877 -0.06487607 0.111740574 -0.048649278 -0.16565254 -0.052037314 -0.078968436 0.13684988 0.0757494 -0.006275573 0.28693774 0.52017444 -0.0877165 -0.33010918 -0.1359622 0.114895485 -0.09744406 0.06269521 0.12118575 -0.08026362 0.35256687 -0.060017522 -0.04889904 -0.06828978 0.088740796 0.003964443 -0.0766291 0.1263925 0.07809314 -0.023164088 -0.5680669 -0.037892066 -0.1350967 -0.11351585 -0.111434504 -0.0905027 0.25174105 -0.14841858 0.034635577 -0.07334565 0.06320108 -0.038343467 -0.05413284 0.042197507 -0.090380974 -0.070528865 -0.009174437 0.009069661 0.1405178 0.02958134 -0.036431845 -0.08625681 0.042951006 0.08230793 0.0903314 -0.12279937 -0.013899368 0.048119213 0.08678239 -0.14450377 -0.04424887 0.018319942 0.015026873 -0.100526 0.06021201 0.74059093 -0.0016333034 -0.24960588 -0.023739101 0.016396184 0.11928964 0.13950661 -0.031624354 -0.01645025 0.14079992 -0.0002824564 -0.08052984 -0.0021310581 -0.025350995 0.086938225 0.14308536 0.17146006 -0.13943303 0.048792403 0.09274929 -0.053167373 0.031103406 0.012354865 0.21057427 0.32618305 0.18015954 -0.15881181 0.15322933 -0.22558987 -0.04200665 0.0084689725 0.038156632 0.15188617 0.13274793 0.113756925 -0.095273495 -0.049490947 -0.10265804 -0.27064866 -0.034567792 -0.018810693 -0.0010360252 0.10340131 0.13883452 0.21131058 -0.01981019 0.1833468 -0.10751636 -0.03128868 0.02518242 0.23232952 0.042052146 0.11731903 -0.15506615 0.0063580726 -0.15429358 0.1511722 0.12745973 0.2576985 -0.25486213 -0.0709463 0.17983761 0.054027 -0.09884228 -0.24595179 -0.093028545 -0.028203879 0.094398156 0.09233813 0.029291354 0.13110267 0.15682974 -0.016919162 0.23927948 -0.1343307 -0.22422817 0.14634751 -0.064993896 0.4703685 -0.027190214 0.06224946 -0.091360025 0.21490277 -0.19562101 -0.10032754 -0.09056772 -0.06203493 -0.18876675 -0.10963594 -0.27734384 0.12616494 -0.02217992 -0.16058226 -0.080475815 0.026953284 0.110732645 0.014894041 0.09416802 0.14299914 -0.1594008 -0.066080004 -0.007995227 -0.11668856 -0.13081996 -0.09237365 0.14741232 0.09180138 0.081735 0.3211204 -0.0036552632 -0.047030564 -0.02311798 0.048961394 0.08669574 -0.06766279 -0.50028914 -0.048515294 0.14144728 -0.032994404 -0.11954345 -0.14929578 -0.2388355 -0.019883996 -0.15917352 -0.052084364 0.2801028 -0.0029121689 -0.054581646 -0.47385484 0.17112483 -0.12066923 -0.042173345 0.1395337 0.26115036 0.012869649 0.009291686 -0.0026459037 -0.075331464 0.017840583 -0.26869613 -0.21820338 -0.17084768 -0.1022808 -0.055290595 0.13513643 0.12362477 -0.10980586 0.13980341 -0.20233242 0.08813751 0.3849736 -0.10653763 -0.06199595 0.028849555 0.03230154 0.023856193 0.069950655 0.19310954 -0.077677034 -0.144811'
avg_vec = np.array(vec_string.split(" "))
unk_df = pd.DataFrame({"": np.zeros(300, dtype='float32'),
                       "UNK": avg_vec}).T
pretrained_weights = pd.concat([unk_df, glove_pd])
del glove_pd
del unk_df
pretrained_weights.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
UNK,0.22418134,-0.28881392,0.13854356,0.00365387,-0.12870757,0.10243822,0.061626635,0.07318011,-0.061350107,-1.3477012,...,-0.10653763,-0.06199595,0.028849555,0.03230154,0.023856193,0.069950655,0.19310954,-0.077677034,-0.144811,
",",,-0.082752,0.67204,-0.14987,-0.064983,0.056491,0.40228,0.0027747,-0.3311,-0.30691,...,-0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918,-0.37846,-0.06589
.,,0.012001,0.20751,-0.12578,-0.59325,0.12525,0.15975,0.13748,-0.33157,-0.13694,...,0.16165,-0.066737,-0.29556,0.022612,-0.28135,0.0635,0.14019,0.13871,-0.36049,-0.035
the,,0.27204,-0.06203,-0.1884,0.023225,-0.018158,0.0067192,-0.13877,0.17708,0.17709,...,-0.4281,0.16899,0.22511,-0.28557,-0.1028,-0.018168,0.11407,0.13015,-0.18317,0.1323


In [None]:
unk = pretrained_weights.loc['UNK', :].copy()

In [None]:
glove = {key: val.values for key, val in pretrained_weights.T.items()}

First we open the reviews file and turn it into a mappable object. The formatting of the text file has newline characters so we need to split the textfile on '\n' and ignore the last entry as the file ends with '\n' and therefore the last element in the list is empty.

In [None]:
with gzip.open('reviews_Home_and_Kitchen_5.json.gz', 'rt') as f:
    content = f.read()
content = content.split('\n')[:-1]
print(content[0])
print(content[-1])

Now that we have a list of JSONifiable strings we can turn our reviews into a mapping where each review has a unique ID

In [None]:
reviews = pd.DataFrame.from_records([json.loads(r) for r in content])

In [None]:
reviews.head()

Taking a look at the fields, the "asin" field could be used to create an average rating per product feature, which could help further down the line to predict a specific rating. Let's do it

In [None]:
reviews['avg_product_rating'] = reviews.groupby('asin').overall.transform('mean')
reviews = reviews[['asin', 'reviewText', 'overall', 'avg_product_rating']]
reviews.head()

Now we need to decide how to preprocess the review texts. First lets check a few stats about the reviews, like max/min length and punctuation count. Not sure yet what that will tell us but could be interesting

In [None]:
import string
from nltk.tokenize import word_tokenize
punctuation = string.punctuation
tqdm.pandas()

In [None]:
reviews['review_len'] = reviews.reviewText.progress_apply(len)
reviews['punctuation_count'] = reviews.reviewText.progress_apply(lambda x: sum([1 for c in x if c in punctuation]))

In [None]:
reviews.review_len.hist(bins=100)
print(f'Mean review length: {reviews.review_len.mean()}')
plt.show()

In [None]:
reviews.punctuation_count.hist(bins=100)
plt.show()

Seems like the distribution of review lenghts is pareto-like, so most reviews are relatively short

In [None]:
reviews.head()

We'll use the NLTK word_tokenize and tokenize each review, and check the distribution of number of tokens per review

In [None]:
reviews['tokenized_review'] = reviews.reviewText.progress_apply(word_tokenize)
reviews['token_count'] = reviews.tokenized_review.progress_apply(len)

In [None]:
reviews.head()

In [None]:
reviews.token_count.hist(bins=100)
plt.show()

In [None]:
print(f'Mean token count: {reviews.token_count.mean()}')
print(reviews.loc[reviews.token_count <= 200].token_count.size/reviews.token_count.size)

This distribution is naturally also pareto-like, which means that we can cap the maximum number of tokens we get embeddings for at a relatively low number without losing too much information. +86% of reviews are no longer than 200 tokens

We'll create embeddings from the tokenized reviews that are padded to a max length of 200 tokens. We'll do this in pure python since I don't want to extract embeddings until after the reviews are padded

In [None]:
MAX_LENGTH = 200

In [None]:
def pad_or_truncate(some_list, target_len):
    return some_list[:target_len] + ['']*(target_len - len(some_list))
reviews['tokenized_review_pad'] = reviews.tokenized_review.progress_apply(lambda x: pad_or_truncate(x, MAX_LENGTH))

In [None]:
from collections import Counter

In [None]:
token_counts = Counter()
reviews.tokenized_review.progress_apply(lambda x: token_counts.update(x))

We'll create an entry for Out of Vocabulary words using the average vector (from https://stackoverflow.com/questions/49239941/what-is-unk-in-the-pretrained-glove-vector-files-e-g-glove-6b-50d-txt)

In [None]:
vocab = [t for t, v in token_counts.items() if ]
vocab2idx = {word: i for i, word in enumerate(vocab)}
embeddings = pd.DataFrame(vocab2idx).T.merge(pretrained_weights, left_index=True, right_index=True)

In [None]:
# reviews[['reviewText', 'overall']].to_csv('amazon_for_torchtext.csv')

In [None]:
import torch
from torch import nn
# from torch.utils.data import Dataset, DataLoader
from torchtext import data   

In [None]:
SEED = 42
torch.manual_seed(SEED)

In [None]:
!python -m spacy download en

In [None]:
TEXT = data.Field(tokenize='spacy', tokenizer_language='en', batch_first=True, include_lengths=True, fix_length=100)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [None]:
fields = [('overall', LABEL), ('reviewText',TEXT)]
#loading custom dataset
training_data=data.TabularDataset(path='amazon_for_torchtext.csv', format='csv', fields=fields, skip_header=True, )

#print preprocessed text
print(vars(training_data.examples[0]))

In [None]:
#print preprocessed text
print(vars(training_data.examples[5000]))

In [None]:
import random
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.seed(SEED))

In [None]:
#initialize glove embeddings
TEXT.build_vocab(train_data, min_freq=3, vectors="glove.840B.300d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

In [None]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.reviewText),
    sort_within_batch=True,
    device = device)

In [None]:
class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [None]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = 6
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [None]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

In [None]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.reviewText   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.overall)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.overall)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.reviewText
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.overall)
            acc = binary_accuracy(predictions, batch.overall)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')