In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def unpack_dataset():
    ! wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
    ! mkdir data
    ! mv Vid* data

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
unpack_dataset()


--2020-06-24 06:35:09--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 154050105 (147M) [application/octet-stream]
Saving to: ‘Video_Games_5.json.gz’


2020-06-24 06:35:11 (84.3 MB/s) - ‘Video_Games_5.json.gz’ saved [154050105/154050105]



In [4]:
data = getDF('data/Video_Games_5.json.gz')


In [5]:
data = data.loc[(data.reviewTime.str.contains('2018'))] #| (data.reviewTime.str.contains('2016')) | (data.reviewTime.str.contains('2017')) | (data.reviewTime.str.contains('2018'))]


In [6]:
data.shape

(11814, 12)

In [7]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
98,5.0,True,"01 2, 2018",AL529YNQ4YBFE,6050036071,Juan Carlos,Works great with Xbox 360. I have 2 of these u...,Works great with Xbox 360,1514851200,,,
2599,3.0,True,"02 14, 2018",AHGW67EQ751LS,B00000J2W7,Max Werner,"It won't autosave, and it resets progress on r...",Three Stars,1518566400,,,
2704,3.0,True,"03 16, 2018",AYXTNDD9FJBVK,B00000J97G,C,Missing battery cover,Three Stars,1521158400,,{'Color:': ' Teal'},
3104,5.0,True,"01 2, 2018",A3ISBEPYLY8IMO,B00000JRSB,Jimmy,Received this in perfect condition. Great rese...,Received this in perfect condition. Great rese...,1514851200,,{'Format:': ' Video Game'},
6683,5.0,True,"04 4, 2018",A30T51SMB0UQR1,B00002DHEV,Steven,one of the best video game systems ever made.....,one of the best video game systems ever made,1522800000,,,


In [8]:
# now use user, item, reviewText & ratings
df = data[['asin', 'reviewerID', 'reviewText', 'overall']]
df.columns = ['place', 'user', 'review', 'rating']

In [9]:
del data

In [10]:
df.head()

Unnamed: 0,place,user,review,rating
98,6050036071,AL529YNQ4YBFE,Works great with Xbox 360. I have 2 of these u...,5.0
2599,B00000J2W7,AHGW67EQ751LS,"It won't autosave, and it resets progress on r...",3.0
2704,B00000J97G,AYXTNDD9FJBVK,Missing battery cover,3.0
3104,B00000JRSB,A3ISBEPYLY8IMO,Received this in perfect condition. Great rese...,5.0
6683,B00002DHEV,A30T51SMB0UQR1,one of the best video game systems ever made.....,5.0


## Split Train/Validation

In [11]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((9451, 4), (2363, 4))

In [12]:
del df

## Encode Places and Users

In [13]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [14]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [15]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(4351, 4225)

In [16]:
train_enc.head()

Unnamed: 0,place,user,review,rating
447823,0,0,"This is my favorite Nintendo game of all time,...",5.0
320902,1,1,Love it. Fits well in my hand. The programmabl...,5.0
483138,2,2,The shoulder buttons are obviously too small t...,1.0
494755,3,3,it worked,5.0
466376,4,4,i loved this game so much it was fun but reall...,5.0


## Get text from reviews, create vocab

In [17]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [18]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(4225, 4225)

In [19]:
# 261 places without any reviews our ot 10476

In [20]:
place_reviews = place_reviews.apply(lambda x: ' '.join([str(rev) for rev in x]))

In [21]:
place_reviews.head(2)

place
0    This is my favorite Nintendo game of all time,...
1    Love it. Fits well in my hand. The programmabl...
Name: review, dtype: object

In [22]:
import re

In [23]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [24]:
tok(place_reviews.iloc[0])[:20]

['this',
 'is',
 'my',
 'favorite',
 'nintendo',
 'game',
 'of',
 'all',
 'time',
 ',',
 "that's",
 'why',
 'i',
 'gave',
 'it',
 'five',
 'stars',
 ',',
 'there',
 "isn't"]

In [25]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [26]:
len(all_vocab)

455321

In [27]:
word_cts = Counter(all_vocab)
len(word_cts)

15866

In [28]:
del all_vocab

In [29]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [30]:
del word_cts, common

In [31]:
len(vocab2idx)

4751

## Review to Vector

In [32]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [33]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

17396

In [34]:
del lens

In [35]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [36]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 32)

## Dataset and Data Loader

In [37]:
class AmazonDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [38]:
train_ds = AmazonDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = AmazonDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [39]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 32, 5.0)

## Load Pre-Trained Glove

In [40]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove*.zip
    ! mv gl* data

    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [41]:
glove = loadGloveModel()

--2020-06-24 06:35:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-06-24 06:35:29--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-06-24 06:35:29--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [42]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float16')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [43]:
max([v for k, v in vocab2idx.items()])

4750

In [44]:
W = create_embedding_matrix(glove, vocab2idx)

In [45]:
del glove

## Model

In [46]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V = torch.mean(V, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        
        return (U*V).sum(1)

## Training Functions

In [47]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [48]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [49]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [50]:
def val_metrics(model, valid_dl):
    with torch.no_grad(): # reduce memory
        model.eval()
        correct = 0
        total = 0
        sum_loss = 0.0
        for u, v, v_rev, l, y in valid_dl:
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            #y_pred = y_hat > 0
            #correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train with Glove

In [51]:
len(train_enc)

9451

In [52]:
batch_size = 2_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [53]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(4225, 4351, 4751)

In [54]:
torch.cuda.empty_cache()

In [55]:
model_glove = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False, glove_weights=W).cuda()

In [56]:
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01, weight_decay=1e-5)

In [57]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=5)

train loss 19.639 valid loss 19.177
train loss 18.104 valid loss 17.498
train loss 16.441 valid loss 15.537
train loss 14.497 valid loss 13.515
train loss 12.565 valid loss 11.657


In [58]:
# unfreezing the embeddings
model_glove.embeddings.weight.requires_grad = True
parameters = filter(lambda p: p.requires_grad, model_glove.parameters())

In [59]:
train_epocs(model_glove, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=30)

train loss 10.878 valid loss 10.679
train loss 9.968 valid loss 9.750
train loss 9.078 valid loss 8.881
train loss 8.271 valid loss 8.070
train loss 7.499 valid loss 7.317
train loss 6.809 valid loss 6.623
train loss 6.142 valid loss 5.984
train loss 5.549 valid loss 5.398
train loss 4.989 valid loss 4.864
train loss 4.497 valid loss 4.386
train loss 4.051 valid loss 3.964
train loss 3.652 valid loss 3.593
train loss 3.313 valid loss 3.264
train loss 3.011 valid loss 2.974
train loss 2.719 valid loss 2.720
train loss 2.448 valid loss 2.500
train loss 2.228 valid loss 2.308
train loss 2.033 valid loss 2.140
train loss 1.875 valid loss 1.993
train loss 1.712 valid loss 1.866
train loss 1.578 valid loss 1.757
train loss 1.467 valid loss 1.663
train loss 1.369 valid loss 1.583
train loss 1.268 valid loss 1.515
train loss 1.175 valid loss 1.455
train loss 1.122 valid loss 1.403
train loss 1.065 valid loss 1.359
train loss 0.977 valid loss 1.322
train loss 0.937 valid loss 1.289
train loss 0