In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def unpack_dataset():
    ! wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
    ! mkdir data
    ! mv Vid* data

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
unpack_dataset()


--2020-06-24 05:26:35--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 154050105 (147M) [application/octet-stream]
Saving to: ‘Video_Games_5.json.gz’


2020-06-24 05:26:43 (17.7 MB/s) - ‘Video_Games_5.json.gz’ saved [154050105/154050105]

mkdir: cannot create directory ‘data’: File exists


In [4]:
data = getDF('data/Video_Games_5.json.gz')

In [5]:
data.shape

(497577, 12)

In [6]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [7]:
data = data.loc[(data.reviewTime.str.contains('2018'))]# | (data.reviewTime.str.contains('2016')) | (data.reviewTime.str.contains('2017')) | (data.reviewTime.str.contains('2018'))]

In [8]:
# now use user, item, reviewText & ratings
df = data[['reviewerID', 'asin', 'reviewText', 'overall']]
df.columns = ['place', 'user', 'review', 'rating']

In [9]:
del data

In [10]:
df.head()

Unnamed: 0,place,user,review,rating
98,AL529YNQ4YBFE,6050036071,Works great with Xbox 360. I have 2 of these u...,5.0
2599,AHGW67EQ751LS,B00000J2W7,"It won't autosave, and it resets progress on r...",3.0
2704,AYXTNDD9FJBVK,B00000J97G,Missing battery cover,3.0
3104,A3ISBEPYLY8IMO,B00000JRSB,Received this in perfect condition. Great rese...,5.0
6683,A30T51SMB0UQR1,B00002DHEV,one of the best video game systems ever made.....,5.0


## Split Train/Validation

In [11]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((9451, 4), (2363, 4))

In [12]:
del df

## Encode Places and Users

In [13]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [14]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [15]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(4185, 4355)

In [16]:
train_enc.head()

Unnamed: 0,place,user,review,rating
494397,0,0,"Let's start by stating that even in 2018, you ...",5.0
444712,1,1,a ok,5.0
495561,2,2,nice,4.0
431940,3,3,Love them.,5.0
337122,4,4,"Good, pretty, cheap",5.0


## Get text from reviews, create vocab

In [17]:
place_reviews = train_enc.groupby('place').review.apply(list)

In [18]:
len(place_reviews), sum([any(review_list) for review_list in place_reviews.values])

(4355, 4355)

In [19]:
# 0 places without any reviews our ot 55217
' '.join(place_reviews[0])

"Let's start by stating that even in 2018, you will need a good computer to run this game and make it beautiful.\nIf you are equipped, I do recommend it highly because of the story and immersion.\nThe world is fairly large, the enemies sometimes fearsome and even more than a year after finishing it, I have some temptation to go and visit it again.\nIt is an open world survival game.\nYou will do first person shooting, settlements constructions so it can appeal to many different types of players.\nEven in 2018, it is a great game to start. It looks fantastic!"

In [20]:
place_reviews = place_reviews.apply(lambda x: ' '.join([str(rev) for rev in x]))

In [21]:
place_reviews.head(2)

place
0    Let's start by stating that even in 2018, you ...
1                                       a  ok the best
Name: review, dtype: object

In [22]:
import re

In [23]:
def tok(text):
    reg = re.findall(r"[\w']+|[.,!?;]", text, re.UNICODE)
    return [w.lower() for w in reg]

In [24]:
tok(place_reviews.iloc[0])[:20]

["let's",
 'start',
 'by',
 'stating',
 'that',
 'even',
 'in',
 '2018',
 ',',
 'you',
 'will',
 'need',
 'a',
 'good',
 'computer',
 'to',
 'run',
 'this',
 'game',
 'and']

In [25]:
all_vocab = [word for review in place_reviews.values for word in tok(review)]

In [26]:
len(all_vocab)

463215

In [27]:
word_cts = Counter(all_vocab)
len(word_cts)

15997

In [28]:
del all_vocab

In [29]:
common = {k:v for k, v in word_cts.items() if v >= 5}
vocab2idx = {'<PAD>': 0, 'UNK': 1}

for i, (word, ct) in enumerate(common.items()):
    if ct >= 5:
        vocab2idx[word] = i + 2

In [30]:
del word_cts, common

In [31]:
len(vocab2idx)

4788

## Review to Vector

In [32]:
place_reviews_tok = [tok(review) for review in place_reviews.values]

In [33]:
lens = [len(rev) for rev in place_reviews_tok]
max(lens)

4588

In [34]:
del lens

In [35]:
def encode_review(review, N=6412):
    enc1 = np.array([vocab2idx.get(w, vocab2idx["UNK"]) for w in review])
    enc = np.zeros(N, dtype=np.int32)
    l = min(N, len(review))
    enc[:l] = enc1[:l]
    if l == 0:
        l=1
    return enc, l

In [36]:
encode_review(place_reviews_tok[0])

(array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 116)

## Dataset and Data Loader

In [37]:
class AmazonDataset(Dataset):
    def __init__(self, X, y, reviews):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        self.reviews = reviews
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        user = self.user[idx]
        place = self.place[idx]
        review_vec, l = encode_review(self.reviews[place])
        return user, place, review_vec, l, self.y[idx]

In [38]:
train_ds = AmazonDataset(train_enc[['place', 'user']], train_enc.rating.values, place_reviews_tok)
valid_ds = AmazonDataset(val_enc[['place', 'user']], val_enc.rating.values, place_reviews_tok)

In [39]:
next(iter(train_ds))

(0, 0, array([2, 3, 4, ..., 0, 0, 0], dtype=int32), 116, 5.0)

## Load Pre-Trained Glove

In [40]:
def loadGloveModel(gloveFile='data/glove.6B.50d.txt'):
    """ Loads word vectors into a dictionary."""
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove*.zip
    ! mv gl* data

    f = open(gloveFile,'r')
    words = []
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        words.append(word)
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [41]:
#glove = loadGloveModel()

In [42]:
def create_embedding_matrix(word_vecs, vocab2index, emb_size=50):
    """Creates embedding matrix from word vectors. """ 
    V = len(vocab2index.keys())
    W = np.random.uniform(-0.25,0.25, (V, emb_size)) # initialize random
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    for word, index in vocab2index.items():
        if word in word_vecs:
            W[index] = word_vecs[word]
    return W

In [43]:
#max([v for k, v in vocab2idx.items()])

In [44]:
#W = create_embedding_matrix(glove, vocab2idx)

In [45]:
#del glove

## Model

In [46]:
class Context_MF_bias(nn.Module):
    def __init__(self, num_users, num_places, vocab_size,  emb_size=50, glove_weights=None, bias=True):
        super(Context_MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
        
        self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
            self.embeddings.weight.requires_grad = False ## freeze embeddings
        
        # input dim and hidden dim must be the same to later combine item vec and review vec
        self.gru = nn.GRU(emb_size, emb_size, batch_first=True) 
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v, v_review, l):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        
        emb = self.embeddings(v_review)
        emb = self.dropout(emb)
        pack1 = pack_padded_sequence(emb, l, batch_first=True, enforce_sorted=False)
        _, h1 = self.gru(pack1)
        
        V_rev = h1[-1]
        
        V_ = torch.stack([V, V_rev], dim=1) # combines item vector with review vector
        V_ = torch.mean(V_, dim=1)   # still dim 50, avg at each dim from item/review vec
        
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V_).sum(1) +  b_u  + b_v
        
        return (U*V_).sum(1)

## Training Functions

In [47]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [48]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [49]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, v_rev, l, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            review_vec = v_rev.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places, review_vec, l)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        #if epoch % 10 == 0: 
        print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [50]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, v_rev, l, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        review_vec = v_rev.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places, review_vec, l)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [51]:
len(train_enc)

9451

In [52]:
batch_size = 1_500
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [53]:
num_places, num_users, V = len(train_enc.place.unique()), len(train_enc.user.unique()), len(vocab2idx.keys())
num_places, num_users, V

(4355, 4185, 4788)

In [54]:
torch.cuda.empty_cache()

In [55]:
model = Context_MF_bias(num_users, num_places, V, emb_size=50, bias=False).cuda()

In [56]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [57]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=20)

train loss 17.795 valid loss 14.188
train loss 12.570 valid loss 9.954
train loss 9.221 valid loss 7.234
train loss 6.891 valid loss 5.392
train loss 5.182 valid loss 4.099
train loss 3.899 valid loss 3.176
train loss 2.958 valid loss 2.527
train loss 2.281 valid loss 2.086
train loss 1.797 valid loss 1.792
train loss 1.428 valid loss 1.598
train loss 1.168 valid loss 1.479
train loss 0.984 valid loss 1.388
train loss 0.870 valid loss 1.361
train loss 0.775 valid loss 1.315
train loss 0.722 valid loss 1.304
train loss 0.674 valid loss 1.282
train loss 0.630 valid loss 1.270
train loss 0.614 valid loss 1.285
train loss 0.596 valid loss 1.278
train loss 0.580 valid loss 1.301


In [58]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.001, epochs=10)

train loss 0.538 valid loss 1.295
train loss 0.531 valid loss 1.290
train loss 0.533 valid loss 1.290
train loss 0.532 valid loss 1.285
train loss 0.522 valid loss 1.283
train loss 0.517 valid loss 1.281
train loss 0.519 valid loss 1.280
train loss 0.513 valid loss 1.279
train loss 0.512 valid loss 1.276
train loss 0.514 valid loss 1.275
