In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
def unpack_dataset():
    ! wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
    ! mkdir data
    ! mv Vid* data

In [3]:
unpack_dataset()

--2020-06-24 20:33:28--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 154050105 (147M) [application/octet-stream]
Saving to: ‘Video_Games_5.json.gz’


2020-06-24 20:33:40 (13.4 MB/s) - ‘Video_Games_5.json.gz’ saved [154050105/154050105]



In [4]:
data = getDF('data/Video_Games_5.json.gz')

In [5]:
data.shape

(497577, 12)

In [6]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [7]:
data = data.loc[data.reviewTime.str.contains('2018')]

In [8]:
data.shape

(11814, 12)

In [9]:
# only use user, item, & ratings
df = data[['reviewerID', 'asin', 'overall']]
df.columns = ['user', 'item', 'rating']

In [10]:
del data

In [11]:
df.head()

Unnamed: 0,user,item,rating
98,AL529YNQ4YBFE,6050036071,5.0
2599,AHGW67EQ751LS,B00000J2W7,3.0
2704,AYXTNDD9FJBVK,B00000J97G,3.0
3104,A3ISBEPYLY8IMO,B00000JRSB,5.0
6683,A30T51SMB0UQR1,B00002DHEV,5.0


## Split Train/Validation

In [12]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((9451, 3), (2363, 3))

## Encode Places and Users

In [13]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [14]:
train_enc, val_enc = encode_cols(train, val, ['user', 'item'])

In [15]:
len(train_enc.user.unique()), len(train_enc.item.unique())

(4345, 4158)

In [16]:
train_enc.head()

Unnamed: 0,user,item,rating
471553,0,0,5.0
479288,1,1,5.0
405244,2,2,4.0
474350,3,3,3.0
267922,4,4,4.0


## Dataset and Data Loader

In [17]:
class AmazonDataset(Dataset):
    def __init__(self, X, y):
        self.user = X.user.values
        self.item = X.item.values
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.y[idx]

In [18]:
train_ds = AmazonDataset(train_enc[['item', 'user']], train_enc.rating.values)
valid_ds = AmazonDataset(val_enc[['item', 'user']], val_enc.rating.values)

In [19]:
batch_size = 5_000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [20]:
next(iter(train_dl))

[tensor([ 584, 1843, 3440,  ..., 1049, 2927,  177]),
 tensor([2891, 1640,   20,  ...,  932, 1523,  161]),
 tensor([3., 5., 5.,  ..., 5., 5., 5.], dtype=torch.float64)]

## Model

In [21]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_places, emb_size=100, bias=True):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
            
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        return (U*V).sum(1)

## Training Functions

In [22]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [23]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [24]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        if epoch % 10 == 0: 
            print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [25]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [26]:
num_items, num_users = len(train_enc.item.unique()), len(train_enc.user.unique())
num_items, num_users

(4158, 4345)

In [27]:
model = MF_bias(num_users, num_items, emb_size=100, bias=False).cuda()

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [29]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.01, epochs=250)

train loss 19.585 valid loss 19.963
train loss 17.873 valid loss 18.039
train loss 14.587 valid loss 14.466
train loss 10.110 valid loss 9.768
train loss 5.756 valid loss 5.426
train loss 2.776 valid loss 2.778
train loss 1.352 valid loss 1.746
train loss 0.820 valid loss 1.446
train loss 0.589 valid loss 1.353
train loss 0.497 valid loss 1.317
train loss 0.462 valid loss 1.303
train loss 0.426 valid loss 1.302
train loss 0.418 valid loss 1.300
train loss 0.408 valid loss 1.308
train loss 0.392 valid loss 1.312
train loss 0.385 valid loss 1.324
train loss 0.371 valid loss 1.338
train loss 0.365 valid loss 1.357
train loss 0.357 valid loss 1.372
train loss 0.362 valid loss 1.383
train loss 0.354 valid loss 1.392
train loss 0.350 valid loss 1.397
train loss 0.337 valid loss 1.410
train loss 0.339 valid loss 1.426
train loss 0.333 valid loss 1.440
