In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from collections import Counter

In [4]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [8]:
data = getDF('data/Video_Games_5.json.gz')

In [9]:
data.shape

(497577, 12)

In [11]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [12]:
# only use user, item, & ratings
df = data[['reviewerID', 'asin', 'overall']]
df.columns = ['user', 'item', 'rating']

In [15]:
del data

In [16]:
df.head()

Unnamed: 0,user,item,rating
0,A1HP7NVNPFMA4N,700026657,5.0
1,A1JGAP0185YJI6,700026657,4.0
2,A1YJWEXHQBWK2B,700026657,3.0
3,A2204E1TH211HT,700026657,2.0
4,A2RF5B5H74JLPE,700026657,5.0


## Split Train/Validation

In [17]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((398061, 3), (99516, 3))

## Encode Places and Users

In [18]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [19]:
train_enc, val_enc = encode_cols(train, val, ['user', 'item'])

In [20]:
len(train_enc.user.unique()), len(train_enc.item.unique())

(55214, 17405)

In [27]:
train_enc.head()

Unnamed: 0,user,item,rating
434442,0,0,4.0
206749,1,1,5.0
293151,2,2,5.0
470239,3,3,5.0
365848,4,4,5.0


## Dataset and Data Loader

In [28]:
class AmazonDataset(Dataset):
    def __init__(self, X, y):
        self.user = X.user.values
        self.item = X.item.values
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.y[idx]

In [29]:
train_ds = AmazonDataset(train_enc[['item', 'user']], train_enc.rating.values)
valid_ds = AmazonDataset(val_enc[['item', 'user']], val_enc.rating.values)

In [30]:
batch_size = 5_000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [31]:
next(iter(train_dl))

[tensor([20955, 53604,  1001,  ...,  9906, 52855,  7519]),
 tensor([  431,  5406, 14747,  ...,  7643,   296,  2165]),
 tensor([5., 4., 5.,  ..., 3., 5., 1.])]

## Model

In [32]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_places, emb_size=100, bias=True):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
            
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        return (U*V).sum(1)

## Training Functions

In [33]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [34]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [35]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        if epoch % 10 == 0: 
            print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [36]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [37]:
num_items, num_users = len(train_enc.item.unique()), len(train_enc.user.unique())
num_items, num_users

(17405, 55214)

In [48]:
model = MF_bias(num_users, num_items, emb_size=100, bias=False).cuda()

In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [50]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.001, epochs=250)

train loss 18.673 valid loss 18.598
train loss 13.545 valid loss 13.501
train loss 7.066 valid loss 7.423
train loss 3.797 valid loss 4.284
train loss 2.403 valid loss 2.867
train loss 1.757 valid loss 2.169
train loss 1.439 valid loss 1.796
train loss 1.266 valid loss 1.584
train loss 1.170 valid loss 1.459
train loss 1.113 valid loss 1.382
train loss 1.081 valid loss 1.333
train loss 1.059 valid loss 1.301
train loss 1.046 valid loss 1.280
train loss 1.035 valid loss 1.265
train loss 1.026 valid loss 1.255
train loss 1.021 valid loss 1.248
train loss 1.017 valid loss 1.242
train loss 1.013 valid loss 1.239
train loss 1.010 valid loss 1.235
train loss 1.005 valid loss 1.233
train loss 1.002 valid loss 1.231
train loss 0.997 valid loss 1.229
train loss 0.992 valid loss 1.227
train loss 0.987 valid loss 1.226
train loss 0.984 valid loss 1.224
