In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
data = getDF('data/Grocery_and_Gourmet_Food_5.json.gz')

In [4]:
data.shape

(1143860, 12)

In [5]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,
2,5.0,True,"11 21, 2015",A32RD6L701BIGP,4639725183,Krystal Clifton,"If you like strong tea, this is for you. It mi...",Strong,1448064000,,,
3,5.0,True,"08 12, 2015",A2UY1O1FBGKIE6,4639725183,U. Kane,Love the tea. The flavor is way better than th...,Great tea,1439337600,,,
4,5.0,True,"05 28, 2015",A3QHVBQYDV7Z6U,4639725183,The Nana,I have searched everywhere until I browsed Ama...,This is the tea I remembered!,1432771200,,,


In [6]:
# only use user, item, & ratings
df = data[['reviewerID', 'asin', 'overall']]
df.columns = ['user', 'item', 'rating']

In [7]:
del data

In [8]:
df.head()

Unnamed: 0,user,item,rating
0,A1QVBUH9E1V6I8,4639725183,5.0
1,A3GEOILWLK86XM,4639725183,5.0
2,A32RD6L701BIGP,4639725183,5.0
3,A2UY1O1FBGKIE6,4639725183,5.0
4,A3QHVBQYDV7Z6U,4639725183,5.0


## Split Train/Validation

In [9]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((915088, 3), (228772, 3))

## Encode Places and Users

In [10]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [11]:
train_enc, val_enc = encode_cols(train, val, ['user', 'item'])

In [12]:
len(train_enc.user.unique()), len(train_enc.item.unique())

(127486, 41311)

In [13]:
train_enc.head()

Unnamed: 0,user,item,rating
147517,0,0,5.0
1131925,1,1,5.0
136067,2,2,4.0
196440,3,3,5.0
277822,4,4,5.0


## Dataset and Data Loader

In [14]:
class AmazonDataset(Dataset):
    def __init__(self, X, y):
        self.user = X.user.values
        self.item = X.item.values
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.y[idx]

In [15]:
train_ds = AmazonDataset(train_enc[['item', 'user']], train_enc.rating.values)
valid_ds = AmazonDataset(val_enc[['item', 'user']], val_enc.rating.values)

In [16]:
batch_size = 5_000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [17]:
next(iter(train_dl))

[tensor([ 74101,  45573,  56692,  ...,  11999,  48000, 115923]),
 tensor([ 2319, 10795,  8336,  ..., 19339,  9793,  4318]),
 tensor([5., 5., 5.,  ..., 3., 3., 4.])]

## Model

In [18]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_places, emb_size=100, bias=True):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
            
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        return (U*V).sum(1)

## Training Functions

In [19]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [20]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [21]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        if epoch % 10 == 0: 
            print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [22]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [23]:
num_items, num_users = len(train_enc.item.unique()), len(train_enc.user.unique())
num_items, num_users

(41311, 127486)

In [37]:
model = MF_bias(num_users, num_items, emb_size=100, bias=False).cuda()

In [38]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [39]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.005, epochs=50)

train loss 20.027 valid loss 18.764
train loss 1.798 valid loss 2.112
train loss 1.330 valid loss 1.590
train loss 1.274 valid loss 1.543
train loss 1.227 valid loss 1.541


In [40]:
val_metrics(model, valid_dl)

1.5513048584998246