In [1]:
from pathlib import Path
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
import gzip
import json
import io

def readGz(fname):
    gz = gzip.open(fname, 'rb')
    f = io.BufferedReader(gz)
    data = []
        
    for l in f.readlines():
        data.append(eval(l))
        
    gz.close()
    return data

In [3]:
data = readGz('data/NY_5.json.gz')

In [4]:
data = pd.DataFrame(data)

In [5]:
data.shape

(121878, 8)

In [6]:
# only use user, item, & ratings
df = data[['gPlusPlaceId', 'gPlusUserId', 'rating']]
df.columns = ['place', 'user', 'rating']

In [7]:
del data

In [8]:
df.head()

Unnamed: 0,place,user,rating
0,103654778391814923896,100000524810171549476,5.0
1,105947477166033397439,100000524810171549476,5.0
2,107098981103934500500,100000524810171549476,5.0
3,108585910849109169666,100000524810171549476,5.0
4,115453471525181677863,100000524810171549476,5.0


## Split Train/Validation

In [9]:
train, val = train_test_split(df, test_size=0.2)
train.shape, val.shape

((97502, 3), (24376, 3))

## Encode Places and Users

In [10]:
def encode_cols(train, val, cols):
    for col in cols:
        uniq = train[col].unique()
        col2idx = {val: i for i, val in enumerate(uniq)}
        train[col] = train[col].apply(lambda x: col2idx.get(x, -1))
        val[col] = val[col].apply(lambda x: col2idx.get(x, -1))
        val = val.loc[val[col] >= 0] 
    return train, val

In [11]:
train_enc, val_enc = encode_cols(train, val, ['place', 'user'])

In [12]:
len(train_enc.user.unique()), len(train_enc.place.unique())

(6104, 7233)

## Dataset and Data Loader

In [13]:
class GoogleDataset(Dataset):
    def __init__(self, X, y):
        self.user = X.user.values
        self.place = X.place.values
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.user[idx], self.place[idx], self.y[idx]

In [14]:
train_ds = GoogleDataset(train_enc[['place', 'user']], train_enc.rating.values)
valid_ds = GoogleDataset(val_enc[['place', 'user']], val_enc.rating.values)

In [15]:
batch_size = 5_000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

## Model

In [16]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_places, emb_size=100, bias=True):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.place_emb = nn.Embedding(num_places, emb_size)
        
        self.user_emb.weight.data.uniform_(0,0.05)
        self.place_emb.weight.data.uniform_(0,0.05)
        
        self.bias = bias
        if self.bias:
            self.user_bias = nn.Embedding(num_users, 1)
            self.place_bias = nn.Embedding(num_places, 1)
            
            self.user_bias.weight.data.uniform_(-0.01,0.01)
            self.place_emb.weight.data.uniform_(-0.01,0.01)
            
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.place_emb(v)
        U, V = self.dropout(U), self.dropout(V)
        if self.bias:
            b_u = self.user_bias(u).squeeze()
            b_v = self.place_bias(v).squeeze()
            return (U*V).sum(1) +  b_u  + b_v
        return (U*V).sum(1)

## Training Functions

In [17]:
def cosine_segment(start_lr, end_lr, iterations):
    i = np.arange(iterations)
    c_i = 1 + np.cos(i*np.pi/iterations)
    return end_lr + (start_lr - end_lr)/2 *c_i

def get_cosine_triangular_lr(max_lr, iterations, div_start=5, div_end=5):
    min_start, min_end = max_lr/div_start, max_lr/div_end
    iter1 = int(0.3*iterations)
    iter2 = iterations - iter1
    segs = [cosine_segment(min_start, max_lr, iter1), cosine_segment(max_lr, min_end, iter2)]
    return np.concatenate(segs)

In [18]:
def set_learning_rate(optimizer, lr):
    """Changing learning rates without creating a new optimizer"""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [19]:
def train_epocs(model, optimzer, train_dl, valid_dl, epochs=10, max_lr=0.01):
    iterations = epochs*len(train_dl)
    lrs = get_cosine_triangular_lr(max_lr, iterations)
    vals = []
    for epoch in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        i = 0
        for u, v, y in train_dl:
            set_learning_rate(optimizer, lrs[i])
            users = u.long().cuda()
            places = v.long().cuda()
            ratings = y.float().cuda()
            y_hat = model(users, places)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            i += 1
        val_loss = val_metrics(model, valid_dl)
        vals.append(val_loss)
        if epoch % 10 == 0: 
            print(f"train loss {sum_loss/total:.3f} valid loss {val_loss:.3f}") #valid accuracy {testaccur:.5f}

In [20]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for u, v, y in valid_dl:
        users = u.long().cuda()
        places = v.long().cuda()
        ratings = y.float().cuda()
        y_hat = model(users, places)
        loss = F.mse_loss(y_hat, ratings)
        #y_pred = y_hat > 0
        #correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total #, correct/total

## Train

In [21]:
num_places, num_users = len(train_enc.place.unique()), len(train_enc.user.unique())
num_places, num_users

(7233, 6104)

In [30]:
model = MF_bias(num_users, num_places, emb_size=100, bias=False).cuda()

In [31]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [32]:
train_epocs(model, optimizer, train_dl, valid_dl, max_lr=0.005, epochs=250)

train loss 16.079 valid loss 15.810
train loss 3.595 valid loss 3.503
train loss 1.039 valid loss 1.123
train loss 0.700 valid loss 0.735
train loss 0.634 valid loss 0.644
train loss 0.619 valid loss 0.621
train loss 0.618 valid loss 0.612
train loss 0.607 valid loss 0.614
train loss 0.605 valid loss 0.611
train loss 0.599 valid loss 0.611
train loss 0.594 valid loss 0.612
train loss 0.585 valid loss 0.612
train loss 0.578 valid loss 0.613
train loss 0.568 valid loss 0.613
train loss 0.551 valid loss 0.613
train loss 0.536 valid loss 0.614
train loss 0.519 valid loss 0.614
train loss 0.497 valid loss 0.615
train loss 0.471 valid loss 0.618
train loss 0.449 valid loss 0.619
train loss 0.433 valid loss 0.623
train loss 0.415 valid loss 0.626
train loss 0.401 valid loss 0.628
train loss 0.389 valid loss 0.633
train loss 0.381 valid loss 0.635
