In [337]:
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from sklearn.manifold import TSNE

import math
from torch.utils.data import Dataset
import itertools
import torch.nn.functional as F
import matplotlib.pyplot as plt


import gzip
import numpy as np
import random
import scipy
from collections import defaultdict
from scipy.spatial import distance
import dateutil
import os
from sklearn.metrics import roc_auc_score
import json, time
import pandas as pd
def parseData(fname):
    """Lazily yield one parsed record per non-blank line of the text file ``fname``.

    Each line is evaluated with ``eval``, so the file is expected to hold one
    Python expression (typically a dict literal) per line.

    SECURITY: ``eval`` executes arbitrary code. Only call this on files you
    trust; for pure literals ``ast.literal_eval`` is the safe alternative.

    Parameters
    ----------
    fname : str or path-like
        Path of the text file to read.

    Yields
    ------
    object
        The value produced by evaluating each non-blank line.
    """
    # The context manager guarantees the handle is released once the generator
    # is exhausted or closed; the bare ``open`` used before leaked it.
    with open(fname) as handle:
        for line in handle:
            # A blank (e.g. trailing) line is not a valid expression; skip it
            # instead of failing with a SyntaxError.
            if not line.strip():
                continue
            yield eval(line)
        
def readDataFull(path):
    """Read a gzip-compressed file holding one Python expression per line.

    Every non-blank line is evaluated with ``eval`` and the results are
    returned in file order.

    SECURITY: ``eval`` executes arbitrary code. Only call this on files you
    trust; for pure literals ``ast.literal_eval`` is the safe alternative.

    Parameters
    ----------
    path : str or path-like
        Path of the ``.gz`` file to read.

    Returns
    -------
    list
        One evaluated object per non-blank line.
    """
    records = []
    # ``with`` closes the gzip handle even if a line fails to parse.
    with gzip.open(path) as handle:
        for line in handle:  # lines are ``bytes``; ``eval`` accepts them
            if not line.strip():
                continue  # tolerate blank / trailing lines
            records.append(eval(line))
    return records

def readData_full(path):
    """Read a JSON-Lines text file (one JSON document per line).

    Parameters
    ----------
    path : str or path-like
        Path of the file to read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # JSON is UTF-8 by specification, so do not depend on the platform's
    # default encoding; ``with`` makes sure the handle is closed.
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            records.append(json.loads(line))
    return records

In [341]:
# Alternative input kept for reference (currently disabled): the "filtered"
# dataset, where each file is a single JSON document loaded with json.load.
# dataFolder = 'data/filtered_data/'
# users_file_name = dataFolder + 'filtered_users.json'
# places_file_name = dataFolder + 'filtered_places.json'
# reviews_file_name = dataFolder + 'filtered_reviews.json'

# data_user  = pd.DataFrame(json.load(open(users_file_name)))
# data_places = pd.DataFrame(json.load(open(places_file_name)))
# data_reviews  = pd.DataFrame(json.load(open(reviews_file_name)))

# Active input: the 'ca_final' dataset. Paths are relative to the notebook's
# working directory.
dataFolder = 'data/ca_final/'
users_file_name = dataFolder + 'ca_users.json'
places_file_name = dataFolder + 'ca_places.json'
reviews_file_name = dataFolder + 'ca_final_reviews.json'

# Each file is parsed line by line (one JSON object per line) by readData_full
# and wrapped in a DataFrame, one row per record.
data_user  = pd.DataFrame(readData_full(users_file_name))
data_places  = pd.DataFrame(readData_full(places_file_name))
data_reviews  = pd.DataFrame(readData_full(reviews_file_name))

### Construct data 

In [342]:
# --- Index users / places and collect every review as an interaction --------
userIDs, itemIDs = {}, {}                  # raw id -> dense integer index
interactions = []                          # (time, user, place, rating) tuples
interactionsPerUser = defaultdict(list)    # user -> [(time, place, rating), ...]
userVisitedPlaces = defaultdict(set)       # user -> {place, ...}
uniquePlaces = set()

for _i, d in data_reviews.iterrows():
    u, i, t, r = d['gPlusUserId'], d['gPlusPlaceId'], d['unixReviewTime'], d['rating']
    uniquePlaces.add(i)
    # setdefault only inserts on first sight, so indices follow order of appearance.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((t, u, i, r))
    interactionsPerUser[u].append((t, i, r))
    userVisitedPlaces[u].add(i)

interactions.sort()
userInteractionAvg = sum(len(history) for history in interactionsPerUser.values()) / len(interactionsPerUser)
print(userInteractionAvg)

# Placeholder "previous place" used for each user's first interaction.
itemIDs['dummy'] = len(itemIDs)


# --- Leave-one-out split: the latest interaction of each user is held out ---
# Every sample is (user, place, previously visited place).
interactionstrain, interactionstest = [], []
for u, list_users in interactionsPerUser.items():
    list_users.sort()                      # chronological order (tuples start with time)
    *history, final = list_users
    lastItem = 'dummy'

    for (t, i, r) in history:
        interactionstrain.append((u, i, lastItem))
        lastItem = i

    (t, i, r) = final
    interactionstest.append((u, i, lastItem))
    lastItem = i

# Places each user interacted with in the training portion only.
itemsPerUser = defaultdict(set)
for u, i, j in interactionstrain:
    itemsPerUser[u].add(i)

17.33063829787234


In [343]:
# Vocabulary sizes (nItems counts every key of itemIDs) and the list of item
# keys that negative sampling draws from.
nUsers = len(userIDs)
nItems = len(itemIDs)
items = list(itemIDs)
print(nUsers, nItems)
print(len(interactionstrain), len(interactionstest), sep='\n')

16450 62003
268639
16450


In [344]:
features_list = ['userId_index', 'placeId_index', 'lastplaceId_index']

# Vocabulary size of each feature field, in the order the fields are encoded.
features_sizes = dict(
    userId_index=nUsers,
    placeId_index=nItems,
    lastplaceId_index=nItems,
)

# features_offsets[p] is the first row of the shared embedding table that
# belongs to the p-th field: the running total of the preceding field sizes.
features_offsets = {}
next_offset = 0
for index, field_size in enumerate(features_sizes.values()):
    features_offsets[index] = next_offset
    next_offset += field_size
index = len(features_offsets)  # number of fields processed

In [345]:
x_train, x_train_neg = [], []

def feat(x):
    """Shift each per-field index in ``x`` by that field's offset.

    The p-th entry of ``x`` is increased by ``features_offsets[p]`` and the
    shifted values are returned as a list in the same order.
    """
    return [value + features_offsets[position] for position, value in enumerate(x)]

# Encode every training interaction as a positive row and pair it with one
# sampled negative row (same user and previous place, different place).
for (u,i,j) in interactionstrain:
    # User and previous-place indices are shared by both rows; look them up once.
    uindex = userIDs[u]
    iindex = itemIDs[i]
    jindex = itemIDs[j]
    x_train.append(feat((uindex, iindex, jindex)))

    # Negative sample: a place absent from the user's training history.
    # `items` also holds the 'dummy' placeholder key, which is not a real
    # place, so it is rejected as a candidate too.
    k = random.choice(items)
    while k == 'dummy' or k in itemsPerUser[u]:
        k = random.choice(items)
    kindex = itemIDs[k]
    x_train_neg.append(feat((uindex, kindex, jindex)))


In [346]:
# Stack the offset-encoded samples into tensors. x_train and x_train_neg are
# appended to in lockstep, so row r of data_x_neg is the negative counterpart
# of row r of data_x.
data_x = torch.tensor(x_train)
data_x_neg = torch.tensor(x_train_neg)
# Paired dataset: indexing it yields a (positive_row, negative_row) tuple.
dataset = data.TensorDataset(data_x,data_x_neg)

In [363]:
bs = 50000  # batch size shared by both loaders

# 80 / 20 random split of the paired dataset into training and validation parts.
train_n = int(len(dataset) * 0.8)
valid_n = len(dataset) - train_n
splits = [train_n, valid_n]
assert sum(splits) == len(dataset)
trainset, devset = data.random_split(dataset, splits)

# Both loaders reshuffle their samples on every pass.
train_dataloader, dev_dataloader = (
    data.DataLoader(part, batch_size=bs, shuffle=True) for part in (trainset, devset)
)

In [364]:
# copied from fastai: 
def trunc_normal_(x, mean=0., std=1.):
    """Fill ``x`` in place with an approximately truncated normal and return it.

    Standard-normal draws are folded with ``fmod(2)`` so every magnitude stays
    below 2, then scaled by ``std`` and shifted by ``mean``.
    """
    # Approach from https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    x.normal_()
    x.fmod_(2)
    x.mul_(std)
    x.add_(mean)
    return x

In [365]:
class FMModel(nn.Module):
    """Second-order factorization machine trained with a pairwise ranking loss.

    All feature fields share one embedding table; each input row holds one
    (already offset) index per field. The score of a row is

        w0 + sum_f bias[x_f] + sum_{f<g} <emb[x_f], emb[x_g]>

    Parameters
    ----------
    n : int (or 0-dim integer tensor)
        Number of rows in the shared bias / embedding tables.
    k : int
        Embedding dimension.
    """

    def __init__(self, n, k):
        super().__init__()
        # Callers may pass a 0-dim tensor (e.g. ``data_x.max()+1``); store a
        # plain int as the table size.
        n = int(n)

        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        with torch.no_grad():
            trunc_normal_(self.embeddings.weight, std=0.01)
            trunc_normal_(self.bias.weight, std=0.01)

    def _score(self, X):
        """Return the FM score of every row of the 2-D index tensor ``X``.

        ``X`` has shape (batch, fields); the result has shape (batch,).
        """
        emb = self.embeddings(X)                      # (batch, fields, k)
        # Pairwise interactions via 0.5 * ((sum v)^2 - sum v^2), summed over k.
        pow_of_sum = emb.sum(dim=1).pow(2)
        sum_of_pow = emb.pow(2).sum(dim=1)
        pairwise = (pow_of_sum - sum_of_pow).sum(1) * 0.5
        # Drop only the trailing singleton dimension. A bare ``squeeze()`` also
        # removed the batch dimension when batch == 1 and then failed in
        # ``sum(1)``; skipping the squeeze left a (batch, 1) bias that broadcast
        # against the (batch,) pairwise term into a (batch, batch) matrix.
        bias = self.bias(X).squeeze(-1).sum(1)
        return self.w0 + bias + pairwise

    def forward(self, X_pos, X_neg):
        """Pairwise ranking loss: mean of ``-log(sigmoid(score_pos - score_neg))``.

        ``X_pos`` and ``X_neg`` are index tensors of identical shape
        (batch, fields); row r of ``X_neg`` is the negative paired with row r of
        ``X_pos``. Returns a 0-dim loss tensor.
        """
        margin = self._score(X_pos) - self._score(X_neg)
        # logsigmoid stays finite for large negative margins, where
        # log(sigmoid(x)) underflows to log(0) = -inf.
        return -torch.mean(F.logsigmoid(margin))

    def predict_1(self, X):
        """Score the rows of ``X``; returns a tensor of shape (batch,)."""
        return self._score(X)

    def predict_2(self, X):
        """Score the rows of ``X``; identical to ``predict_1``."""
        return self._score(X)

In [366]:
# fit/test functions
def fit(iterator, model, optimizer, criterion):
    """Run one training pass over ``iterator`` and return the mean loss per sample.

    The model computes its own loss from each (positive, negative) batch;
    ``criterion`` is accepted but not used. Each batch loss is weighted by the
    batch's row count and the total is divided by ``len(iterator.dataset)``.
    """
    model.train()
    weighted_total = 0
    for x_pos, x_neg in iterator:
        optimizer.zero_grad()
        batch_loss = model(x_pos, x_neg)
        batch_loss.backward()
        optimizer.step()
        n_rows = x_pos.shape[0]
        weighted_total += batch_loss.item() * n_rows
    n_samples = len(iterator.dataset)
    return weighted_total / n_samples

def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x_pos,x_neg in iterator:                    
        with torch.no_grad():
            loss = model(x_pos, x_neg)
        train_loss += loss.item()*x_pos.shape[0]
    return train_loss / len(iterator.dataset)

In [371]:
# Size the shared embedding table from BOTH sample tensors: the model looks up
# indices from data_x_neg as well, so sizing from data_x alone is only correct
# when its largest index happens to be the global maximum. Pass a plain int
# rather than a 0-dim tensor.
n_features = int(max(data_x.max(), data_x_neg.max())) + 1
model = FMModel(n_features, 5)
wd=1e-5
lr=0.1
epochs=10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
# NOTE: the single milestone (20) lies beyond `epochs` (10), so the learning
# rate is never decayed during this run.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20], gamma=0.1)
# Passed to fit/test to satisfy their signatures; the model computes its own
# loss and neither function uses it.
criterion = nn.MSELoss()
for epoch in range(epochs):
    start_time = time.time()
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    secs = int(time.time() - start_time)
    print(f'epoch {epoch}. time: {secs}[s]')
    print(f'\ttrain loss: {((train_loss)):.4f}')
    print(f'\tvalidation loss: {((valid_loss)):.4f}')

epoch 0. time: 6[s]
	train loss: 0.6583
	validation loss: 0.5939
epoch 1. time: 5[s]
	train loss: 0.4992
	validation loss: 0.5120
epoch 2. time: 1[s]
	train loss: 0.3725
	validation loss: 0.4674
epoch 3. time: 6[s]
	train loss: 0.2907
	validation loss: 0.4516
epoch 4. time: 6[s]
	train loss: 0.2460
	validation loss: 0.4488
epoch 5. time: 1[s]
	train loss: 0.2306
	validation loss: 0.4494
epoch 6. time: 6[s]
	train loss: 0.2283
	validation loss: 0.4472
epoch 7. time: 6[s]
	train loss: 0.2270
	validation loss: 0.4438
epoch 8. time: 5[s]
	train loss: 0.2244
	validation loss: 0.4433
epoch 9. time: 1[s]
	train loss: 0.2230
	validation loss: 0.4442


In [372]:
# Held-out samples grouped by user, plus the pool of every item key that
# appears in a test sample (as target or as previous place).
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for u, i, j in interactionstest:
    interactionsTestPerUser[u].add((i, j))
    itemSet.update((i, j))
    
def AUCu(model, u, N):
    """Estimate the AUC of ``model`` for user ``u`` from ``N`` sampled negatives.

    One held-out (place, previous place) pair of the user is drawn and its
    score is compared with the scores of ``N`` places drawn without
    replacement from ``itemSet`` (same user, same previous place).

    Returns the fraction of negatives scored strictly below the positive.

    NOTE(review): negatives are drawn from the whole ``itemSet``, so a draw can
    coincide with the positive place itself; such a draw counts as a loss.
    """
    # random.sample no longer accepts sets (TypeError on Python 3.11+);
    # materialise the populations as tuples first.
    i, j = random.sample(tuple(interactionsTestPerUser[u]), 1)[0]
    negative = random.sample(tuple(itemSet), N)

    uindex, jindex = userIDs[u], itemIDs[j]
    win = 0
    with torch.no_grad():  # evaluation only: no autograd graph needed
        # The positive row is the same for every comparison; score it once.
        p1 = torch.LongTensor([feat((uindex, itemIDs[i], jindex))])
        sp = model.predict_1(p1).item()
        for k in negative:
            n1 = torch.LongTensor([feat((uindex, itemIDs[k], jindex))])
            sn = model.predict_1(n1).item()
            if sp > sn:
                win += 1
    return win/N

def AUC(model):
    """Average the per-user AUC estimate (10 negatives each) over all test users.

    Prints the running user count every 5000 users as a progress indicator.
    """
    per_user = []
    for cnt, u in enumerate(interactionsTestPerUser):
        if cnt % 5000 == 0:
            print(cnt)
        per_user.append(AUCu(model, u, 10))
    return sum(per_user) / len(per_user)
AUC(model)

0
5000
10000
15000


0.627057750759867