In [1]:
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from sklearn.manifold import TSNE

import math
from torch.utils.data import Dataset
import itertools
import torch.nn.functional as F
import matplotlib.pyplot as plt


import gzip
import numpy as np
import random
import scipy
from collections import defaultdict
from scipy.spatial import distance
import dateutil
import os
from sklearn.metrics import roc_auc_score
import json, time
import pandas as pd
def parseData(fname):
    """Lazily yield one parsed record per non-blank line of a text file.

    Parameters
    ----------
    fname : str or path-like
        Path of a text file whose lines are Python literal expressions
        (typically one dict per line).

    Yields
    ------
    object
        The value obtained by evaluating each line.

    Notes
    -----
    SECURITY: ``eval`` executes arbitrary code contained in the file. Only use
    this on trusted data; for untrusted input prefer ``ast.literal_eval`` or a
    JSON parser.
    """
    # The context manager closes the handle once the generator is exhausted or
    # garbage-collected, instead of leaking it.
    with open(fname) as fh:
        for l in fh:
            if not l.strip():
                continue  # a blank line would make eval() raise SyntaxError
            yield eval(l)
        
def readDataFull(path):
    """Read a gzip-compressed file of Python-literal records into a list.

    Parameters
    ----------
    path : str or path-like
        Path of a gzip file containing one Python literal expression per line.

    Returns
    -------
    list
        The evaluated records, in file order. Blank lines are skipped.

    Notes
    -----
    SECURITY: ``eval`` executes arbitrary code contained in the file. Only use
    this on trusted data.
    """
    data = []
    # `with` guarantees the gzip handle is closed even if a line fails to parse.
    with gzip.open(path) as fh:
        for line in fh:  # lines are bytes because the file is opened in binary mode
            if not line.strip():
                continue  # a blank line would make eval() raise SyntaxError
            data.append(eval(line))
    return data

def readData_full(path):
    """Read a JSON-lines file (one JSON document per line) into a list.

    Parameters
    ----------
    path : str or path-like
        Path of the JSON-lines file.

    Returns
    -------
    list
        The decoded records, in file order. Blank lines are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8; being explicit avoids depending on the platform
    # default encoding. `with` closes the handle deterministically.
    with open(path, encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                continue  # json.loads('') would raise on blank/trailing lines
            data.append(json.loads(line))
    return data

In [2]:
# Locations of the three JSON-lines inputs (users, places, reviews).
dataFolder = 'data/ca_final/'
users_file_name = f'{dataFolder}ca_users.json'
places_file_name = f'{dataFolder}ca_places.json'
reviews_file_name = f'{dataFolder}ca_final_reviews.json'

# Each file is parsed into a list of dicts and wrapped in a DataFrame.
data_user = pd.DataFrame(readData_full(users_file_name))
data_places = pd.DataFrame(readData_full(places_file_name))
data_reviews = pd.DataFrame(readData_full(reviews_file_name))

### Construct data 

In [3]:
import re
# Per-place lookups, filled from data_places further down in this cell.
places_price_dict = {}   # gPlusPlaceId -> str(price)
places_hours_dict = {}   # gPlusPlaceId -> cleaned, space-joined hours string

# Per-user lookups, filled from data_user further down in this cell.
user_education = {}      # gPlusUserId -> 0/1 flag returned by checkeducation
user_jobs = {}           # gPlusUserId -> 1 if 'jobs' is not None, else 0
review_years = {}        # NOTE(review): never populated in the visible notebook

cnt = 0  # NOTE(review): not read anywhere in this cell

# Keywords whose presence in an education string marks it as higher education.
_uni = ['university','college', 'school','institute', 'institution','academy','polytechnic','varsity', 'academy']
def checkeducation(ed):
    """Return 1 if `ed` contains any keyword from `_uni` (case-insensitive), else 0.

    Matching is by plain substring, so the keyword may appear anywhere in the
    text.
    """
    lowered = ed.lower()
    return 1 if any(keyword in lowered for keyword in _uni) else 0

# User-level features: an education flag and a "has jobs entry" flag.
for _i, d in data_user.iterrows():
    user_id = d['gPlusUserId']
    user_education[user_id] = checkeducation(str(d['education']))
    # NOTE(review): only None counts as "no job"; confirm missing values are
    # not stored as NaN in this column.
    user_jobs[user_id] = 0 if d['jobs'] is None else 1

# Place-level features: price as a string, hours as a normalised token string.
for _i, d in data_places.iterrows():
    place_id = d['gPlusPlaceId']
    places_price_dict[place_id] = str(d['price'])
    # Strip punctuation, then drop single-character tokens.
    hours_text = re.sub(r'[^\w\s]','',str(d['hours']))
    kept_tokens = [token for token in hours_text.split() if len(token)>1]
    places_hours_dict[place_id] = ' '.join(kept_tokens)


In [4]:
# Raw value -> dense integer id, one table per feature. Ids are assigned in
# first-seen order by the review loop below.
userIDs = {}
itemIDs = {}
categoryIDs = {}
yearIDs = {}
priceIDs = {}
hourIDs = {}
educationIDs = {}
jobIDs = {}


# Every review as a tuple (t, u, i, r, cat, year, price, hours, ed, job).
interactions = []
# user id -> list of that user's reviews (same tuple without the user id).
interactionsPerUser = defaultdict(list)
# user id -> set of place ids the user reviewed.
userVisitedPlaces = defaultdict(set)
# All place ids that appear in at least one review.
uniquePlaces = set()


cnt = 0
# Walk every review once: derive its side features, register unseen values in
# the id tables, and record the interaction globally and per user.
for _i, d in data_reviews.iterrows():
    u, i = d['gPlusUserId'], d['gPlusPlaceId']
    t, r = d['unixReviewTime'], d['rating']

    _cat = str(d['categories'])
    # Reviews without a textual timestamp fall into the 'dummy' year bucket.
    _year = 'dummy' if d['reviewTime'] is None else dateutil.parser.parse(d['reviewTime']).year

    _price, _hours = places_price_dict[i], places_hours_dict[i]
    # Users missing from the user table get the 'dummy' placeholder.
    _ed = user_education.get(u, 'dummy')
    _job = user_jobs.get(u, 'dummy')

    uniquePlaces.add(i)

    # A new value receives the next free id; known values are left untouched.
    for _table, _value in ((userIDs, u), (itemIDs, i),
                           (categoryIDs, _cat), (yearIDs, _year),
                           (priceIDs, _price), (hourIDs, _hours),
                           (educationIDs, _ed), (jobIDs, _job)):
        _table.setdefault(_value, len(_table))

    interactions.append((t,u,i,r, _cat,_year,_price,_hours,_ed,_job))
    interactionsPerUser[u].append((t,i,r,_cat,_year,_price,_hours,_ed,_job))
    userVisitedPlaces[u].add(i)

interactions.sort()
# Mean number of reviews per user.
userInteractionAvg = sum(len(history) for history in interactionsPerUser.values()) / len(interactionsPerUser)
print(userInteractionAvg)

# Reserve an id for the 'dummy' placeholder in every table that uses it.
# The review loop may already have registered 'dummy' (missing reviewTime, or
# a user absent from the user table). Unconditionally assigning len(table) in
# that case would give 'dummy' an index equal to the table size, i.e. one past
# the range reserved for that feature. setdefault() keeps an existing id and
# only allocates a new one when the key is absent.
itemIDs.setdefault('dummy', len(itemIDs))
educationIDs.setdefault('dummy', len(educationIDs))
jobIDs.setdefault('dummy', len(jobIDs))
yearIDs.setdefault('dummy', len(yearIDs))

# Leave-one-out split: for every user the last review (after sorting the
# user's history) goes to the test set and all earlier ones to the training
# set. Each row also carries the previously visited item and the year of that
# previous review ('dummy' for a user's first review).
interactionstrain = []
interactionstest = []
for u, list_users in interactionsPerUser.items():
    list_users.sort()  # in place; tuples start with the unix review time
    lastItem = 'dummy'
    prev_year = 'dummy'
    for (t,i,r, _cat,_year,_price,_hours,_ed,_job) in list_users[:-1]:
        interactionstrain.append((u,i,lastItem, _cat,_year,_price,_hours,_ed,_job, prev_year))
        lastItem = i
        prev_year = _year

    # Held-out review; lastItem/prev_year still describe the review before it.
    (t,i,r, _cat,_year,_price,_hours,_ed,_job) = list_users[-1]
    interactionstest.append((u,i,lastItem,_cat,_year,_price,_hours,_ed,_job, prev_year))

# user id -> set of items the user interacted with in the training split;
# used to reject already-seen items when sampling negatives.
itemsPerUser = defaultdict(set)
for (u,i,lastItem,_cat,_year,_price,_hours,_ed,_job, prev_year) in interactionstrain:
    itemsPerUser[u].add(i)

17.33063829787234


In [5]:
# Cardinality of every id table (these include the 'dummy' entries).
nUsers = len(userIDs)
nItems = len(itemIDs)
nCat = len(categoryIDs)
nYear = len(yearIDs)
nPrice = len(priceIDs)
nHour = len(hourIDs)
nED = len(educationIDs)
nJob = len(jobIDs)

# Pool of item keys used for negative sampling.
items = list(itemIDs)
print(nUsers,nItems,nCat,nYear, nPrice, nHour, nED, nJob)
print(len(interactionstrain))
print(len(interactionstest))

16450 62003 15254 18 5 17605 3 3
268639
16450


In [None]:
# Active feature fields and their cardinalities. The insertion order here is
# the positional order of the tuples passed to feat().
features_sizes = {
    'userId_index': nUsers,
    'placeId_index': nItems,
    'lastplaceId_index': nItems,
    'year': nYear,
    'prev_year': nYear,
    # currently disabled fields:
    #   'cat': nCat, 'price': nPrice, 'hour': nHour, 'ed': nED, 'job': nJob
}

# features_offsets[p] is the first global index of the p-th field, so all
# fields can share a single embedding table. After the loop next_offset equals
# the total number of feature indices.
features_offsets = {}
next_offset = 0
for index, field_size in enumerate(features_sizes.values()):
    features_offsets[index] = next_offset
    next_offset += field_size
features_offsets

In [None]:
# Rows of global feature indices for observed (positive) interactions.
x_train = []
# Matching rows in which the visited item is replaced by a sampled negative item.
x_train_neg = []

def feat(x):
    """Shift per-field ids into the shared index space.

    The value at position p of `x` is offset by `features_offsets[p]`; the
    result is a list of the same length as `x`.
    """
    return [local_id + features_offsets[position] for position, local_id in enumerate(x)]

# Build one (positive, negative) row pair per training interaction. Both rows
# share the user / last-item / year / prev-year context and differ only in the
# item field. The field order must match features_sizes.
# Other feature sets (category, price, hours, education, job) are currently
# disabled; enabling them requires extending features_sizes and these tuples.
for (u,i,j,_cat,_year,_price,_hours,_ed,_job, prev_year) in interactionstrain:
    # Context indices are looked up once and reused for both rows.
    uindex = userIDs[u]
    jindex = itemIDs[j]
    year_index = yearIDs[_year]
    prev_year_index = yearIDs[prev_year]

    x_train.append(feat((uindex, itemIDs[i], jindex, year_index, prev_year_index)))

    # Negative sample: a real item the user has not interacted with in the
    # training split. 'dummy' is only a "no previous item" placeholder (it is
    # part of `items`), so it is rejected as a candidate.
    k = random.choice(items)
    while k == 'dummy' or k in itemsPerUser[u]:
        k = random.choice(items)

    x_train_neg.append(feat((uindex, itemIDs[k], jindex, year_index, prev_year_index)))


In [None]:
# Convert the index rows to tensors; row r of data_x and row r of data_x_neg
# come from the same training interaction.
data_x = torch.tensor(x_train)
data_x_neg = torch.tensor(x_train_neg)
# Each dataset sample is a (positive row, negative row) pair.
dataset = data.TensorDataset(data_x,data_x_neg)

In [None]:
# Batch size shared by both loaders.
bs = 100000

# 80/20 random split into training and validation subsets.
n_total = len(dataset)
train_n = int(n_total*0.8)
valid_n = n_total - train_n
splits = [train_n, valid_n]
assert sum(splits) == n_total
trainset, devset = torch.utils.data.random_split(dataset, splits)

train_dataloader = data.DataLoader(trainset, batch_size=bs, shuffle=True)
dev_dataloader = data.DataLoader(devset, batch_size=bs, shuffle=True)

In [None]:
def trunc_normal_(x, mean=0., std=1.):
    """Fill `x` in place with bounded normal-like values and return it.

    Standard-normal draws are folded into (-2, 2) with `fmod`, then scaled by
    `std` and shifted by `mean`.
    """
    x.normal_()   # standard normal samples, in place
    x.fmod_(2)    # fold into the open interval (-2, 2)
    x.mul_(std)
    x.add_(mean)
    return x

In [None]:
class FMModel(nn.Module):
    """Second-order factorization machine trained with a pairwise ranking loss.

    Every feature index owns a scalar bias and a k-dimensional embedding. The
    score of a row is ``w0 + sum(bias) + sum of pairwise embedding dot
    products`` over the feature indices in that row.

    Parameters
    ----------
    n : int (a one-element tensor is also accepted)
        Total number of feature indices, i.e. the embedding table size.
    k : int
        Embedding dimension.
    """

    def __init__(self, n, k):
        super().__init__()
        n = int(n)  # callers may pass e.g. `data_x.max() + 1`, which is a tensor

        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)

        with torch.no_grad():
            trunc_normal_(self.embeddings.weight, std=0.01)
            trunc_normal_(self.bias.weight, std=0.01)

    def _score(self, X):
        """Return one FM score per row of X.

        X is a LongTensor of shape (batch, n_fields); the result has shape
        (batch,).
        """
        emb = self.embeddings(X)                      # (batch, n_fields, k)
        # sum_{a<b} <v_a, v_b> = 0.5 * ((sum_a v_a)^2 - sum_a v_a^2), summed over k
        pow_of_sum = emb.sum(dim=1).pow(2)
        sum_of_pow = emb.pow(2).sum(dim=1)
        pairwise = (pow_of_sum - sum_of_pow).sum(dim=1) * 0.5
        # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
        # would also drop the batch axis when the batch holds a single row.
        bias = self.bias(X).squeeze(-1).sum(dim=1)
        return self.w0 + bias + pairwise

    def forward(self, X_pos, X_neg):
        """Return the mean BPR loss ``-log sigmoid(score(X_pos) - score(X_neg))``.

        X_pos and X_neg are LongTensors of shape (batch, n_fields) whose rows
        correspond to each other.
        """
        pos = self._score(X_pos)
        neg = self._score(X_neg)
        # logsigmoid is the numerically stable form of log(sigmoid(x)); the
        # naive composition underflows to -inf for large negative differences.
        return -F.logsigmoid(pos - neg).mean()

    def predict_1(self, X):
        """Score the rows of X; returns a tensor of shape (batch,).

        For a single row the result has one element, so ``.item()`` works.
        """
        return self._score(X)

    def predict_2(self, X):
        """Score the rows of X; returns a tensor of shape (batch,)."""
        return self._score(X)

In [None]:
def fit(iterator, model, optimizer, criterion):
    """Run one training epoch and return the mean per-sample loss.

    `iterator` yields (x_pos, x_neg) batches and exposes `.dataset`. The model
    itself returns the loss, so `criterion` is accepted but not used.
    """
    model.train()
    weighted_total = 0
    for pos_batch, neg_batch in iterator:
        optimizer.zero_grad()
        batch_loss = model(pos_batch, neg_batch)
        # Weight by batch size so a smaller final batch is averaged correctly.
        weighted_total += batch_loss.item() * pos_batch.shape[0]
        batch_loss.backward()
        optimizer.step()
    return weighted_total / len(iterator.dataset)

def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x_pos,x_neg in iterator:                    
        with torch.no_grad():
            loss = model(x_pos, x_neg)
        train_loss += loss.item()*x_pos.shape[0]
    return train_loss / len(iterator.dataset)

In [None]:
# The embedding table must cover every index that can be looked up, not just
# the largest index in the positive training rows: negatives and held-out rows
# may reference higher ones. next_offset is the total size of the feature
# space; the two maxima act as a safety net. int() also avoids handing a
# tensor to nn.Embedding.
n_features = max(next_offset, int(data_x.max()) + 1, int(data_x_neg.max()) + 1)
model = FMModel(n_features, 5)
wd=1e-5
lr=0.05
epochs=15
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
# Drop the learning rate by 10x after epoch 10.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10], gamma=0.1)
# Passed through to fit()/test() for signature compatibility; the model
# computes its own loss, so this criterion is not applied.
criterion = nn.MSELoss()
for epoch in range(epochs):
    start_time = time.time()
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    secs = int(time.time() - start_time)
    print(f'epoch {epoch}. time: {secs}[s]')
    print(f'\ttrain loss: {((train_loss)):.4f}')
    print(f'\tvalidation loss: {((valid_loss)):.4f}')

In [None]:
# Held-out interactions grouped by user. Rows of interactionstest have ten
# fields (the last one is prev_year), so all ten are unpacked here and
# prev_year is kept because the model uses it as a feature.
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for (u,i,j,_cat,_year,_price,_hours,_ed,_job,prev_year) in interactionstest:
    interactionsTestPerUser[u].add((i,j,_cat,_year,_price,_hours,_ed,_job,prev_year))
    itemSet.add(i)
    itemSet.add(j)

# Candidate negatives as a list: random.choice needs a sequence, and
# random.sample no longer accepts sets on Python 3.11+. The 'dummy'
# "no previous item" placeholder is not a real place and is excluded.
negativePool = [place for place in itemSet if place != 'dummy']

def AUCu(model, u, N):
    """Estimate the AUC for user `u` from N sampled negatives.

    One held-out interaction of `u` is scored against N distinct places the
    user has not reviewed; the result is the fraction of negatives that score
    strictly lower than the held-out item.

    Parameters
    ----------
    model : object exposing predict_1 / predict_2
        Both take a LongTensor of feature rows and return scores.
    u : user id present in interactionsTestPerUser
    N : int
        Number of negatives to sample (expected to be positive).

    Returns
    -------
    float
        Value in [0, 1].

    Raises
    ------
    ValueError
        If N distinct unvisited places cannot be found.
    """
    (i,j,_cat,_year,_price,_hours,_ed,_job,prev_year) = random.choice(list(interactionsTestPerUser[u]))

    # Rejection-sample distinct negatives the user never reviewed, so neither
    # the positive item nor any other visited place is counted as a negative.
    visited = userVisitedPlaces.get(u, ())
    negatives = []
    chosen = set()
    attempts = 0
    while len(negatives) < N:
        attempts += 1
        if attempts > 1000 * max(N, 1):
            raise ValueError('could not sample %d unvisited negatives for user %r' % (N, u))
        k = random.choice(negativePool)
        if k in visited or k in chosen:
            continue
        chosen.add(k)
        negatives.append(k)

    def _row(item):
        # Same five fields, in the same order, as the training rows:
        # user, item, last item, year, previous year.
        return feat((userIDs[u], itemIDs[item], itemIDs[j], yearIDs[_year], yearIDs[prev_year]))

    with torch.no_grad():
        sp = model.predict_1(torch.LongTensor([_row(i)])).item()
        sn = model.predict_2(torch.LongTensor([_row(k) for k in negatives])).reshape(-1)

    win = int((sn < sp).sum())
    return win/N

# def AUCu(model, u, N):
#     win = 0
#     positive = [random.sample(interactionsTestPerUser[u],1)[0]] * N
#     negative = random.sample(itemSet,N)
#     for (i,j,cat,_year,_price,_hours,_ed,_job),k  in zip(positive,negative):
#         pos1 = np.array([feat((userIDs[u], itemIDs[i], itemIDs[j], categoryIDs[cat]))])
#         neg1 = np.array([feat((userIDs[u], itemIDs[k], itemIDs[j], categoryIDs[cat] ))])
#         p1 =  torch.LongTensor(pos1)
#         n1 =  torch.LongTensor(neg1)
#         sp = model.predict_1(p1).item()
#         sn = model.predict_1(n1).item()
#         #sp = model.predict(userIDs[u], itemIDs[i], itemIDs[j])
#         #sn = model.predict(userIDs[u], itemIDs[k], itemIDs[j])
#         if sp > sn:
#             win += 1
#     return win/N
    

def AUC(model):
    """Average the per-user AUC estimate (10 negatives each) over all test users.

    Prints the running user count every 5000 users as a progress indicator.
    """
    per_user = []
    for seen, u in enumerate(interactionsTestPerUser):
        if seen % 5000 == 0:
            print(seen)
        per_user.append(AUCu(model, u, 10))
    return sum(per_user) / len(per_user)
AUC(model)