In [None]:
import ast
import copy
import itertools
import json
import os
import random
import sys
import time
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import scipy as sy
import scipy.optimize
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import *

from ipynb.fs.full.Random_Sample_Mapper import *

In [None]:
# Resolve the data files relative to the directory part of the notebook's absolute path
# (os.path.abspath resolves the bare file name against the current working directory).
notebook_path = os.path.abspath("BPR_OPT_Binary_Model.ipynb")
_notebook_dir = os.path.dirname(notebook_path)
users_items_file_path = os.path.join(_notebook_dir, "data/australian_users_items.json")
items_file_path = os.path.join(_notebook_dir, "data/items_meta_data.json")
users_meta_data_file_path = os.path.join(_notebook_dir, "data/users_meta_data.json")

In [None]:
# Every line of the users/items file holds one Python-literal record;
# parse them all into a list, in file order.
with open(users_items_file_path, 'r') as data:
    users_items = [ast.literal_eval(line) for line in data]

In [None]:
# Load the item metadata JSON document (iterated by item key further down).
with open(items_file_path, 'r') as data:
    games_dict = json.loads(data.read())

In [None]:
# Load the user metadata JSON document.
with open(users_meta_data_file_path, 'r') as file:
    users_meta_data = json.loads(file.read())

In [None]:
# Interaction indexes; they start empty and are filled by the loops that follow.
usersPerItem, itemsPerUser = defaultdict(set), defaultdict(set)
playtimesPerItem, playtimesPerUser = defaultdict(dict), defaultdict(dict)
itemNames = defaultdict(str)

In [None]:
# Train / test counterparts of the user<->item indexes (empty until the split runs).
train_usersPerItem, train_itemsPerUser = defaultdict(set), defaultdict(set)
test_usersPerItem, test_itemsPerUser = defaultdict(set), defaultdict(set)

In [None]:
# Record the owner ids of every game whose metadata carries an 'owners' mapping.
for game, meta in games_dict.items():
    if 'owners' not in meta:
        continue
    usersPerItem[game] = set(meta['owners'].keys())

In [None]:
# Build the per-user item lists, an 80/20 train/test split of each user's items,
# and the playtime / item-name lookups.
for user in users_items:
    u_id = user['user_id']
    user_items = user['items']
    items = [item['item_id'] for item in user_items]
    itemsPerUser[u_id] = items

    # train_test_split raises ValueError when it cannot produce a non-empty train
    # part (0 or 1 items), so such users keep whatever they have in train and get
    # an empty test list instead of aborting the whole loop.
    if len(items) < 2:
        items_train, items_test = list(items), []
    else:
        items_train, items_test = train_test_split(items, test_size=0.20)
    train_itemsPerUser[u_id] = items_train
    test_itemsPerUser[u_id] = items_test

    # Single pass over the user's items fills all three playtime/name lookups.
    playtimes = {}
    for item in user_items:
        item_id = item['item_id']
        playtime = item['playtime_forever']
        playtimes[item_id] = playtime
        itemNames[item_id] = item['item_name']
        playtimesPerItem[item_id][u_id] = playtime
    playtimesPerUser[u_id] = playtimes

In [None]:
# Invert the per-user train/test item lists into per-item user sets.
for user, owned in train_itemsPerUser.items():
    for item in owned:
        train_usersPerItem[item].add(user)

    for item in test_itemsPerUser[user]:
        test_usersPerItem[item].add(user)

In [None]:
# Fixed orderings of user and item ids; the flat parameter vector is laid out
# in the order of these lists.
users = list(itemsPerUser)
items = list(usersPerItem)
nUsers = len(users)
nItems = len(items)

In [None]:
# Number of training items held by each user.
train_user_item_counts = {user_id: len(held) for user_id, held in train_itemsPerUser.items()}

In [None]:
# Number of held-out (test) items for each user.
test_user_item_counts = {user_id: len(held) for user_id, held in test_itemsPerUser.items()}

In [None]:
# Input, intermediate and output files of the sampling mapreduce.
train_datafile = 'data/train_sample_in.tsv'
train_mapout1 = 'data/train_sample_map1.tsv'
train_mapout2 = 'data/train_sample_map2.tsv'
train_outfile = 'data/train_sample_out.tsv'

# One formatted line per (user, train item) pair. The context manager closes
# (and flushes) the file even if formatting fails part-way through.
with open(train_datafile, 'w') as f:
    for u, its in train_itemsPerUser.items():
        for i in its:
            print(default_formatter(u, i), file=f)

In [None]:
# Run two stages of mapreduce: two passes over the training file, then a final
# pass that combines both intermediate outputs into train_outfile.
# NOTE(review): Mapper, mapreduce, reducer, indicator_mapper and indicator_reducer are
# not defined in this notebook; presumably they come from the Random_Sample_Mapper
# star import — confirm their semantics there.
train_mapper = Mapper(train_user_item_counts)
mapreduce(train_datafile, train_mapout1, mapper=train_mapper, reducer=reducer)
mapreduce(train_datafile, train_mapout2, mapper=indicator_mapper)  # map the data again
mapreduce([train_mapout1, train_mapout2], train_outfile, reducer=indicator_reducer)

In [None]:
# Build (user, train item, test item) triples: every held-out item j of a user is
# paired with one randomly chosen item i from the same user's training items.
test_sample = []

for u, its in test_itemsPerUser.items():
    # The user's training items do not change inside the inner loop, so the list
    # is materialised (and checked for emptiness) once per user.
    ii = list(train_itemsPerUser[u])
    if len(ii) < 1:
        continue
    for j in its:
        i = random.choice(ii)
        test_sample.append((u, i, j))

In [None]:
class ExternalSchedule(object):
    """Serves (u, i, j) string triples read from a whitespace-separated text file."""

    def __init__(self, filepath):
        """Store the path of the sample file; the file is read on each sampling call."""
        self.filepath = filepath

    def generate_random_samples(self, size=None, rand=True):
        """Yield triples of whitespace-separated tokens, one triple per file line.

        Args:
            size: maximum number of triples to yield; ``None`` yields all of them.
            rand: shuffle the triples (with the ``random`` module) before yielding.

        Raises:
            ValueError: when a selected non-blank line does not hold exactly three tokens.
        """
        # The file is fully read and closed before the first triple is yielded.
        # Blank lines are skipped so a stray empty line cannot break unpacking.
        with open(self.filepath) as f:
            samples = [tuple(line.split()) for line in f if line.strip()]
        if rand:
            random.shuffle(samples)  # important!
        if size is None:
            size = len(samples)
        for u, i, j in samples[:size]:
            yield u, i, j

In [None]:
# Sample source used by the cost/derivative functions; it reads the final mapreduce output file.
sampler = ExternalSchedule(train_outfile)  # schedule is one-indexed

In [None]:
def inner(x, y):
    """Return the dot product of ``x`` and ``y``.

    Elements are paired with ``zip``, so extra trailing elements of the longer
    sequence are ignored; two empty sequences give 0.
    """
    total = 0
    for a, b in zip(x, y):
        total += a * b
    return total

In [None]:
def trim(u, i, j):
    return u[1:len(u)-1], i[2:len(i)-2], j[1:len(j)-2]

In [None]:
def binary_label(u, i, j):
    """Return how often item ``j`` occurs among ``itemsPerUser[u]`` (0 when absent).

    ``i`` is accepted for signature symmetry with ``prediction`` but is not used.
    """
    return Counter(itemsPerUser[u])[j]

In [None]:
def accuracy(sample):
    """Return the accuracy of the rounded sigmoid predictions over ``sample``.

    ``sample`` is an iterable of (u, i, j) triples; each triple is scored with
    ``sigmoid(prediction(u, i, j))`` and compared against ``binary_label(u, i, j)``.
    """
    scored = [
        (sigmoid(prediction(u, i, j)), binary_label(u, i, j))
        for u, i, j in sample
    ]
    predictions = [score for score, _label in scored]
    labels = [label for _score, label in scored]
    return accuracy_score(labels, np.rint(predictions))

## Sigmoid Function

\begin{equation*}
\sigma(x) = \frac{1}{1 + e^{-x}}
\end{equation*}

In [None]:
def sigmoid(x):
    """Return 1 / (1 + exp(-x)) for a scalar ``x`` without overflowing ``exp``."""
    # Numerically stable sigmoid function.
    # Taken from: https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
    if x < 0:
        # For negative x, exp(x) is small, and the denominator 1 + z can't be zero.
        z = np.exp(x)
        return z / (1 + z)
    z = np.exp(-x)
    return 1 / (1 + z)

# Simple (Bias Only) Latent Factor Model with Binary Classification

In [None]:
# Per-item bias terms; unknown items read as 0.0 until unpack() replaces this with a plain dict.
itemBiases = defaultdict(float)

In [None]:
def unpack(theta):
    """Rebind the global ``itemBiases`` from the flat parameter vector ``theta``.

    The k-th entry of ``theta`` becomes the bias of the k-th id in the global ``items`` list.
    """
    global itemBiases
    itemBiases = {item: bias for item, bias in zip(items, theta)}

## Prediction Function

\begin{equation*}
f(i, j) = \beta_i - \beta_j
\end{equation*}

\begin{equation*}
p(i >_u j) = \sigma(f(i, j))
\end{equation*}

In [None]:
def prediction(u, item_i, item_j):
    """Return the bias of ``item_i`` minus the bias of ``item_j``; ``u`` is not used by this model."""
    bias_i = itemBiases[item_i]
    bias_j = itemBiases[item_j]
    return bias_i - bias_j

\begin{equation*}
\text{BPR-OPT} := \text{argmax} \ln(\sigma(\beta_i - \beta_j))
\end{equation*}

\begin{equation*}
\text{Cost Function}:= \sum_{u,i,j} \ln\left( \frac{1}{1 + e^{\beta_j - \beta_i}} \right)
\end{equation*}

## Cost Function

In [None]:
def cost(theta):
    """Evaluate the bias-only model on every training triple and return the objective value.

    Loads ``theta`` into the global item biases, sums ``1 - sigmoid(x)`` over all
    triples from ``sampler`` (x = prediction(u, i, j)), prints the negated sum and
    the accuracy of the rounded predictions, and returns the negated sum.

    NOTE(review): the markdown defines the objective as sum(ln sigma(x)), whereas
    this returns -sum(1 - sigma(x)), and ``derivative`` is not the gradient of this
    value — confirm the intended objective before relying on the optimiser result.
    """
    unpack(theta)
    total = 0
    predictions = []
    labels = []
    for u, i, j in sampler.generate_random_samples():
        u, i, j = trim(u, i, j)
        # Evaluate the sigmoid once and reuse it for both the prediction and the cost term.
        p = sigmoid(prediction(u, i, j))
        predictions.append(p)
        labels.append(binary_label(u, i, j))

        total += (1 - p)

    print(-total)
    print(accuracy_score(labels, np.rint(predictions)))

    return -total

\begin{equation*}
\frac{\partial }{\partial x} \ln\sigma(x) = \frac{1}{1 + e^x} = \sigma(-x)
\end{equation*}

## Partial Derivatives

\begin{equation*}
\frac{\partial \ln\sigma(f(i, j))}{\partial \beta_i} = \frac{e^{\beta_j - \beta_i}}{1 + e^{\beta_j - \beta_i}} = \frac{1}{1 + e^{\beta_i - \beta_j}}
\end{equation*}

\begin{equation*}
\frac{\partial \ln\sigma(f(i, j))}{\partial \beta_j} = -\frac{e^{\beta_j - \beta_i}}{1 + e^{\beta_j - \beta_i}} = -\frac{1}{1 + e^{\beta_i - \beta_j}}
\end{equation*}

In [None]:
def derivative(theta):
    """Return the per-item-bias derivative vector handed to the optimiser as ``fprime``.

    For every training triple, sigma(-x) (x = prediction(u, i, j)) is added to
    item i's entry and subtracted from item j's entry. The returned array is
    ordered like the global ``items`` list.

    NOTE(review): this is the gradient of sum(ln sigma(x)) from the markdown, not
    of the value returned by ``cost`` — confirm which objective is intended.
    """
    unpack(theta)
    dItemBiases = defaultdict(float)
    for u, i, j in sampler.generate_random_samples():
        u, i, j = trim(u, i, j)
        x = prediction(u, i, j)
        # sigmoid(-x) equals 1 / (1 + exp(x)) but does not overflow exp() for large x.
        dbase = sigmoid(-x)
        dItemBiases[i] += dbase
        dItemBiases[j] -= dbase
    return np.array([dItemBiases[item] for item in items])

In [None]:
# Fit the bias-only model with L-BFGS-B, starting from all-zero biases.
res, f, d = sy.optimize.fmin_l_bfgs_b(func=cost, x0=[0.0] * nItems, fprime=derivative)

In [None]:
# Display the third value returned by fmin_l_bfgs_b (the optimiser's information dictionary).
d

In [None]:
# Load the optimised parameter vector back into the global item biases.
unpack(res)

# Complete Latent Factor Model with Binary Classification

A simple non-biased latent factor model that is wrapped in a binary function (the sigmoid function) and serves as a baseline model, using popularity as the item's sole feature.

In [None]:
# Reset the item biases for the complete model; unknown items read as 0.0 until unpack() runs.
itemBiases = defaultdict(float)

For each user and item we now have a low dimensional descriptor (representing that user's preferences, and that item's properties), of dimension K.

In [None]:
# Latent-factor vectors keyed by user id and by item id (filled in below).
userGamma, itemGamma = {}, {}

In [None]:
# Dimension K of every user / item latent-factor vector.
K = 2

In [None]:
# Give every user K small random factors, each drawn as random.random() * 0.1 - 0.05.
userGamma.update(
    (u, [random.random() * 0.1 - 0.05 for k in range(K)])
    for u in itemsPerUser
)

In [None]:
# Give every item K small random factors, each drawn as random.random() * 0.1 - 0.05.
itemGamma.update(
    (i, [random.random() * 0.1 - 0.05 for k in range(K)])
    for i in usersPerItem
)

We'll use another library in this example to perform gradient descent. This library requires that we pass it a "flat" parameter vector (theta) containing all of our parameters. This utility function just converts between a flat feature vector, and our model parameters, i.e., it "unpacks" theta into our offset and bias parameters.

In [None]:
def unpack(theta):
    """Spread the flat parameter vector ``theta`` over the global model parameters.

    Layout of ``theta``: ``nItems`` item biases (ordered like ``items``), then K
    factors per user (ordered like ``users``), then K factors per item (ordered
    like ``items``). ``itemBiases`` is rebound; ``userGamma`` and ``itemGamma``
    are updated in place with slices of ``theta``.
    """
    global itemBiases
    global userGamma
    global itemGamma
    itemBiases = dict(zip(items, theta[:nItems]))
    offset = nItems
    for gammas, keys in ((userGamma, users), (itemGamma, items)):
        for key in keys:
            gammas[key] = theta[offset:offset + K]
            offset += K

## Prediction Function

\begin{equation*}
f(u, i, j) = \gamma_u \gamma_i + \beta_i - (\gamma_u \gamma_j + \beta_j)
\end{equation*}

\begin{equation*}
p(i >_u j) = \sigma(f(u, i, j))
\end{equation*}

In [None]:
def prediction(user, item_i, item_j):
    """Return the score of ``item_i`` minus the score of ``item_j`` for ``user``.

    An item's score is the inner product of the user's and the item's factor
    vectors plus the item's bias.
    """
    score_i = inner(userGamma[user], itemGamma[item_i]) + itemBiases[item_i]
    score_j = inner(userGamma[user], itemGamma[item_j]) + itemBiases[item_j]
    return score_i - score_j

## Cost Function

\begin{equation*}
\text{BPR-OPT} := \text{argmax} \ln(\sigma(\gamma_u \gamma_i + \beta_i - (\gamma_u \gamma_j + \beta_j)))
\end{equation*}

\begin{equation*}
\text{Cost Function}:= \sum_{u,i,j} \ln\left( \frac{1}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} \right)
\end{equation*}

In [None]:
def cost(theta):
    """Evaluate the complete model on every training triple and return the objective value.

    Loads ``theta`` into the global parameters, sums sigma(-x) = 1 / (1 + exp(x))
    over all triples from ``sampler`` (file order, x = prediction(u, i, j)),
    prints the sum and the accuracy of the rounded predictions, and returns the sum.

    NOTE(review): the markdown defines the objective as sum(ln sigma(x)); this
    returns sum(sigma(-x)) instead — confirm the intended objective.
    """
    unpack(theta)
    total = 0
    predictions = []
    labels = []
    for u, i, j in sampler.generate_random_samples(rand=False):
        u, i, j = trim(u, i, j)
        x = prediction(u, i, j)
        predictions.append(sigmoid(x))
        labels.append(binary_label(u, i, j))

        # sigmoid(-x) equals 1 / (1 + exp(x)) but does not overflow exp() for large x.
        total += sigmoid(-x)

    print('Current Cost: %s' % total)
    print('Current Accuracy: %s' % accuracy_score(labels, np.rint(predictions)))

    return total

## Partial Derivatives

\begin{equation*}
\frac{\partial \ln\sigma(f(u, i, j))}{\partial \gamma_{u,k}} = \frac{(\gamma_{i,k} - \gamma_{j,k}) \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}
\end{equation*}

\begin{equation*}
\frac{\partial \ln\sigma(f(u, i, j))}{\partial \gamma_{i,k}} = \frac{\gamma_{u,k} \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}
\end{equation*}

\begin{equation*}
\frac{\partial \ln\sigma(f(u, i, j))}{\partial \gamma_{j,k}} = -\frac{\gamma_{u,k} \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}
\end{equation*}

\begin{equation*}
\frac{\partial \ln\sigma(f(u, i, j))}{\partial \beta_i} = \frac{e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}
\end{equation*}

\begin{equation*}
\frac{\partial \ln\sigma(f(u, i, j))}{\partial \beta_j} = -\frac{e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}
\end{equation*}

In [None]:
def derivative(theta):
    """Return the derivative vector handed to the optimiser as ``fprime``.

    For every training triple (file order) with x = prediction(u, i, j), the
    factor sigma(-x) is accumulated into the entries of item i (positive) and
    item j (negative) for the biases and item factors, and into the user's
    factors scaled by (gamma_i - gamma_j). The result uses the same layout that
    ``unpack`` expects: item biases, then user factors, then item factors.

    NOTE(review): ``cost`` returns sum(sigma(-x)), which decreases as x grows,
    while the entries accumulated here for item i are positive. fmin_l_bfgs_b
    expects ``fprime`` to be the gradient of ``func`` — confirm the intended sign.
    """
    unpack(theta)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in users}
    dItemGamma = {i: [0.0] * K for i in items}
    for u, i, j in sampler.generate_random_samples(rand=False):
        u, i, j = trim(u, i, j)
        x = prediction(u, i, j)
        # sigmoid(-x) equals 1 / (1 + exp(x)) but does not overflow exp() for large x.
        dbase = sigmoid(-x)
        dItemBiases[i] += dbase
        dItemBiases[j] -= dbase
        for k in range(K):
            dUserGamma[u][k] += (itemGamma[i][k] - itemGamma[j][k]) * dbase
            dItemGamma_k = userGamma[u][k] * dbase
            dItemGamma[i][k] += dItemGamma_k
            dItemGamma[j][k] -= dItemGamma_k
    # Flatten in the layout used by unpack().
    dtheta = [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return np.array(dtheta)

In [None]:
# Fit the complete model with L-BFGS-B. The start vector follows unpack()'s layout:
# zero biases first, then small random factors for all users and items.
initial_theta = [0.0] * nItems  # Initialize beta
initial_theta += [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nItems))]  # Gamma
complete_res, f, d = sy.optimize.fmin_l_bfgs_b(func=cost, x0=initial_theta, fprime=derivative)

In [None]:
# Display the third value returned by fmin_l_bfgs_b (the optimiser's information dictionary).
d

In [None]:
# Load the optimised parameter vector back into the global biases and latent factors.
unpack(complete_res)