In [1]:
import ast
import copy
import itertools
import json
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import scipy as sy
import scipy.optimize

from itertools import islice
from tqdm import *
from math import ceil
from collections import defaultdict
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.sparse import csr_matrix, dok_matrix
from ipynb.fs.full.Random_Sample_Mapper import *

from __future__ import print_function

from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Read In Data

In [2]:
# Resolve every data file relative to the directory that holds this notebook.
notebook_path = os.path.abspath("BPR_OPT_Binary_Model.ipynb")
_notebook_dir = os.path.dirname(notebook_path)
users_items_file_path, items_file_path, users_meta_data_file_path = (
    os.path.join(_notebook_dir, relative_name)
    for relative_name in (
        "data/australian_users_items.json",
        "data/items_meta_data.json",
        "data/users_meta_data.json",
    )
)

In [3]:
# Every line of the file is a Python-literal record; parse them in file order.
with open(users_items_file_path, 'r') as records_file:
    users_items = [ast.literal_eval(record_line) for record_line in records_file]

In [4]:
# Item metadata is stored as a single JSON document.
with open(items_file_path, 'r') as items_file:
    games_dict = json.load(items_file)

In [5]:
# User metadata is stored as a single JSON document.
with open(users_meta_data_file_path, 'r') as users_meta_file:
    users_meta_data = json.load(users_meta_file)

# Process Data into Training and Testing Sets

Using default dict for efficient data retrieval for users-items playtime relationship

In [6]:
# Lookup tables filled by the following cells; each auto-creates an empty
# value on first access to a missing key.
usersPerItem, itemsPerUser = defaultdict(set), defaultdict(set)
playtimesPerItem, playtimesPerUser = defaultdict(dict), defaultdict(dict)
itemNames = defaultdict(str)

In [7]:
# Record the owner ids of every game whose metadata carries an 'owners' entry.
for game, game_meta in games_dict.items():
    if 'owners' not in game_meta:
        continue
    usersPerItem[game] = set(game_meta['owners'].keys())

In [8]:
# Index each user's library by user and by item.
for record in users_items:
    u_id = record['user_id']
    owned = record['items']
    # Stored as a list (in file order), even though the container defaults to set.
    itemsPerUser[u_id] = [entry['item_id'] for entry in owned]
    playtimesPerUser[u_id] = {entry['item_id']: entry['playtime_forever'] for entry in owned}
    for entry in owned:
        item_id = entry['item_id']
        itemNames[item_id] = entry['item_name']
        playtimesPerItem[item_id][u_id] = entry['playtime_forever']

## Scheduled Sampling with Map-Reduce Algorithm

In [9]:
# Fix the user and item orderings used to lay out the flat parameter vector.
users = list(itemsPerUser)
items = list(usersPerItem)
nUsers, nItems = len(users), len(items)

In [10]:
# Number of library entries per user.
user_item_counts = {user_id: len(library) for user_id, library in itemsPerUser.items()}

In [11]:
# Intermediate and final files of the sampling pipeline.
datafile = 'data/sample_in.tsv'
mapout1 = 'data/sample_map1.tsv'
mapout2 = 'data/sample_map2.tsv'
outfile = 'data/sample_out.tsv'

# Write one formatted (user, item) record per line. The context manager
# closes the file even if formatting or writing raises part-way through.
with open(datafile, 'w') as f:
    for u, its in itemsPerUser.items():
        for i in its:
            print(default_formatter(u, i), file=f)

In [12]:
# run two stages of mapreduce
# NOTE(review): Mapper, mapreduce, reducer, indicator_mapper and
# indicator_reducer are not defined in this notebook; presumably they come
# from the Random_Sample_Mapper star import -- confirm.
mapper = Mapper(user_item_counts)
# Stage 1: datafile -> mapout1 using the count-aware mapper and `reducer`.
mapreduce(datafile, mapout1, mapper=mapper, reducer=reducer)
mapreduce(datafile, mapout2, mapper=indicator_mapper)  # map the data again
# Stage 2: combine both intermediate files into the final sample file.
mapreduce([mapout1, mapout2], outfile, reducer=indicator_reducer)

In [13]:
def trim(u, i, j):
    return u[1:len(u)-1], i[2:len(i)-2], j[1:len(j)-2]

In [14]:
def create_data(filepath):
    """Load (u, i, j) sample triples from a whitespace-separated text file.

    Parameters
    ----------
    filepath : str
        Path of the sample file; each non-blank line must hold exactly three
        whitespace-separated tokens.

    Returns
    -------
    list of tuple
        One ``trim(u, i, j)`` result per non-blank line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly three tokens.
    """
    # The context manager closes the handle; the original left it open.
    with open(filepath) as sample_file:
        # Skip blank lines (e.g. a trailing newline) instead of failing on them.
        samples = [line.split() for line in sample_file if line.strip()]
    return [trim(u, i, j) for u, i, j in samples]

In [15]:
def create_random_batches(data, batch_size=1024):
    """Shuffle ``data`` in place and cut it into full-size mini-batches.

    Only complete batches are returned; any remainder shorter than
    ``batch_size`` is dropped. The caller's list is reordered.
    """
    random.shuffle(data)
    full_batches = int(len(data) / batch_size)
    return [
        data[k * batch_size:(k + 1) * batch_size]
        for k in range(full_batches)
    ]

In [9]:
# Parse the sampled (u, i, j) triples written by the map-reduce stage.
data = create_data(outfile)

NameError: name 'create_data' is not defined

In [None]:
# Keep a random 5% of the triples (the "train" half of the split) as the
# working sample. NOTE(review): no random_state is passed, so the subset
# differs between runs.
sample_data = train_test_split(data, train_size=0.05)[0]

In [None]:
# Indexes restricted to the sampled triples (filled in the next cell).
sampleUsersPerItem = defaultdict(set)
sampleItemsPerUser = defaultdict(set)

In [None]:
# Index only the (user, first item) side of each triple; j is not recorded.
for u, i, j in sample_data:
    sampleUsersPerItem[i].add(u)
    sampleItemsPerUser[u].add(i)

In [None]:
# The whole sample is used for training (train_data is the same list object).
train_data = sample_data
# Distinct items / users that occur in the sample.
sampleItems = list(sampleUsersPerItem.keys())
sampleUsers = list(sampleItemsPerUser.keys())

In [None]:
# Build evaluation triples (u, i, j): each sampled item i of user u is paired
# with a random sampled item j that the user is not known to own.
#
# Two defects are fixed here: the emptiness guard tested the global `items`
# list instead of this user's item set, and the remove/add around
# random.choice touched a set that the draw never looked at, so j could be i
# or any other owned item.
test_data = []
MAX_NEGATIVE_TRIES = 50  # rejection-sampling budget before a full scan
for u, positives in sampleItemsPerUser.items():
    # .get() avoids inserting empty entries into the itemsPerUser defaultdict.
    userItems = positives.union(itemsPerUser.get(u, ()))
    if len(userItems) == 0: continue
    for i in positives:
        j = None
        for _attempt in range(MAX_NEGATIVE_TRIES):
            candidate = random.choice(sampleItems)
            if candidate not in userItems:
                j = candidate
                break
        if j is None:
            # Rare path: the user owns almost everything, so enumerate.
            negatives = [item for item in sampleItems if item not in userItems]
            if not negatives:
                break  # no unowned sampled item exists for this user
            j = random.choice(negatives)
        test_data.append((u, i, j))

# Utility Functions

In [22]:
def inner(x, y):
    """Dot product of two sequences; extra elements of the longer one are ignored."""
    total = 0
    for a, b in zip(x, y):
        total = total + a * b
    return total

In [23]:
def binary_label(u, i, j):
    """Return 1 when item ``i`` occurs in user ``u``'s library at least as
    often as item ``j`` does, otherwise 0."""
    occurrences = Counter(itemsPerUser[u])
    return 1 if occurrences[i] >= occurrences[j] else 0

In [24]:
def generate_outputs(sample):
    """Score every (u, i, j) triple with the current model.

    Returns a pair ``(rounded_predictions, labels)``: the sigmoid of each
    prediction rounded to the nearest integer with ``np.rint``, and the
    matching ``binary_label`` values as a list.
    """
    scores, truths = [], []
    for triple in sample:
        u, i, j = triple
        scores.append(sigmoid(prediction(u, i, j)))
        truths.append(binary_label(u, i, j))

    return np.rint(scores), truths

In [25]:
def accuracy(predictions, labels):
    """Fraction of positions where ``predictions`` equals ``labels``.

    Parameters
    ----------
    predictions, labels : iterable
        Compared pairwise with ``==``; comparison stops at the shorter one.

    Returns
    -------
    float
        Share of matching pairs in ``[0, 1]``; ``0.0`` when there is nothing
        to compare (the original raised ZeroDivisionError).
    """
    matches = [1 if x == y else 0 for x, y in zip(predictions, labels)]
    if not matches:
        return 0.0
    # float() forces true division even where `/` on ints truncates.
    return sum(matches) / float(len(matches))

In [26]:
# Precompute the training labels once; the cost functions reuse them on every call.
train_labels = [binary_label(u, i, j) for u, i, j in train_data]

## Sigmoid Function

\begin{equation*}
\sigma(x) = \frac{1}{1 + e^{-x}}
\end{equation*}

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)) for a scalar ``x``."""
    #Numerically stable sigmoid function.
    #Taken from: https://timvieira.github.io/blog/post/2014/02/11/exp-normalize-trick/
    # Only ever exponentiate a non-positive number, so exp() cannot overflow.
    z = np.exp(-abs(x))
    if x >= 0:
        return 1 / (1 + z)
    # x < 0: z == exp(x) is small and the denominator 1 + z cannot be zero.
    return z / (1 + z)

# Simple (Bias-Only) Latent Factor Model with Binary Classification

In [None]:
# Per-call traces appended by cost(): objective value and training accuracy.
loss_history = []
train_accuracy_history = []

In [None]:
# One scalar bias per item; replaced by a plain dict whenever unpack() runs.
itemBiases = defaultdict(float)

In [None]:
def unpack(theta):
    """Rebind the global ``itemBiases`` from the flat parameter vector.

    ``theta`` is paired positionally with the global ``items`` list.
    """
    global itemBiases
    itemBiases = {item: bias for item, bias in zip(items, theta)}

## Prediction Function

\begin{equation*}
f(i, j) = \beta_i - \beta_j
\end{equation*}

\begin{equation*}
p(i >_u j) = \sigma(f(i, j))
\end{equation*}

In [None]:
def prediction(u, item_i, item_j):
    """Bias-only preference score: bias of ``item_i`` minus bias of ``item_j``.

    ``u`` is accepted for signature compatibility and is not used.
    """
    bias_i = itemBiases[item_i]
    bias_j = itemBiases[item_j]
    return bias_i - bias_j

\begin{equation*}
\text{Cost Function (arg min)}:= \sum_{u,i,j} -\ln(\sigma(\beta_i - \beta_j)) + \lambda \sum_i \beta_i^2 = \sum_{u,i,j} -ln\left( \frac{1}{1 + e^{\beta_j - \beta_i}} \right) + \lambda \sum_i \beta_i^2
\end{equation*}

## Cost Function

In [None]:
def cost(theta, lamb):
    """Regularised negative log-likelihood of the bias-only model.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector; passed to ``unpack`` to populate ``itemBiases``.
    lamb : float
        L2 regularisation strength applied to every item bias.

    Returns
    -------
    float
        ``-sum(ln sigmoid(x)) + lamb * sum(beta**2)`` over ``train_data``.

    Side effects
    ------------
    Appends to ``loss_history`` and ``train_accuracy_history`` and prints a
    progress report on every call.
    """
    unpack(theta)
    log_likelihood = 0.0
    predictions = []
    for u, i, j in train_data:
        x = prediction(u, i, j)
        predictions.append(sigmoid(x))
        # ln(sigmoid(x)) == -ln(1 + exp(-x)). logaddexp evaluates it without
        # going through sigmoid(x), which underflows to 0 for strongly
        # negative x and would turn the objective into +inf.
        log_likelihood -= np.logaddexp(0.0, -x)

    penalty = 0.0
    for i in itemBiases:
        penalty += lamb*itemBiases[i]**2

    objective = penalty - log_likelihood

    train_accuracy = accuracy(np.rint(predictions), train_labels)
    loss_history.append(objective)
    train_accuracy_history.append(train_accuracy)
    print('iteration {0} Cost: {1}'.format(len(loss_history), objective))
    print('iteration {0} Training Accuracy: {1}'.format(len(train_accuracy_history), train_accuracy))
    print('-------------------------------------------------------------------')

    return objective

\begin{equation*}
\frac{\partial }{\partial x} ln\sigma(x) = \frac{1}{1 + e^x} = \sigma(-x)
\end{equation*}

## Partial Derivatives

\begin{equation*}
\frac{\partial f}{\partial \beta_i} = -\frac{e^{\beta_j - \beta_i}}{1 + e^{\beta_j - \beta_i}} + 2 \lambda \beta_i = -\frac{1}{1 + e^{\beta_i - \beta_j}} + 2 \lambda \beta_i
\end{equation*}

\begin{equation*}
\frac{\partial f}{\partial \beta_j} = \frac{e^{\beta_j - \beta_i}}{1 + e^{\beta_j - \beta_i}} + 2 \lambda \beta_j = \frac{1}{1 + e^{\beta_i - \beta_j}} + 2 \lambda \beta_j
\end{equation*}

### [Important]: switch the sign of all partial derivatives to compute your gradient ascent

In [None]:
def derivative(theta, lamb):
    """Gradient of the bias-only ``cost`` with respect to ``theta``.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector, one bias per entry of ``items``.
    lamb : float
        L2 regularisation strength.

    Returns
    -------
    numpy.ndarray
        Partial derivatives ordered like the global ``items`` list.
    """
    unpack(theta)
    dItemBiases = defaultdict(float)
    for u, i, j in train_data:
        x = prediction(u, i, j)
        # d/dx of -ln(sigmoid(x)) is -sigmoid(-x). Going through the stable
        # sigmoid avoids the overflow that 1 / (1 + exp(x)) hits for large x.
        dbase = sigmoid(-x)
        dItemBiases[i] -= dbase
        dItemBiases[j] += dbase
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    return np.array([dItemBiases[i] for i in items])

In [None]:
# Fit the bias-only model with L-BFGS-B, starting from all-zero biases and
# using lambda = 0.001.
initial_biases = [0.0] * nItems
res, f, d = sy.optimize.fmin_l_bfgs_b(
    cost,
    initial_biases,
    derivative,
    args=[0.001],
)

In [None]:
d

In [None]:
# Load the optimised parameter vector back into the global model state.
unpack(res)

In [None]:
# Objective value recorded at each cost() call of the bias-only fit.
call_index = list(range(len(loss_history)))
plt.plot(call_index, loss_history, linewidth=2.0)
plt.title('Biase Only Loss History')
plt.show()

In [None]:
# Training accuracy recorded at each cost() call of the bias-only fit.
call_index = list(range(len(train_accuracy_history)))
plt.plot(call_index, train_accuracy_history, linewidth=2.0)
plt.title('Biase Only Training Accuracy History')
plt.show()

In [None]:
# Rounded model outputs and binary labels for the held-out triples.
predictions, labels = generate_outputs(test_data)

In [None]:
accuracy(predictions, labels)

# Complete Latent Factor Model with Binary Classification

A simple non-biased latent factor model that is wrapped into a binary function (sigmoid function) as a base line model, using popularity as the item's sole feature

In [None]:
# Start fresh traces so the bias-only run does not leak into these plots.
loss_history = []
train_accuracy_history = []

In [None]:
# Reset the item biases; unpack() replaces this with a plain dict.
itemBiases = defaultdict(float)

For each user and item we now have a low dimensional descriptor (representing that user's preferences, and that item's properties), of dimension K.

In [None]:
# Latent factor vectors keyed by user id / item id.
userGamma = {}
itemGamma = {}

In [None]:
# Dimensionality of every user and item latent vector.
K = 10

In [None]:
# Small random start in [-0.05, 0.05) for every user factor.
for u in itemsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [None]:
# Small random start in [-0.05, 0.05) for every item factor.
for i in usersPerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

We'll use another library in this example to perform gradient descent. This library requires that we pass it a "flat" parameter vector (theta) containing all of our parameters. This utility function just converts between a flat feature vector and our model parameters, i.e., it "unpacks" theta into our item bias and latent factor (gamma) parameters.

In [None]:
def unpack(theta):
    """Distribute the flat vector ``theta`` over the global model parameters.

    Layout: ``nItems`` item biases (in ``items`` order), then ``K`` values per
    user (in ``users`` order), then ``K`` values per item (in ``items`` order).
    ``itemBiases`` is rebound; ``userGamma`` and ``itemGamma`` are updated in
    place with slices of ``theta``.
    """
    global itemBiases
    global userGamma
    global itemGamma
    itemBiases = dict(zip(items, theta[:nItems]))
    cursor = nItems
    for keys, table in ((users, userGamma), (items, itemGamma)):
        for key in keys:
            table[key] = theta[cursor:cursor + K]
            cursor += K

## Prediction Function

\begin{equation*}
f(u, i, j) = \gamma_u \gamma_i + \beta_i - (\gamma_u \gamma_j + \beta_j)
\end{equation*}

\begin{equation*}
p(i >_u j) = \sigma(f(u, i, j))
\end{equation*}

In [None]:
def prediction(user, item_i, item_j):
    """Preference score of ``item_i`` over ``item_j`` for ``user``.

    Each item scores ``inner(userGamma[user], itemGamma[item]) + itemBiases[item]``;
    the result is the first score minus the second.
    """
    user_factors = userGamma[user]
    score_i = inner(user_factors, itemGamma[item_i]) + itemBiases[item_i]
    score_j = inner(user_factors, itemGamma[item_j]) + itemBiases[item_j]
    return score_i - score_j

## Cost Function

\begin{equation*}
\text{Cost Function (arg min)}:= \sum_{u,i,j} -\ln(\sigma(\gamma_u \gamma_i + \beta_i - (\gamma_u \gamma_j + \beta_j))) + \lambda [\sum_i \beta_i^2 + \sum_i ||\gamma_i||_2^2 + \sum_u ||\gamma_u||_2^2 ]
\end{equation*}

\begin{equation*}
\sum_{u,i,j} \ln(\sigma(\gamma_u \gamma_i + \beta_i - (\gamma_u \gamma_j + \beta_j))) = \sum_{u,i,j} ln\left( \frac{1}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} \right)
\end{equation*}

In [None]:
def cost(theta, lamb):
    """Regularised negative log-likelihood of the full latent factor model.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector; passed to ``unpack`` to populate
        ``itemBiases``, ``userGamma`` and ``itemGamma``.
    lamb : float
        L2 regularisation strength applied to biases and both gamma tables.

    Returns
    -------
    float
        ``-sum(ln sigmoid(x))`` over ``train_data`` plus ``lamb`` times the
        squared magnitude of every parameter.

    Side effects
    ------------
    Appends to ``loss_history`` and ``train_accuracy_history`` and prints a
    progress report on every call.
    """
    unpack(theta)
    log_likelihood = 0.0
    predictions = []
    for u, i, j in train_data:
        x = prediction(u, i, j)
        predictions.append(sigmoid(x))
        # ln(sigmoid(x)) == -ln(1 + exp(-x)). logaddexp evaluates it without
        # going through sigmoid(x), which underflows to 0 for strongly
        # negative x and would turn the objective into +inf.
        log_likelihood -= np.logaddexp(0.0, -x)

    penalty = 0.0
    for u in users:
        for k in range(K):
            penalty += lamb*userGamma[u][k]**2
    for i in items:
        penalty += lamb*itemBiases[i]**2
        for k in range(K):
            penalty += lamb*itemGamma[i][k]**2

    objective = penalty - log_likelihood

    train_accuracy = accuracy(np.rint(predictions), train_labels)
    loss_history.append(objective)
    train_accuracy_history.append(train_accuracy)
    print('iteration {0} Cost: {1}'.format(len(loss_history), objective))
    print('iteration {0} Training Accuracy: {1}'.format(len(train_accuracy_history), train_accuracy))
    print('-------------------------------------------------------------------')

    return objective

## Partial Derivatives

\begin{equation*}
\frac{\partial f}{\partial \gamma_{u,k}} = \frac{(\gamma_{j,k} - \gamma_{i,k}) \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} + 2 \lambda \gamma_{u,k}
\end{equation*}

\begin{equation*}
\frac{\partial f}{\partial \gamma_{i,k}} = -\frac{\gamma_{u,k} \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} + 2 \lambda \gamma_{i,k}
\end{equation*}

\begin{equation*}
\frac{\partial f}{\partial \gamma_{j,k}} = \frac{\gamma_{u,k} \cdot e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} + 2 \lambda \gamma_{j,k}
\end{equation*}

\begin{equation*}
\frac{\partial f}{\partial \beta_i} = -\frac{e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} + 2 \lambda \beta_i
\end{equation*}

\begin{equation*}
\frac{\partial f}{\partial \beta_j} = \frac{e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}}{1 + e^{\gamma_u \gamma_j + \beta_j - (\gamma_u \gamma_i + \beta_i)}} + 2 \lambda \beta_j
\end{equation*}

In [None]:
def derivative(theta, lamb):
    """Gradient of the full-model ``cost`` with respect to ``theta``.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector in the layout expected by ``unpack``.
    lamb : float
        L2 regularisation strength.

    Returns
    -------
    numpy.ndarray
        Partial derivatives in the same layout as ``theta``: item biases,
        then user gammas (``users`` order), then item gammas (``items`` order).
    """
    # Guarantee ndarray slices after unpack() so the K components of each
    # update can be applied as one vector operation instead of a Python loop.
    theta = np.asarray(theta, dtype=float)
    unpack(theta)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: np.zeros(K) for u in users}
    dItemGamma = {i: np.zeros(K) for i in items}
    for u, i, j in train_data:
        x = prediction(u, i, j)
        # d/dx of -ln(sigmoid(x)) is -sigmoid(-x). Going through the stable
        # sigmoid avoids the overflow that 1 / (1 + exp(x)) hits for large x.
        dbase = sigmoid(-x)
        dItemBiases[i] -= dbase
        dItemBiases[j] += dbase
        dUserGamma[u] += (itemGamma[j] - itemGamma[i]) * dbase
        item_step = userGamma[u] * dbase
        dItemGamma[i] -= item_step
        dItemGamma[j] += item_step
    # Gradient of the L2 penalty.
    for u in users:
        dUserGamma[u] += 2*lamb*userGamma[u]
    for i in items:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        dItemGamma[i] += 2*lamb*itemGamma[i]
    # Flatten in exactly the order unpack() reads the parameters.
    bias_grad = np.array([dItemBiases[i] for i in items], dtype=float)
    return np.concatenate(
        [bias_grad]
        + [dUserGamma[u] for u in users]
        + [dItemGamma[i] for i in items]
    )

In [None]:
# Starting point: zero biases followed by small random gammas for every user
# and item, in the layout unpack() expects.
initial_theta = [0.0]*nItems  # Initialize beta
initial_theta += [random.random() * 0.1 - 0.05 for k in range(K*(nUsers + nItems))]  # Gamma
complete_res, complete_f, complete_d = sy.optimize.fmin_l_bfgs_b(
    cost, initial_theta, derivative, args=[0.001])

In [None]:
complete_d

In [None]:
# Load the optimised parameter vector back into the global model state.
unpack(complete_res)

In [None]:
# Rounded model outputs and binary labels for the held-out triples.
predictions, labels = generate_outputs(test_data)

In [None]:
accuracy(predictions, labels)

In [None]:
# Same evaluation on the training triples, for comparison with the test score.
predictions, labels = generate_outputs(train_data)

In [None]:
accuracy(predictions, labels)

In [None]:
# Objective value recorded at each cost() call of the full-model fit.
call_index = list(range(len(loss_history)))
plt.plot(call_index, loss_history, linewidth=2.0)
plt.title('BPR Loss History')
plt.show()

In [None]:
# Training accuracy recorded at each cost() call of the full-model fit.
call_index = list(range(len(train_accuracy_history)))
plt.plot(call_index, train_accuracy_history, linewidth=2.0)
plt.title('BPR Training Accuracy History')
plt.show()

In [None]:
# Persist the fitted parameters. Each dict is written through np.save; the
# reload cell below reads the same names with a ".npy" suffix.
np.save('./data/itemGamma', itemGamma)
np.save('./data/itemBiases', itemBiases)
np.save('./data/userGamma', userGamma)

In [10]:
# Reload the fitted parameters saved above. The files hold pickled Python
# dicts, which np.load refuses to read unless allow_pickle=True is passed
# (the default became False in NumPy 1.16.3); .item() unwraps the dict.
# Unpickling executes code from the file, so only load files this notebook
# wrote itself.
itemGamma = np.load('./data/itemGamma.npy', allow_pickle=True).item()
itemBiases = np.load('./data/itemBiases.npy', allow_pickle=True).item()
userGamma = np.load('./data/userGamma.npy', allow_pickle=True).item()

In [None]:
def predict_user(user):
    """
    Score every item in ``user``'s library against a random other item.

    For each item ``i`` of the user, a comparison item ``j != i`` is drawn
    uniformly from the global ``items`` list and the pair is scored with
    ``sigmoid(prediction(user, i, j))``.

    Returns a pair ``(labels, predictions)`` of numpy arrays: the
    ``binary_label`` of every comparison and the matching model
    probabilities. This is mainly used in computing the evaluation metric.
    """
    user_pred = []
    labels = []
    n_items = len(items)
    for i in itemsPerUser[user]:
        # With a single catalogue item equal to i there is nothing to compare to.
        if n_items == 0 or (n_items == 1 and items[0] == i):
            continue
        # Rejection sampling draws uniformly from the items other than i
        # without deep-copying and editing the whole item list per user.
        j = random.choice(items)
        while j == i:
            j = random.choice(items)
        user_pred.append(sigmoid(prediction(user, i, j)))
        # Label against this function's `user`; the original read the
        # unrelated notebook-level variable `u`.
        labels.append(binary_label(user, i, j))

    return np.array(labels), np.array(user_pred)

In [None]:
def auc_score():
    """Average per-user ranking score over all users with a non-empty library.

    For each user the score is the ROC AUC of ``predict_user``'s output.
    When the user's labels contain a single class, ROC AUC is undefined and
    plain accuracy of the rounded predictions is used instead.

    Returns
    -------
    float
        Mean of the per-user scores, or ``0.0`` when no user could be scored.
    """
    score = 0.0
    n_user = 0
    for user in tqdm(users):
        if len(itemsPerUser[user]) == 0: continue
        y_true, y_pred = predict_user(user)
        if len(y_true) == 0: continue
        n_user += 1
        # The original tested `len(np.unique(y_true))`, which is truthy for
        # any non-empty label array, so the ROC AUC branch never ran.
        if len(np.unique(y_true)) < 2:
            score += accuracy_score(y_true, np.rint(y_pred))
        else:
            score += roc_auc_score(y_true, y_pred)
    if n_user == 0:
        return 0.0
    return score / n_user

In [None]:
auc_score()

In [11]:
# Stack the fitted parameters of the sampled users/items into arrays,
# one row (or entry) per key in the sample indexes' iteration order.
sample_user_factors = np.array([userGamma[u] for u in sampleItemsPerUser])
sample_item_factors = np.array([itemGamma[i] for i in sampleUsersPerItem])
# Here `u` iterates item ids (the keys of sampleUsersPerItem), not users.
sample_item_biases = np.array([itemBiases[u] for u in sampleUsersPerItem])

NameError: name 'sampleItemsPerUser' is not defined

In [None]:
# NOTE(review): three column names are supplied for the transposed factor
# matrix; with K = 10 the transpose has one column per sampled item, so this
# only matches when exactly three items are sampled -- confirm the intent.
# `df` is reassigned in the plotting cell further down.
feat_cols = ["Played", "Purchased", "New"]
df = pd.DataFrame(sample_item_factors.T ,columns=feat_cols)

In [None]:
# Project the item factor vectors to 2-D with t-SNE and time the run.
time_start = time.time()
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(sample_item_factors)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 5368 samples in 0.012s...
[t-SNE] Computed neighbors for 5368 samples in 0.971s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5368
[t-SNE] Computed conditional probabilities for sample 2000 / 5368
[t-SNE] Computed conditional probabilities for sample 3000 / 5368
[t-SNE] Computed conditional probabilities for sample 4000 / 5368
[t-SNE] Computed conditional probabilities for sample 5000 / 5368
[t-SNE] Computed conditional probabilities for sample 5368 / 5368
[t-SNE] Mean sigma: 0.000000


In [None]:
# Scatter plot of the 2-D t-SNE embedding of the item factors.
df = pd.DataFrame(tsne_results, columns=["tsne-2d-one", "tsne-2d-two"])
plt.figure(figsize=(16,10))
# The frame holds only the two embedding columns, so the original
# hue="y" (with its palette and legend) pointed at a column that does not
# exist; the points are drawn without a colour grouping.
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    data=df,
    alpha=0.3
)