In [1]:
import json
import pickle as pkl
import operator
import time
from collections import Counter
from itertools import product
import random

import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

import psycopg2 as pg

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank

%run '../lib/cookbook/recsys.py'
%run '../lib/cookbook/generic_preprocessing.py'
%run '../lib/utility.py'

import matplotlib.pyplot as plt
from IPython.display import HTML ## Setting display options for Ipython Notebook

# Analysis

# Comparing No Free Games vs All Games models

### Reload Data

In [2]:
## All Games
# Load the preprocessed "all games" artefacts. Each file is opened in a `with` block so its
# handle is closed as soon as the object has been read (the bare `pkl.load(open(...))` form
# leaves every handle open until garbage collection).
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced itself.
with open('../data/preprocessed_data/all_games/user_top_games.pkl', 'rb') as fh:
    user_top_games_all = pkl.load(fh)
with open('../data/preprocessed_data/all_games/user_top_games_filtered_hours.pkl', 'rb') as fh:
    user_top_games_filtered_hours_all = pkl.load(fh)
with open('../data/preprocessed_data/all_games/user_top_games_filtered_hours_0.pkl', 'rb') as fh:
    user_top_games_filtered_hours_0_all = pkl.load(fh)
with open('../data/preprocessed_data/all_games/user_top_games_filtered_percentile.pkl', 'rb') as fh:
    user_top_games_filtered_percentile_all = pkl.load(fh)
with open('../data/preprocessed_data/all_games/games.pkl', 'rb') as fh:
    games_all = pkl.load(fh)

In [3]:
## No Free Games
# Load the preprocessed "no free games" artefacts. Each file is opened in a `with` block so its
# handle is closed as soon as the object has been read (the bare `pkl.load(open(...))` form
# leaves every handle open until garbage collection).
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced itself.
with open('../data/preprocessed_data/no_free_games/user_top_games.pkl', 'rb') as fh:
    user_top_games = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl', 'rb') as fh:
    user_top_games_filtered_hours = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours_0.pkl', 'rb') as fh:
    user_top_games_filtered_hours_0 = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl', 'rb') as fh:
    user_top_games_filtered_percentile = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/games.pkl', 'rb') as fh:
    games = pkl.load(fh)

### All Games

In [5]:
# Repeated hold-out evaluation on the "all games" data: 30 random 80/20 splits, a fresh
# LightFM model per split, and the mean test AUC of every run collected in `auc_ag`.

# query best parameters -- the lookup does not depend on the split, so it is done once up
# front instead of once per iteration (presumably a database round trip each time)
sqlalchemy_conn = create_sqlalchemy_connection('sqlalchemy_conn_str.txt')
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')

auc_ag = []
for i in range(30):
    # train test split; the iteration number seeds the shuffle so the 30 splits differ
    # from each other but come out the same on every re-run of this cell
    train_val, test = train_test_split(user_top_games_all, test_size=0.2, random_state=i)
    interactions_train_all_ag = create_interaction_matrix(df = train_val,
                                                     user_col = 'user',
                                                     item_col = 'item',
                                                     rating_col = 'rating',
                                                     threshold = '1')
    interactions_test_ag = create_interaction_matrix(df = test,
                                             user_col = 'user',
                                             item_col = 'item',
                                             rating_col = 'rating',
                                             threshold = '1')
    # NOTE(review): the two matrices are built independently; auc_score expects them to
    # share one shape and row/column order -- confirm every user and item lands in both halves.
    sparse_train_all_ag = sparse.csr_matrix(interactions_train_all_ag.values)
    sparse_test_ag = sparse.csr_matrix(interactions_test_ag.values)
    # train and record auc
    model = LightFM(**best_parameters)
    model.fit(sparse_train_all_ag,
            epochs=15,
            num_threads=30)
    # the training matrix is passed as train_interactions alongside the test matrix
    auc = auc_score(model, sparse_test_ag, sparse_train_all_ag, num_threads=30).mean()
    auc_ag.append(auc)

In [7]:
# Average test AUC over the repeated "all games" splits.
np.asarray(auc_ag).mean()

0.9089915

### Free Games Removed

In [8]:
# Repeated hold-out evaluation on the "no free games" data: 30 random 80/20 splits, a fresh
# LightFM model per split, and the mean test AUC of every run collected in `auc_nf`.
# `best_parameters` is not set in this cell; it must already be in the kernel namespace.
auc_nf = []
for i in range(30):
    # the iteration number seeds the shuffle so the 30 splits differ from each other
    # but come out the same on every re-run of this cell
    train_val, test = train_test_split(user_top_games, test_size=0.2, random_state=i)
    interactions_train_all_nf = create_interaction_matrix(df = train_val,
                                                     user_col = 'user',
                                                     item_col = 'item',
                                                     rating_col = 'rating',
                                                     threshold = '1')
    interactions_test_nf = create_interaction_matrix(df = test,
                                             user_col = 'user',
                                             item_col = 'item',
                                             rating_col = 'rating',
                                             threshold = '1')
    ## create sparse matrices from interactions
    # NOTE(review): the two matrices are built independently; auc_score expects them to
    # share one shape and row/column order -- confirm every user and item lands in both halves.
    sparse_train_all_nf = sparse.csr_matrix(interactions_train_all_nf.values)
    sparse_test_nf = sparse.csr_matrix(interactions_test_nf.values)
    # train and record auc
    model = LightFM(**best_parameters)
    model.fit(sparse_train_all_nf,
            epochs=15,
            num_threads=30)
    # the training matrix is passed as train_interactions alongside the test matrix
    auc = auc_score(model, sparse_test_nf, sparse_train_all_nf, num_threads=30).mean()
    auc_nf.append(auc)

In [12]:
# Average test AUC over the repeated "no free games" splits.
np.asarray(auc_nf).mean()

0.9139857

# Comparing Models 1, 2, 3, and 4

In [78]:
## No Free Games
# Reload the "no free games" frames so the model comparison below starts from the files on disk.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced itself.
with open('../data/preprocessed_data/no_free_games/user_top_games.pkl', 'rb') as fh:
    user_top_games = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl', 'rb') as fh:
    user_top_games_filtered_hours = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours_0.pkl', 'rb') as fh:
    user_top_games_filtered_hours_0 = pkl.load(fh)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl', 'rb') as fh:
    user_top_games_filtered_percentile = pkl.load(fh)

In [79]:
# Look up the hyper-parameters shared by every model below; `best_parameters` is later
# unpacked as keyword arguments into the LightFM constructor.
# NOTE(review): `sqlalchemy_conn` is not used again in the visible cells -- presumably kept
# for interactive use; confirm before removing.
sqlalchemy_conn = create_sqlalchemy_connection('sqlalchemy_conn_str.txt')
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')

### Model 1 

In [12]:
# Fixed-seed 80/20 split of the unfiltered rows. The splits for Models 2-4 are derived
# from the index labels of these two frames.
train_val_1, test_1 = train_test_split(
    user_top_games,
    test_size=0.2,
    random_state=1337,
)

In [13]:
# Model 1 training interactions, then the CSR matrix handed to LightFM.
interactions_train_all_1 = create_interaction_matrix(
    df=train_val_1,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_train_1 = sparse.csr_matrix(interactions_train_all_1.values)

In [14]:
# Model 1 held-out interactions, then the CSR matrix used for scoring.
interactions_test_1 = create_interaction_matrix(
    df=test_1,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_test_1 = sparse.csr_matrix(interactions_test_1.values)

In [15]:
# Model 1: the queried hyper-parameters fitted on the unfiltered training matrix.
model_1 = LightFM(**best_parameters)
model_1.fit(sparse_train_1, epochs=15, num_threads=30)

<lightfm.lightfm.LightFM at 0x7f7dcf79b6d8>

### Model 2

In [16]:
# Model 2 ratings: keep every (user, item) row of `user_top_games`, take the rating from the
# filtered frame, and use 0 where the pair is missing from it.
#
# The following cell picks train/test rows by the index labels of the Model 1 split, so the
# merged frame has to line up row-for-row with `user_top_games`:
#   * validate='many_to_one' makes pandas raise if the filtered frame repeats a (user, item)
#     key, which would otherwise silently add rows;
#   * a left merge keeps the left row order but hands back a fresh 0..n-1 index, so the
#     original index is put back explicitly (a no-op when it already is 0..n-1).
user_top_games_filtered_hours_0 = (
    user_top_games
    .merge(user_top_games_filtered_hours_0, how='left', on=['user', 'item'], validate='many_to_one')
    .drop(columns='rating_x')
    .rename(columns={'rating_y': 'rating'})
    .fillna({'rating': 0})
)
user_top_games_filtered_hours_0.index = user_top_games.index

In [17]:
# Reuse the Model 1 split: keep the rows whose index label is in Model 1's train / test frame.
train_val_2 = user_top_games_filtered_hours_0[
    user_top_games_filtered_hours_0.index.isin(train_val_1.index)
]
test_2 = user_top_games_filtered_hours_0[
    user_top_games_filtered_hours_0.index.isin(test_1.index)
]

In [18]:
# Model 2 training interactions, then the CSR matrix handed to LightFM.
interactions_train_all_2 = create_interaction_matrix(
    df=train_val_2,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_train_2 = sparse.csr_matrix(interactions_train_all_2.values)

In [19]:
# Model 2 held-out interactions, then the CSR matrix used for scoring.
interactions_test_2 = create_interaction_matrix(
    df=test_2,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_test_2 = sparse.csr_matrix(interactions_test_2.values)

In [20]:
# Model 2: same hyper-parameters, fitted on the `filtered_hours_0` training matrix.
model_2 = LightFM(**best_parameters)
model_2.fit(sparse_train_2, epochs=15, num_threads=30)

<lightfm.lightfm.LightFM at 0x7f7dc0e7c2e8>

### Model 3

In [21]:
# Model 3 ratings: keep every (user, item) row of `user_top_games`, take the rating from the
# filtered frame, and use 0 where the pair is missing from it.
#
# The following cell picks train/test rows by the index labels of the Model 1 split, so the
# merged frame has to line up row-for-row with `user_top_games`:
#   * validate='many_to_one' makes pandas raise if the filtered frame repeats a (user, item)
#     key, which would otherwise silently add rows;
#   * a left merge keeps the left row order but hands back a fresh 0..n-1 index, so the
#     original index is put back explicitly (a no-op when it already is 0..n-1).
user_top_games_filtered_hours = (
    user_top_games
    .merge(user_top_games_filtered_hours, how='left', on=['user', 'item'], validate='many_to_one')
    .drop(columns='rating_x')
    .rename(columns={'rating_y': 'rating'})
    .fillna({'rating': 0})
)
user_top_games_filtered_hours.index = user_top_games.index

In [22]:
# Reuse the Model 1 split: keep the rows whose index label is in Model 1's train / test frame.
train_val_3 = user_top_games_filtered_hours[
    user_top_games_filtered_hours.index.isin(train_val_1.index)
]
test_3 = user_top_games_filtered_hours[
    user_top_games_filtered_hours.index.isin(test_1.index)
]

In [23]:
# Model 3 training interactions, then the CSR matrix handed to LightFM.
interactions_train_all_3 = create_interaction_matrix(
    df=train_val_3,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_train_3 = sparse.csr_matrix(interactions_train_all_3.values)

In [24]:
# Model 3 held-out interactions, then the CSR matrix used for scoring.
interactions_test_3 = create_interaction_matrix(
    df=test_3,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_test_3 = sparse.csr_matrix(interactions_test_3.values)

In [25]:
# Model 3: same hyper-parameters, fitted on the `filtered_hours` training matrix.
model_3 = LightFM(**best_parameters)
model_3.fit(sparse_train_3, epochs=15, num_threads=30)

<lightfm.lightfm.LightFM at 0x7f7dc133a4a8>

### Model 4

In [26]:
# Model 4 ratings: keep every (user, item) row of `user_top_games`, take the rating from the
# filtered frame, and use 0 where the pair is missing from it.
#
# The following cell picks train/test rows by the index labels of the Model 1 split, so the
# merged frame has to line up row-for-row with `user_top_games`:
#   * validate='many_to_one' makes pandas raise if the filtered frame repeats a (user, item)
#     key, which would otherwise silently add rows;
#   * a left merge keeps the left row order but hands back a fresh 0..n-1 index, so the
#     original index is put back explicitly (a no-op when it already is 0..n-1).
user_top_games_filtered_percentile = (
    user_top_games
    .merge(user_top_games_filtered_percentile, how='left', on=['user', 'item'], validate='many_to_one')
    .drop(columns='rating_x')
    .rename(columns={'rating_y': 'rating'})
    .fillna({'rating': 0})
)
user_top_games_filtered_percentile.index = user_top_games.index

In [27]:
# Reuse the Model 1 split: keep the rows whose index label is in Model 1's train / test frame.
train_val_4 = user_top_games_filtered_percentile[
    user_top_games_filtered_percentile.index.isin(train_val_1.index)
]
test_4 = user_top_games_filtered_percentile[
    user_top_games_filtered_percentile.index.isin(test_1.index)
]

In [28]:
# Model 4 training interactions, then the CSR matrix handed to LightFM.
interactions_train_all_4 = create_interaction_matrix(
    df=train_val_4,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_train_4 = sparse.csr_matrix(interactions_train_all_4.values)

In [29]:
# Model 4 held-out interactions, then the CSR matrix used for scoring.
interactions_test_4 = create_interaction_matrix(
    df=test_4,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_test_4 = sparse.csr_matrix(interactions_test_4.values)

In [30]:
# Model 4: same hyper-parameters, fitted on the `filtered_percentile` training matrix.
model_4 = LightFM(**best_parameters)
model_4.fit(sparse_train_4, epochs=15, num_threads=30)

<lightfm.lightfm.LightFM at 0x7f7dce85b550>

## Compare performance on each other's interactions

In [31]:
# Cross-evaluation grid: each fitted model is scored against the (test, train) matrices of
# every model variant. `auc_<m>_<d>` is model m scored on variant d's matrices, averaged
# over the per-user AUC values.
eval_pairs = (
    (sparse_test_1, sparse_train_1),
    (sparse_test_2, sparse_train_2),
    (sparse_test_3, sparse_train_3),
    (sparse_test_4, sparse_train_4),
)

auc_1_1, auc_1_2, auc_1_3, auc_1_4 = (
    auc_score(model_1, held_out, seen, num_threads=30).mean() for held_out, seen in eval_pairs
)

auc_2_1, auc_2_2, auc_2_3, auc_2_4 = (
    auc_score(model_2, held_out, seen, num_threads=30).mean() for held_out, seen in eval_pairs
)

auc_3_1, auc_3_2, auc_3_3, auc_3_4 = (
    auc_score(model_3, held_out, seen, num_threads=30).mean() for held_out, seen in eval_pairs
)

auc_4_1, auc_4_2, auc_4_3, auc_4_4 = (
    auc_score(model_4, held_out, seen, num_threads=30).mean() for held_out, seen in eval_pairs
)

In [32]:
# Report the grid one model per group; the extra '\n' leaves a blank line between groups.
print('model 1 embeddings vs model 1 interactions auc: ', auc_1_1, sep='')
print('model 1 embeddings vs model 2 interactions auc: ', auc_1_2, sep='')
print('model 1 embeddings vs model 3 interactions auc: ', auc_1_3, sep='')
print('model 1 embeddings vs model 4 interactions auc: ', auc_1_4, '\n', sep='')

print('model 2 embeddings vs model 1 interactions auc: ', auc_2_1, sep='')
print('model 2 embeddings vs model 2 interactions auc: ', auc_2_2, sep='')
print('model 2 embeddings vs model 3 interactions auc: ', auc_2_3, sep='')
print('model 2 embeddings vs model 4 interactions auc: ', auc_2_4, '\n', sep='')

print('model 3 embeddings vs model 1 interactions auc: ', auc_3_1, sep='')
print('model 3 embeddings vs model 2 interactions auc: ', auc_3_2, sep='')
print('model 3 embeddings vs model 3 interactions auc: ', auc_3_3, sep='')
print('model 3 embeddings vs model 4 interactions auc: ', auc_3_4, '\n', sep='')

print('model 4 embeddings vs model 1 interactions auc: ', auc_4_1, sep='')
print('model 4 embeddings vs model 2 interactions auc: ', auc_4_2, sep='')
print('model 4 embeddings vs model 3 interactions auc: ', auc_4_3, sep='')
print('model 4 embeddings vs model 4 interactions auc: ', auc_4_4, sep='')

model 1 embeddings vs model 1 interactions auc: 0.914515
model 1 embeddings vs model 2 interactions auc: 0.9073612
model 1 embeddings vs model 3 interactions auc: 0.91035223
model 1 embeddings vs model 4 interactions auc: 0.9058233

model 2 embeddings vs model 1 interactions auc: 0.8811896
model 2 embeddings vs model 2 interactions auc: 0.9106597
model 2 embeddings vs model 3 interactions auc: 0.9187266
model 2 embeddings vs model 4 interactions auc: 0.90898114

model 3 embeddings vs model 1 interactions auc: 0.8624471
model 3 embeddings vs model 2 interactions auc: 0.9014988
model 3 embeddings vs model 3 interactions auc: 0.91681266
model 3 embeddings vs model 4 interactions auc: 0.89959764

model 4 embeddings vs model 1 interactions auc: 0.8806335
model 4 embeddings vs model 2 interactions auc: 0.9099839
model 4 embeddings vs model 3 interactions auc: 0.9180439
model 4 embeddings vs model 4 interactions auc: 0.9084542


## Compare Random Split

In [55]:
# Share of Model 1's summed training rating that each filtered variant no longer has:
# 1 - (variant's summed rating / Model 1's summed rating).
rating_total_1 = train_val_1['rating'].sum()
ratio_2 = 1 - train_val_2['rating'].sum() / rating_total_1
ratio_3 = 1 - train_val_3['rating'].sum() / rating_total_1
ratio_4 = 1 - train_val_4['rating'].sum() / rating_total_1

In [71]:
# Random baseline: the same seeded 80/20 split as Model 1, followed by a second random
# hold-out of the training rows whose size is ratio_4.
# To size the hold-out like Model 2 or Model 3 instead, pass ratio_2 or ratio_3 as test_size.
train_val_r, test_r = train_test_split(
    user_top_games,
    test_size=0.2,
    random_state=1337,
)
train_r, val_r = train_test_split(
    train_val_r,
    test_size=ratio_4,
    random_state=1337,
)

In [72]:
# Random-baseline training interactions, then the CSR matrix handed to LightFM.
interactions_train_r = create_interaction_matrix(
    df=train_r,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_train_r = sparse.csr_matrix(interactions_train_r.values)

In [73]:
# Random-baseline held-out interactions and their CSR matrix.
interactions_test_r = create_interaction_matrix(
    df=test_r,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)
sparse_test_r = sparse.csr_matrix(interactions_test_r.values)

In [74]:
# Look up the hyper-parameters again so this section can be run without the earlier lookup cell;
# `best_parameters` is unpacked as keyword arguments into the LightFM constructor below.
# NOTE(review): `sqlalchemy_conn` is not used again in the visible cells -- confirm it is needed.
sqlalchemy_conn = create_sqlalchemy_connection('sqlalchemy_conn_str.txt')
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')

In [75]:
# Random baseline: same hyper-parameters, fitted on the randomly thinned training matrix.
model_r = LightFM(**best_parameters)
model_r.fit(sparse_train_r, epochs=15, num_threads=30)

<lightfm.lightfm.LightFM at 0x7f7dc133acf8>

In [76]:
# Score the random-baseline model against the (test, train) matrices of Models 1-4,
# in that order, averaging the per-user AUC values.
auc_r_1, auc_r_2, auc_r_3, auc_r_4 = (
    auc_score(model_r, held_out, seen, num_threads=30).mean()
    for held_out, seen in (
        (sparse_test_1, sparse_train_1),
        (sparse_test_2, sparse_train_2),
        (sparse_test_3, sparse_train_3),
        (sparse_test_4, sparse_train_4),
    )
)

In [77]:
# Report the random-baseline scores in the same layout as the cross-evaluation grid.
print('model r embeddings vs model 1 interactions auc: ', auc_r_1, sep='')
print('model r embeddings vs model 2 interactions auc: ', auc_r_2, sep='')
print('model r embeddings vs model 3 interactions auc: ', auc_r_3, sep='')
print('model r embeddings vs model 4 interactions auc: ', auc_r_4, sep='')

model r embeddings vs model 1 interactions auc: 0.9024247
model r embeddings vs model 2 interactions auc: 0.8963152
model r embeddings vs model 3 interactions auc: 0.8998613
model r embeddings vs model 4 interactions auc: 0.8946328


In [63]:
# Show the three removed-rating ratios, one per line.
print(ratio_2, ratio_3, ratio_4, sep='\n')

0.2982303411265329
0.4067730061125876
0.31137760323086094
