In [1]:
import json
import pickle as pkl
import operator
import time
from collections import Counter
from itertools import product
import random

import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank

%run '../lib/cookbook/recsys.py'
%run '../lib/cookbook/generic_preprocessing.py'
%run '../lib/utility.py'

from IPython.display import HTML ## Setting display options for Ipython Notebook

## Reload Data

In [2]:
## No Free Games
# Load the preprocessed "no free games" artefacts. Each file is opened in a
# context manager so its handle is closed as soon as the object is unpickled.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced locally by a trusted pipeline.
with open('../data/preprocessed_data/no_free_games/user_top_games.pkl', 'rb') as pkl_file:
    user_top_games = pkl.load(pkl_file)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl', 'rb') as pkl_file:
    user_top_games_filtered_hours = pkl.load(pkl_file)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl', 'rb') as pkl_file:
    user_top_games_filtered_percentile = pkl.load(pkl_file)
with open('../data/preprocessed_data/no_free_games/games.pkl', 'rb') as pkl_file:
    games = pkl.load(pkl_file)

## Train Test Split and Building Interaction Matrices

In [54]:
def _color_red_or_green(val):
    color = 'red' if val == 0 else 'green'
    return 'background: %s' % color

In [55]:
# Draw 10 random row labels (users) and 10 random column labels (items) from the
# full interaction matrix, in that order, for a quick visual spot check.
# NOTE(review): interactions_full_all is only created in a later cell of this
# notebook, and no random seed is set, so the sample differs on every run.
users, items = (
    random.sample(list(axis_labels), 10)
    for axis_labels in (interactions_full_all.index, interactions_full_all.columns)
)

In [56]:
# Keep only the sampled users (rows) and sampled items (columns) of the full matrix.
test = interactions_full_all.loc[
    interactions_full_all.index.isin(users),
    interactions_full_all.columns.isin(items),
]
# Last expression of the cell: render the slice with cells equal to 0 in red
# and every other cell in green.
test.style.applymap(_color_red_or_green)

item,Europa Universalis IV,Hurtworld,Interstellar Marines,ORION,Overlord: Raising Hell,Strike Suit Zero,Terraria,The Elder Scrolls IV: Oblivion,TowerFall Ascension,Valiant Hearts: The Great War™ / Soldats Inconnus : Mémoires de la Grande Guerre™
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
76561198040806617,0,0,0,0,0,0,0,0,0,0
76561198071059521,0,0,1,0,0,0,1,0,0,0
76561198077795250,0,0,0,0,0,0,0,0,0,0
76561198093085776,0,0,0,0,0,0,0,0,0,0
76561198096850767,0,0,0,1,0,0,1,0,0,0
Hueheueeheuee,0,0,0,0,0,0,0,0,0,0
Vladimirputinisgreatman,0,1,0,0,0,0,1,0,0,0
pinkie10,0,0,0,0,0,0,1,0,0,0
sonnymack,0,0,0,0,0,0,1,0,0,0
urami,0,0,0,0,0,0,0,0,0,0


In [3]:
# User x item interaction matrix built from every recorded interaction.
# NOTE(review): threshold is passed as the string '1'; create_interaction_matrix
# is defined outside this notebook — confirm it expects a string, not a number.
interactions_full_all = create_interaction_matrix(
    df=user_top_games,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)

In [4]:
# Interaction matrix built from the hours-filtered interactions only.
# NOTE(review): threshold is passed as the string '1'; create_interaction_matrix
# is defined outside this notebook — confirm it expects a string, not a number.
interactions_full_filtered_hours = create_interaction_matrix(
    df=user_top_games_filtered_hours,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)

In [5]:
# Interaction matrix built from the percentile-filtered interactions only.
# NOTE(review): threshold is passed as the string '1'; create_interaction_matrix
# is defined outside this notebook — confirm it expects a string, not a number.
interactions_full_filtered_percentile = create_interaction_matrix(
    df=user_top_games_filtered_percentile,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1',
)

In [18]:
# Lookup tables consumed by the recommendation helpers further down.
# user_dict is derived from the percentile-filtered interaction matrix.
user_dict = create_user_dict(
    interactions=interactions_full_filtered_percentile,
)
# id_col and name_col both point at 'item', so ids and names come from the same column.
games_dict = create_item_dict(
    df=games,
    id_col='item',
    name_col='item',
)

In [7]:
## create sparse matrices from interactions
# Use the `csr_matrix` name imported at the top of this notebook. The bare
# `sparse` module name is never imported here, so `sparse.csr_matrix` only
# resolves if one of the %run scripts happens to leave `sparse` in the namespace.
sparse_train = csr_matrix(interactions_full_all.values)
sparse_train_filtered_hours = csr_matrix(interactions_full_filtered_hours.values)
sparse_train_filtered_percentile = csr_matrix(interactions_full_filtered_percentile.values)

# Model Training

### Model 1 - Trained on all interactions

In [20]:
# Model 1: LightFM fitted on the matrix built from every interaction.
# Hyper-parameters are fetched by query_best_parameters (defined outside this
# notebook) and forwarded unchanged to the LightFM constructor.
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')
model_1 = LightFM(**best_parameters)

# Time only the fit() call.
start_time = time.time()
model_1.fit(sparse_train, epochs=15, num_threads=20)
end_time = time.time()

print('Time taken for model train: {}'.format(str(end_time - start_time)))

Time taken for model train: 40.51724934577942


In [22]:
# Training-set AUC for model 1, averaged over the scores returned by auc_score.
# Evaluate on `sparse_train`, the matrix model_1 is fitted on in the cell above;
# the name `sparse_train_all` is never defined in this notebook and raises
# NameError on a fresh kernel.
train_auc = auc_score(model_1, sparse_train, num_threads=8).mean()
print('AUC: train %.3f' % (train_auc))

AUC: train 0.858


### Model 3 - Only interactions with at least 30 minutes played

In [23]:
# Model 3: LightFM fitted on the hours-filtered interaction matrix.
# Hyper-parameters are fetched by query_best_parameters (defined outside this
# notebook) and forwarded unchanged to the LightFM constructor.
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')
model_3 = LightFM(**best_parameters)

# Time only the fit() call.
start_time = time.time()
model_3.fit(sparse_train_filtered_hours, epochs=15, num_threads=20)
end_time = time.time()

print('Time taken for model train: {}'.format(str(end_time - start_time)))

Time taken for model train: 18.25357723236084


In [24]:
# Training-set AUC for model 3: scored on the same matrix the model was fitted
# on, so this measures fit rather than held-out performance.
train_auc = auc_score(
    model_3,
    sparse_train_filtered_hours,
).mean()
print('AUC: train %.3f' % train_auc)

AUC: train 0.905


### Model 4 - Only interactions greater than 10th percentile of hours played

In [9]:
# Model 4: LightFM fitted on the percentile-filtered interaction matrix.
# Hyper-parameters are fetched by query_best_parameters (defined outside this
# notebook) and forwarded unchanged to the LightFM constructor.
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')
model_4 = LightFM(**best_parameters)

# Time only the fit() call.
start_time = time.time()
model_4.fit(sparse_train_filtered_percentile, epochs=15, num_threads=20)
end_time = time.time()

print('Time taken for model train: {}'.format(str(end_time - start_time)))

Time taken for model train: 16.19506573677063


In [10]:
# Training-set AUC for model 4: scored on the same matrix the model was fitted
# on, so this measures fit rather than held-out performance.
train_auc = auc_score(
    model_4,
    sparse_train_filtered_percentile,
).mean()
print('AUC: train %.3f' % train_auc)

AUC: train 0.957


# Output results

### Model 4

In [19]:
# Item-to-item recommendations from model 4: for every game, the 20 entries
# returned by item_item_recommendation, keyed by the game's 'item' value.
#
# The distance matrix is built against interactions_full_filtered_percentile,
# the matrix model_4 is fitted on, so item labels follow the same column order
# as the model's item embeddings. Passing interactions_full_all (as before) is
# only equivalent when both matrices have identical item columns.
# NOTE(review): assumes the helper uses `interactions` for item labels only —
# confirm against create_item_emdedding_distance_matrix in recsys.py.
item_item_dict = {}
item_item_dist_4 = create_item_emdedding_distance_matrix(
    model = model_4,
    interactions = interactions_full_filtered_percentile)
for i in games['item']:
    item_item_list_4 = item_item_recommendation(
        item_emdedding_distance_matrix = item_item_dist_4,
        item_id = i,
        item_dict = games_dict,
        n_items = 20,
        show = False)
    item_item_dict[i] = item_item_list_4

In [20]:
# User recommendations from model 4: 20 recommended items per user id.
# NOTE(review): user_dict is built from interactions_full_filtered_percentile,
# while `interactions` here is interactions_full_all — confirm both matrices
# share the same users and item columns, or that the difference is intended.
user_item_dict = {}
for i in user_dict.keys():
    user_item_list_4 = sample_recommendation_user(
        model = model_4,
        interactions = interactions_full_all,
        user_id = i,
        user_dict = user_dict,
        item_dict = games_dict,
        nrec_items = 20,
        num_threads = 30,
        show = False,
    )
    user_item_dict[i] = user_item_list_4

In [21]:
# Persist both recommendation dictionaries as JSON in the working directory.
# Context managers guarantee each file is flushed and closed once written;
# bare open() calls leave that to garbage collection.
with open('item_item_dict.json', 'w') as item_item_file:
    json.dump(item_item_dict, item_item_file)
with open('user_item_dict.json', 'w') as user_item_file:
    json.dump(user_item_dict, user_item_file)

### Combined Model (scrapped idea)

In [176]:
def combine_model(user_id, model_1, model_2, top_n=20, weight=0.5):
    """Blend the rankings of two models into one top-N game list for a user.

    Each model's top 1000 recommendations are fetched, every game is given its
    position in that list as a rank, and the two rankings are merged on the
    game name. Only games present in both lists survive the merge.

    Parameters
    ----------
    user_id : key of `user_dict` identifying the user to recommend for.
    model_1, model_2 : the two fitted models whose rankings are blended.
    top_n : number of games to return (default 20).
    weight : weight given to model_1's rank; model_2 gets ``1 - weight``.

    Returns
    -------
    list
        Up to `top_n` game names ordered by blended rank, ties broken by
        model_1's rank.

    Notes
    -----
    Reads the notebook globals `interactions_train_all`, `user_dict` and
    `games_dict`.
    NOTE(review): `interactions_train_all` is not created in the visible part
    of this notebook — confirm it exists before calling.
    NOTE(review): assumes sample_recommendation_user returns (game, score)
    pairs, as implied by the two-column DataFrame built from its result.
    """
    def _ranked_frame(model):
        # Top-1000 list for one model, with the list position stored as 'rank'.
        recommendations = sample_recommendation_user(
            model = model,
            interactions = interactions_train_all,
            user_id = user_id,
            user_dict = user_dict,
            item_dict = games_dict,
            nrec_items = 1000,
            show = False,
        )
        frame = pd.DataFrame(recommendations, columns=['game', 'score'])
        frame['rank'] = frame.index
        return frame

    ranked_first = _ranked_frame(model_1)
    ranked_second = _ranked_frame(model_2)

    # Overlapping columns get the default _x (model_1) / _y (model_2) suffixes.
    blended = ranked_first.merge(ranked_second, on='game')
    weighted_rank = weight * blended['rank_x'] + (1 - weight) * blended['rank_y']
    # The extra /2 only rescales the blended rank; it does not affect ordering.
    blended['new_rank'] = weighted_rank / 2

    ordered = blended.sort_values(['new_rank', 'rank_x'])
    return ordered['game'].values.tolist()[:top_n]

In [177]:
# Example: blended top-20 list for a single user, displayed as the cell output.
# NOTE(review): `model_2` is not created anywhere in the visible notebook —
# confirm it exists before re-running this cell.
combine_model(
    user_id='76561198071544284',
    model_1=model_1,
    model_2=model_2,
)

['Counter-Strike: Global Offensive',
 'Portal 2',
 'PAYDAY 2',
 'The Elder Scrolls V: Skyrim',
 'Borderlands 2',
 'Portal',
 'Just Cause 2',
 'PlanetSide 2',
 'Warframe',
 'Arma 2',
 'Saints Row: The Third',
 'Rust',
 'PAYDAY: The Heist',
 "Sid Meier's Civilization V",
 'Arma 2: Operation Arrowhead',
 'Killing Floor',
 'Heroes & Generals',
 'War Thunder',
 'Fallout: New Vegas',
 'Loadout']