In [1]:
import json
import pickle as pkl
import operator
import time
from collections import Counter
from itertools import product
import random

import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank

%run '../lib/cookbook/recsys.py'
%run '../lib/cookbook/generic_preprocessing.py'
%run '../lib/utility.py'

from IPython.display import HTML ## Setting display options for Ipython Notebook

## Create cross fold splits

In [5]:
def create_cross_fold_splits(train_data, n_splits=5, random_state=None):
    """Split interaction rows into shuffled K folds of sparse train/validation matrices.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Interaction rows with 'user', 'item' and 'rating' columns.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to the fold shuffler. ``None`` keeps the original
        (non-reproducible) shuffling behaviour.

    Returns
    -------
    list
        One ``[sparse_train, sparse_val]`` pair of CSR matrices per fold. Both
        matrices of a pair share the same shape.
    """
    # Every row gets the same dummy label, so StratifiedKFold behaves like a
    # plain shuffled K-fold over the interaction rows.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fake_y = [1] * train_data.shape[0]

    cross_val_splits = []
    for train_index, val_index in skf.split(train_data, fake_y):
        X_train, X_val = train_data.iloc[train_index], train_data.iloc[val_index]
        interactions_train = create_interaction_matrix(df=X_train,
                                                       user_col='user',
                                                       item_col='item',
                                                       rating_col='rating',
                                                       threshold='1')
        interactions_val = create_interaction_matrix(df=X_val,
                                                     user_col='user',
                                                     item_col='item',
                                                     rating_col='rating',
                                                     threshold='1')

        # The two matrices are pivoted independently, so a user or item that
        # only occurs in one subset would give them different shapes. Put both
        # on the same user/item axes so row i / column j mean the same thing.
        # Assumes create_interaction_matrix returns a DataFrame indexed by user
        # with one column per item -- TODO confirm against lib/cookbook/recsys.py.
        users = interactions_train.index.union(interactions_val.index)
        items = interactions_train.columns.union(interactions_val.columns)
        interactions_train = interactions_train.reindex(index=users, columns=items, fill_value=0)
        interactions_val = interactions_val.reindex(index=users, columns=items, fill_value=0)

        # `csr_matrix` is the name imported at the top of the notebook; the
        # bare `sparse` module is not imported there.
        sparse_train = csr_matrix(interactions_train.values)
        sparse_val = csr_matrix(interactions_val.values)
        cross_val_splits.append([sparse_train, sparse_val])
    return cross_val_splits

In [6]:
def build_cross_val_file(train_data, path, num):
    """Generate cross-validation splits for `train_data` and pickle them.

    The splits are written to ``<path>cross_val_splits_model_<num>.pkl``;
    `path` is used as a plain string prefix, so it must end with a path
    separator.
    """
    splits = create_cross_fold_splits(train_data)
    target = path + 'cross_val_splits_model_' + str(num) + '.pkl'
    with open(target, 'wb') as handle:
        pkl.dump(splits, handle)

In [8]:
## All Games
# Load the three preprocessed "all games" datasets. Context managers close each
# pickle file as soon as it has been read.
with open('../data/preprocessed_data/all_games/user_top_games.pkl','rb') as pkl_file:
    user_top_games = pkl.load(pkl_file)
with open('../data/preprocessed_data/all_games/user_top_games_filtered_hours.pkl','rb') as pkl_file:
    user_top_games_filtered_hours = pkl.load(pkl_file)
with open('../data/preprocessed_data/all_games/user_top_games_filtered_percentile.pkl','rb') as pkl_file:
    user_top_games_filtered_percentile = pkl.load(pkl_file)

# One cross-validation split file per dataset variant (model numbers 1-3).
build_cross_val_file(user_top_games,'../data/cross_val_splits/all_games/', 1)
build_cross_val_file(user_top_games_filtered_hours,'../data/cross_val_splits/all_games/', 2)
build_cross_val_file(user_top_games_filtered_percentile,'../data/cross_val_splits/all_games/', 3)

In [9]:
## No Free Games
# Load the three preprocessed "no free games" datasets. Context managers close
# each pickle file as soon as it has been read.
# NOTE: these assignments reuse the variable names of the "all games" cell, so
# after this cell runs `user_top_games` refers to the no-free-games data.
with open('../data/preprocessed_data/no_free_games/user_top_games.pkl','rb') as pkl_file:
    user_top_games = pkl.load(pkl_file)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl','rb') as pkl_file:
    user_top_games_filtered_hours = pkl.load(pkl_file)
with open('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl','rb') as pkl_file:
    user_top_games_filtered_percentile = pkl.load(pkl_file)

# One cross-validation split file per dataset variant (model numbers 1-3).
build_cross_val_file(user_top_games,'../data/cross_val_splits/no_free_games/', 1)
build_cross_val_file(user_top_games_filtered_hours,'../data/cross_val_splits/no_free_games/', 2)
build_cross_val_file(user_top_games_filtered_percentile,'../data/cross_val_splits/no_free_games/', 3)

# Random Search for Hyperparameters

## Reload Cross Validation Data

In [None]:
## All Games
# Reload the pickled cross-validation splits; each file is closed after reading.
# NOTE: the "No Free Games" reload cell assigns the same three variable names,
# so whichever of the two cells runs last decides which dataset is searched.
with open('../data/cross_val_splits/all_games/cross_val_splits_model_1.pkl', 'rb') as pkl_file:
    cross_val_splits_model_1 = pkl.load(pkl_file)
with open('../data/cross_val_splits/all_games/cross_val_splits_model_2.pkl', 'rb') as pkl_file:
    cross_val_splits_model_2 = pkl.load(pkl_file)
with open('../data/cross_val_splits/all_games/cross_val_splits_model_3.pkl', 'rb') as pkl_file:
    cross_val_splits_model_3 = pkl.load(pkl_file)

In [2]:
## No Free Games
# Reload the pickled cross-validation splits; each file is closed after reading.
# NOTE: the "All Games" reload cell assigns the same three variable names, so
# whichever of the two cells runs last decides which dataset is searched.
with open('../data/cross_val_splits/no_free_games/cross_val_splits_model_1.pkl', 'rb') as pkl_file:
    cross_val_splits_model_1 = pkl.load(pkl_file)
with open('../data/cross_val_splits/no_free_games/cross_val_splits_model_2.pkl', 'rb') as pkl_file:
    cross_val_splits_model_2 = pkl.load(pkl_file)
with open('../data/cross_val_splits/no_free_games/cross_val_splits_model_3.pkl', 'rb') as pkl_file:
    cross_val_splits_model_3 = pkl.load(pkl_file)

## Train and Validate

In [2]:
def create_combinations(inp):
    """Expand a mapping of option lists into the full grid of parameter dicts.

    `inp` maps each parameter name to an iterable of candidate values. The
    result holds one dict per element of the Cartesian product of those
    iterables, in the order produced by `itertools.product`.
    """
    names = list(inp.keys())
    grid = []
    for chosen_values in product(*inp.values()):
        grid.append(dict(zip(names, chosen_values)))
    return grid

In [3]:
# Search space for the LightFM random search: every key is a LightFM
# constructor argument and every list holds the candidate values to combine.
# NOTE(review): per the LightFM documentation `learning_rate` only affects the
# adagrad schedule while `rho`/`epsilon` only affect adadelta, so many grid
# points differ only in a setting their schedule ignores -- confirm whether the
# grid should be split per schedule.
possible_parameters = {
    'no_components': [20, 30, 40, 50, 60, 70, 80],
    'learning_schedule': ['adagrad', 'adadelta'],
    'loss': ['bpr', 'warp'],
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'rho': [0.99, 0.97, 0.95, 0.92, 0.90, 0.87, 0.85, 0.82, 0.80],
    'epsilon': [1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
    'item_alpha': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
    'user_alpha': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
    'random_state': [1337],
}
parameter_combinations = create_combinations(possible_parameters)

In [5]:
# Size of the full hyperparameter grid; the random search only samples a
# handful of these combinations per run.
len(parameter_combinations)

246960

In [12]:
def hyperparameter_validation(cross_val_splits, parameter_combinations, conn, table_name, model_number=1, n_iter=10, epochs=15, free_games='False', num_threads=8, verbose=False):
    """Random-search LightFM hyperparameters with cross-validation and log results to SQL.

    For each sampled parameter set a fresh LightFM model is trained on every
    fold's training matrix and scored with AUC on the fold's validation matrix
    (the training matrix is passed to `auc_score` as the train interactions).
    The mean AUC over folds is appended, together with the model parameters,
    as one row to `table_name`.

    Parameters
    ----------
    cross_val_splits : sequence
        Folds, each indexable as ``[train_matrix, validation_matrix]``.
    parameter_combinations : list of dict
        Candidate keyword-argument dicts for the ``LightFM`` constructor.
    conn : SQL connection or engine accepted by ``DataFrame.to_sql``.
    table_name : str
        Table that receives one appended row per evaluated parameter set.
    model_number : int
        Identifier stored in the ``model_number`` column.
    n_iter : int
        Number of parameter sets to sample. If it exceeds the number of
        available combinations, every combination is evaluated once.
    epochs : int
        Training epochs per fold.
    free_games : str
        Value stored in the ``free`` column.
    num_threads : int
        Threads used for both training and evaluation.
    verbose : bool
        When truthy, print the wall-clock time spent on each parameter set.

    Returns
    -------
    None
    """
    # random.sample raises ValueError when asked for more items than exist, so
    # cap the request at the size of the grid.
    n_samples = min(n_iter, len(parameter_combinations))

    for parameters in random.sample(parameter_combinations, n_samples):
        start_time = time.time()

        fold_aucs = []
        for split in cross_val_splits:
            train_interactions, val_interactions = split[0], split[1]
            model = LightFM(**parameters)
            model.fit(train_interactions,
                      epochs=epochs,
                      num_threads=num_threads)
            fold_auc = auc_score(model,
                                 val_interactions,
                                 train_interactions,
                                 num_threads=num_threads).mean()
            fold_aucs.append(fold_auc)

        # An untrained model is enough to obtain the complete parameter set
        # (including LightFM defaults) for the results row.
        params = LightFM(**parameters).get_params()
        params['model_number'] = model_number
        params['auc'] = np.mean(fold_aucs)
        # Store the seed that was actually requested instead of a hard-coded
        # constant; the `random_state` entry from get_params() is overwritten
        # here because only a plain integer (or NULL) is written to the table.
        seed = parameters.get('random_state')
        params['random_state'] = seed if isinstance(seed, (int, np.integer)) else None
        params['epochs'] = epochs
        params['free'] = free_games

        df = pd.DataFrame(params, index=[0])
        df.to_sql(name=table_name, con=conn, if_exists='append')

        end_time = time.time()
        if verbose:
            print('Training complete for one set of parameters. Time taken: {}'.format(str(end_time-start_time)))
    return None

In [6]:
# Database connection handed to hyperparameter_validation for writing results.
# NOTE(review): create_sqlalchemy_connection is not defined in this notebook --
# presumably provided by one of the %run'd helper scripts; confirm.
sqlalchemy_conn = create_sqlalchemy_connection('sqlalchemy_conn_str.txt')

### Model 1 Search

In [7]:
# Random search for model 1: 20 sampled parameter sets, 15 epochs per fold,
# results appended to the `validation_metrics` table.
hyperparameter_validation(
    cross_val_splits=cross_val_splits_model_1,
    parameter_combinations=parameter_combinations,
    conn=sqlalchemy_conn,
    table_name='validation_metrics',
    model_number=1,
    n_iter=20,
    epochs=15,
    free_games='False',
    num_threads=20,
    verbose=True,
)

Training complete for one set of parameters. Time taken: 104.06417417526245
Training complete for one set of parameters. Time taken: 71.89473414421082
Training complete for one set of parameters. Time taken: 87.14681625366211
Training complete for one set of parameters. Time taken: 71.83574271202087
Training complete for one set of parameters. Time taken: 291.8480086326599
Training complete for one set of parameters. Time taken: 87.18681764602661
Training complete for one set of parameters. Time taken: 65.22408413887024
Training complete for one set of parameters. Time taken: 128.7828505039215
Training complete for one set of parameters. Time taken: 229.5739369392395
Training complete for one set of parameters. Time taken: 93.4676730632782
Training complete for one set of parameters. Time taken: 736.2447474002838
Training complete for one set of parameters. Time taken: 291.1644949913025
Training complete for one set of parameters. Time taken: 93.49912333488464
Training complete for one

### Model 2 Search

In [13]:
# Random search for model 2: 20 sampled parameter sets, 15 epochs per fold,
# results appended to the `validation_metrics` table.
hyperparameter_validation(
    cross_val_splits=cross_val_splits_model_2,
    parameter_combinations=parameter_combinations,
    conn=sqlalchemy_conn,
    table_name='validation_metrics',
    model_number=2,
    n_iter=20,
    epochs=15,
    free_games='False',
    num_threads=20,
    verbose=True,
)

Training complete for one set of parameters. Time taken: 65.15485620498657
Training complete for one set of parameters. Time taken: 61.00955295562744
Training complete for one set of parameters. Time taken: 67.28420448303223
Training complete for one set of parameters. Time taken: 66.97687268257141
Training complete for one set of parameters. Time taken: 57.73127579689026
Training complete for one set of parameters. Time taken: 471.49334144592285
Training complete for one set of parameters. Time taken: 55.734097480773926
Training complete for one set of parameters. Time taken: 59.06593632698059
Training complete for one set of parameters. Time taken: 46.89036250114441
Training complete for one set of parameters. Time taken: 397.27613401412964
Training complete for one set of parameters. Time taken: 807.9153068065643
Training complete for one set of parameters. Time taken: 612.6887094974518
Training complete for one set of parameters. Time taken: 53.10526204109192
Training complete for 

### Model 3 Search

In [None]:
# Random search for model 3: 20 sampled parameter sets, 15 epochs per fold,
# results appended to the `validation_metrics` table.
hyperparameter_validation(
    cross_val_splits=cross_val_splits_model_3,
    parameter_combinations=parameter_combinations,
    conn=sqlalchemy_conn,
    table_name='validation_metrics',
    model_number=3,
    n_iter=20,
    epochs=15,
    free_games='False',
    num_threads=20,
    verbose=True,
)

Training complete for one set of parameters. Time taken: 419.403525352478
Training complete for one set of parameters. Time taken: 56.449491024017334
Training complete for one set of parameters. Time taken: 417.70055317878723
Training complete for one set of parameters. Time taken: 67.99230790138245
Training complete for one set of parameters. Time taken: 58.73055338859558


## Query Results

In [10]:
# Show the logged WARP runs for model 1, best mean validation AUC first.
sqlalchemy_conn = create_sqlalchemy_connection('sqlalchemy_conn_str.txt')
warp_results_query = "SELECT * FROM validation_metrics WHERE model_number = 1 and loss = 'warp' ORDER BY auc DESC"
pd.read_sql(warp_results_query, con=sqlalchemy_conn)

Unnamed: 0,index,loss,learning_schedule,no_components,learning_rate,k,n,rho,epsilon,max_sampled,item_alpha,user_alpha,random_state,model_number,auc,epochs,free
0,0,warp,adadelta,50,0.005,5,10,0.97,1e-07,10,0.0005,0.0001,1337,1,0.909157,15,False
1,0,warp,adagrad,40,0.05,5,10,0.92,1e-05,10,0.001,0.0001,1337,1,0.900704,15,False
2,0,warp,adagrad,70,0.005,5,10,0.95,1e-06,10,0.0001,0.0001,1337,1,0.851992,15,False
3,0,warp,adadelta,60,0.01,5,10,0.99,1e-05,10,0.001,0.0001,1337,1,0.836723,15,False
4,0,warp,adadelta,40,0.05,5,10,0.87,1e-05,10,0.005,0.01,1337,1,0.834655,15,False
5,0,warp,adadelta,50,0.005,5,10,0.97,1e-06,10,0.0005,0.1,1337,1,0.833525,15,False
6,0,warp,adadelta,40,0.005,5,10,0.99,1e-06,10,0.0001,0.005,1337,1,0.833072,15,False
7,0,warp,adadelta,40,0.05,5,10,0.97,1e-06,10,0.001,0.1,1337,1,0.832792,15,False
8,0,warp,adadelta,60,0.001,5,10,0.82,1e-05,10,0.0005,0.005,1337,1,0.832099,15,False
9,0,warp,adadelta,60,0.005,5,10,0.95,1e-06,10,0.005,0.005,1337,1,0.832026,15,False


## Train and Test

In [None]:
# Hold out 20% of the interaction rows as the final test set; the fixed seed
# makes the split repeatable.
# NOTE(review): `user_top_games` is assigned by both data-loading cells above
# (all games / no free games), so this uses whichever cell ran last -- confirm
# the intended dataset.
train_val, test = train_test_split(user_top_games, test_size=0.2, random_state=1337)

In [None]:
# User x item interaction matrix built from the train+validation rows.
interactions_train_all = create_interaction_matrix(df = train_val,
                                             user_col = 'user',
                                             item_col = 'item',
                                             rating_col = 'rating',
                                             threshold = '1')
# Sparse version consumed by LightFM in the fit/evaluation cells below; the
# notebook referenced `sparse_train_all` without ever defining it.
sparse_train_all = csr_matrix(interactions_train_all.values)

In [None]:
# User x item interaction matrix built from the held-out test rows.
interactions_test = create_interaction_matrix(df = test,
                                             user_col = 'user',
                                             item_col = 'item',
                                             rating_col = 'rating',
                                             threshold = '1')
# The evaluation cell below uses `sparse_test`, which the notebook never
# defined. LightFM needs the test matrix to have the same shape and the same
# user/item ordering as the training matrix, so project the test interactions
# onto the training axes (users/items unseen in training are dropped, missing
# cells become 0).
# Assumes create_interaction_matrix returns a DataFrame indexed by user with
# one column per item -- TODO confirm against lib/cookbook/recsys.py.
interactions_test_aligned = interactions_test.reindex(index=interactions_train_all.index,
                                                      columns=interactions_train_all.columns,
                                                      fill_value=0)
sparse_test = csr_matrix(interactions_test_aligned.values)

In [None]:
# Parameters for the final model; they are unpacked into the LightFM
# constructor in the next cell.
# NOTE(review): query_best_parameters is not defined in this notebook --
# presumably it reads the best row from the validation results; confirm that it
# returns only valid LightFM constructor arguments.
best_parameters = query_best_parameters('sqlalchemy_conn_str.txt')

In [23]:
# Fit the final model on the full train+validation interactions.
model = LightFM(**best_parameters)
model.fit(sparse_train_all, epochs=15, num_threads=8)

<lightfm.lightfm.LightFM at 0x7f4740f1a898>

In [25]:
# Mean per-user AUC on the training data and on the held-out test data; the
# training interactions are passed for the test score so that known training
# positives are not counted against the model.
train_auc = auc_score(model, sparse_train_all).mean()
test_auc = auc_score(model, sparse_test, train_interactions=sparse_train_all).mean()
print('AUC: train %.3f, test %.3f.' % (train_auc, test_auc))

AUC: train 0.852, test 0.844.
