In [None]:
import json
import pickle
import operator
import random
import math
import time
import datetime
from collections import Counter
from itertools import product
from copy import deepcopy
from IPython.display import display as dp

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import lightfm as LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank
from lightfm.data import Dataset
from lightfm import lightfm as lfm

In [None]:
import os
import sys
sys.path.append(os.path.abspath("../lib/cookbook/"))
from recsys import * ## recommender system cookbook
from generic_preprocessing import * ## pre-processing code

# Data Preprocessing

In [None]:
# Load the raw user/item records and the game catalogue. The JSON files are read
# inside `with` blocks so the file handles are closed as soon as parsing finishes.
with open('../data/fixed/australian_users_items_fixed.json', 'r') as items_file:
    aussie_items = json.load(items_file)
with open('../data/fixed/steam_games_fixed.json', 'r') as games_file:
    steam_games = json.load(games_file)
# The same game catalogue, loaded a second time as a DataFrame.
steam_games_df = pd.read_json('../data/fixed/steam_games_fixed.json', orient='columns')

## data processing utilities: user-item datasets

In [None]:
def build_list(user_item_data, _id=False):
    """Flatten user records into (user_id, item) pairs.

    Args:
        user_item_data: iterable of dicts, each with a 'user_id' and an 'items'
            list whose entries carry 'item_id' and 'item_name'.
        _id: when True the pair holds the item's 'item_id', otherwise its
            'item_name'.

    Returns:
        A list of (user_id, item) tuples, one per owned item, in input order.
    """
    item_key = 'item_id' if _id == True else 'item_name'
    return [(user['user_id'], item[item_key])
            for user in user_item_data
            for item in user['items']]


def build_df(user_item):
    """Pivot (user, item) pairs into a user x item ownership table.

    Duplicate pairs are collapsed first. In the result the index is 'user', the
    columns are 'item', owned cells hold 1 and every other cell holds 0.
    """
    ownership = (
        pd.DataFrame(user_item, columns=['user', 'item'])
        .drop_duplicates(['user', 'item'])
        .assign(own=1)
    )
    table = ownership.pivot(index='user', columns='item', values='own')
    return table.fillna(0)


def build_users(user_item_data, _id=False):
    """Flatten user records into per-(user, item) playtime rows.

    Args:
        user_item_data: iterable of dicts with 'user_id', 'items_count' and an
            'items' list; each item carries 'item_id', 'item_name',
            'playtime_forever' and 'playtime_2weeks'.
        _id: when True the row holds the item's 'item_id', otherwise its
            'item_name'.

    Returns:
        A list of tuples
        (user_id, items_count, item, playtime_forever, playtime_2weeks).
    """
    item_key = 'item_id' if _id == True else 'item_name'
    rows = []
    for user in user_item_data:
        rows.extend(
            (user['user_id'], user['items_count'], item[item_key],
             item['playtime_forever'], item['playtime_2weeks'])
            for item in user['items'])
    return rows


def build_users_df(user_feat):
    """Turn the rows produced by build_users into a feature DataFrame.

    Exact duplicate rows are dropped and 'item' is cast to str. Two derived
    integer columns are appended:
      * 'playhour' - playtime_forever / 60, truncated by the int cast
      * 'playtime' - playhour / 10, truncated by the int cast (coarser buckets)
    """
    col = [
        'user', 'items_count', 'item', 'playtime_forever', 'playtime_2weeks'
    ]
    return (
        pd.DataFrame(user_feat, columns=col)
        .drop_duplicates(col)
        .assign(item=lambda frame: frame['item'].astype(str))
        # minutes -> whole hours
        .assign(playhour=lambda frame: (frame['playtime_forever'] / 60).astype(int))
        # hours -> blocks of ten hours, to narrow the value range
        .assign(playtime=lambda frame: (frame['playhour'] / 10).astype(int))
    )

## data processing utilities: game datasets

In [None]:
#------------------------------------
# utilities - check number is NaN
#------------------------------------
def isNaN(num):
    """Return the result of `num != num`, which is True for a float NaN."""
    differs_from_itself = (num != num)
    return differs_from_itself


#------------------------------------
# utilities - round up to the nearest multiple of 10
#------------------------------------
def roundup(x):
    """Round x up to the next multiple of 10 and return it as an int."""
    tens = math.ceil(x / 10.0)
    return int(tens) * 10


#------------------------------------
# genres
#------------------------------------
def cleanup_genre(genres):
    """Return the first genre of a game, or 'Unknown' when there is none.

    Args:
        genres: a sequence of genre names, or NaN when the game has no genre
            information.

    Returns:
        genres[0], or the string 'Unknown' when genres is NaN or empty.
    """
    # An empty sequence has no first element; treat it like missing data
    # instead of letting genres[0] raise IndexError.
    if isNaN(genres) or len(genres) == 0:
        return 'Unknown'
    return genres[0]


def generate_genre_mapping(genres):
    """Map each genre to a 1-based integer code in iteration order.

    If a genre appears more than once, the code of its last occurrence wins.
    """
    return {genre: code for code, genre in enumerate(genres, start=1)}


def cleanup_genre_mapping(genre):
    """Look up the integer code of a genre, falling back to 0 when it is unknown.

    Reads the module-level name `game_genre_map`.
    NOTE(review): `game_genre_map` is not defined in the visible part of the
    notebook; presumably it is built with generate_genre_mapping - confirm it
    exists before this function is called.
    """
    code = game_genre_map.get(genre, 0)
    return code


#------------------------------------
# release_date
#------------------------------------
def cleanup_year(dates):
    """Return the number of years between a release date and the current year.

    Args:
        dates: a string whose first '-'-separated field is the year, or NaN.

    Returns:
        0 for NaN; otherwise current year minus release year. When the year
        field is not an integer the current year is used, so the result is 0.
    """
    current_year = datetime.datetime.now().year
    if isNaN(dates) == True:
        return 0

    year_text = dates.split('-')[0]
    try:
        release_year = int(year_text)
    except ValueError:
        # Unparseable year: fall back to "released this year".
        release_year = int(current_year)
    return current_year - release_year


#------------------------------------
# metascore: this data is too sparse
#------------------------------------
def cleanup_metascore(score):
    """Return score unchanged, or 0 when it is the string 'NA' or NaN."""
    missing = score == 'NA' or isNaN(score) == True
    return 0 if missing else score


#------------------------------------
# price
#------------------------------------
def cleanup_price(price):
    """Normalize a raw price value to a whole number.

    Args:
        price: a number, NaN, or a free-text label.

    Returns:
        0 for NaN and for any text containing "Free" or "Demo";
        round(price) for numbers;
        None for any other text (build_games_df fills these with 0 afterwards).
    """
    if isinstance(price, str):
        # Substring test rather than find() > 0, which skipped labels that
        # *start* with the keyword (find() returns 0 there, -1 when absent).
        if "Free" in price or "Demo" in price:
            return 0
        # Text that is neither a free nor a demo label carries no usable price.
        return None
    if isNaN(price):
        return 0
    return round(price)


#------------------------------------
# clean up and build game datasets
#------------------------------------
def build_games_df(df, _id=True, _price=False, _date=False, _metascore=False):
    """Build the per-game feature table from the raw game catalogue.

    The input frame is not modified.

    Args:
        df: game catalogue with 'id' / 'app_name', 'genres', 'price',
            'release_date' and 'metascore' columns (missing ones are created
            as NaN by reindex).
        _id: key games by 'id' (cast to int, then str; missing ids become '0')
            when True, otherwise by 'app_name'. The key column is named 'item'.
        _price: keep 'price', cleaned with cleanup_price and cast to int.
        _date: add 'age' and 'year' columns derived from 'release_date'.
        _metascore: keep 'metascore', cleaned with cleanup_metascore.

    Returns:
        A new DataFrame with 'item', 'genres' and the requested feature
        columns; rows that still contain a missing value are dropped.
    """
    key_col = 'id' if _id else 'app_name'
    game_feat = df.reindex(
        columns=[key_col, 'genres', 'price', 'release_date', 'metascore']
    ).rename(columns={key_col: 'item'})

    if _id:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection: that form does not
        # update the frame under pandas Copy-on-Write, and the int cast
        # would then fail on NaN.
        game_feat['item'] = game_feat['item'].fillna(0).astype(int).astype(str)

    # metascore
    if _metascore:
        game_feat['metascore'] = game_feat['metascore'].apply(
            cleanup_metascore)
    else:
        game_feat = game_feat.drop(columns=['metascore'])

    # price
    if _price:
        # cleanup_price can yield missing values; they count as price 0.
        game_feat['price'] = (
            game_feat['price'].apply(cleanup_price).fillna(0).astype(int))
    else:
        game_feat = game_feat.drop(columns=['price'])

    # release date
    if _date:
        years_since_release = game_feat['release_date'].apply(cleanup_year)
        game_feat['age'] = years_since_release
        # NOTE(review): 'year' holds the same value as 'age' (years since
        # release), not the calendar year - kept as is for compatibility.
        game_feat['year'] = years_since_release
    game_feat = game_feat.drop(columns=['release_date'])

    return game_feat.dropna()


#----------------------------------------------------------
# calculate median playtime of users for each game
#----------------------------------------------------------
def calculate_median_playtime(userdata, itemdata, _drop=True):
    """Attach each game's median playtime (in hours) to the game table.

    Args:
        userdata: per-(user, item) frame with 'item', 'playtime_forever' and
            'playtime_2weeks' columns.
        itemdata: game frame with an 'item' column.
        _drop: when True the intermediate median columns
            'playtime_forever' / 'playtime_2weeks' are removed.

    Returns:
        itemdata inner-joined with the per-item medians, plus
        'playhour_forever' and 'playhour_2weeks' (median / 60, rounded).
    """
    median_forever = userdata[['item', 'playtime_forever']].groupby(['item']).median()
    median_2weeks = userdata[['item', 'playtime_2weeks']].groupby(['item']).median()

    medians = median_forever.merge(median_2weeks, on=['item'])
    merged = itemdata.merge(medians, on=['item'])

    merged['playhour_forever'] = (merged['playtime_forever'] / 60).round()
    merged['playhour_2weeks'] = (merged['playtime_2weeks'] / 60).round()

    if _drop == True:
        merged = merged.drop(columns=['playtime_forever', 'playtime_2weeks'])
    return merged

## data filtering utilities

In [None]:
#----------------------------------------------------------
# Filter by most owned games
#----------------------------------------------------------
def filter_top_n(user_item_df, n=500):
    """Keep the n columns with the largest sums and return them in long form.

    Returns a frame with one row per (index value, column) pair of the kept
    columns; the stacked cell value is exposed as 'rating'.
    """
    column_totals = user_item_df.sum()
    most_owned = column_totals.nlargest(n).index
    long_form = user_item_df[most_owned].stack().reset_index()
    return long_form.rename(columns={0: 'rating'})


#----------------------------------------------------------
# return the list of games of given datasets
#----------------------------------------------------------
def list_games(user_item_df):
    """Return a one-column frame ('item') holding the distinct items.

    The first occurrence of each item is kept, together with its original
    index label.
    """
    distinct_items = user_item_df['item'].drop_duplicates()
    return distinct_items.to_frame(name='item')


#----------------------------------------------------------
# return common game items list between two dataframes
#----------------------------------------------------------
def filter_common_games(df1, df2):
    """Restrict two frames to the items they have in common.

    Args:
        df1: frame whose *columns* are items.
        df2: frame with an 'item' column.

    Returns:
        (df1 limited to the shared columns, in sorted order,
         the rows of df2 whose 'item' is shared).
    Prints the number of shared items.
    """
    left_items = sorted(df1.columns)
    right_items = sorted(df2['item'].values)
    shared = sorted(set(left_items) & set(right_items))
    print('common item size:', len(shared))

    in_both = df2['item'].isin(shared)
    return df1[shared], df2[in_both]


#----------------------------------------------------------
# filter top_list of items from dataframe and return
#----------------------------------------------------------
def filter_top_n_items(df, top_list):
    """Return the rows of df whose 'item' value appears in top_list."""
    keep = df['item'].isin(top_list)
    return df[keep]

## user-item preprocessing

In [None]:
# Flatten the raw user records into (user_id, item_id) pairs, then pivot them
# into a user x item ownership table (1 = owned, 0 = not owned).
user_item = build_list(aussie_items, _id=True)
user_item_df = build_df(user_item)

## user-item preprocessing for user features

In [None]:
# One row per (user, item_id) with the user's item count and playtime columns.
steam_users = build_users(aussie_items, _id=True)
steam_users_df = build_users_df(steam_users)

## game preprocessing for item features

In [None]:
# Game features keyed by id: genres, price and the release-date-derived columns
# (metascore is dropped), then joined with each game's median playtime.
steam_items_df = build_games_df(
    steam_games_df, _id=True, _price=True, _date=True, _metascore=False)
steam_items_all_df = calculate_median_playtime(steam_users_df, steam_items_df)

## filter common top N game items

In [None]:
#----------------------------------------------------------
# Keep only the games present in both the user-item table and the
# game feature table. Both names are rebound to the filtered frames.
#----------------------------------------------------------
user_item_df, steam_items_all_df = filter_common_games(user_item_df,
                                                       steam_items_all_df)

#----------------------------------------------------------
# Keep the top N games from the user-item table.
# For the current modeling we run with the top 1000 games.
#----------------------------------------------------------
n_games = 1000
user_top_games = filter_top_n(user_item_df, n_games)

#----------------------------------------------------------
# Compose the list of top N games
#----------------------------------------------------------
games = list_games(user_top_games)

#----------------------------------------------------------
# Restrict the game features and the user playtime rows to the top N games
#----------------------------------------------------------
top_items = filter_top_n_items(steam_items_all_df, games['item'].values)
top_users = filter_top_n_items(steam_users_df, games['item'].values)

# sanity check: shapes of the filtered frames
print(top_items.shape, top_users.shape)

# Train and Test data preparation

## create train and test interactions

In [None]:
# Random 80/20 split of the long-form (user, item, rating) rows; the fixed
# random_state keeps the split reproducible.
train_val, test = train_test_split(
    user_top_games, test_size=0.2, random_state=1337)

# Train and test interaction tables are built independently from their own rows.
# NOTE(review): threshold is passed as the string '1'; how
# create_interaction_matrix (imported from recsys) interprets it is not visible
# here - confirm whether an int is expected.
interactions_train_all = create_interaction_matrix(
    df=train_val,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1')

interactions_test = create_interaction_matrix(
    df=test,
    user_col='user',
    item_col='item',
    rating_col='rating',
    threshold='1')

## create data dictionaries for future use

In [None]:
# Lookup dictionaries built with the recsys helpers. For games, the 'item'
# column is used both as the id and as the name.
user_dict = create_user_dict(interactions=interactions_train_all)
user_dict_test = create_user_dict(interactions=interactions_test)
games_dict = create_item_dict(df=games, id_col='item', name_col='item')

## create sparse matrices from interactions

In [None]:
# Convert the interaction tables to CSR matrices for LightFM. `csr_matrix` is
# the name imported from scipy.sparse in the first cell; the notebook never
# imports a `sparse` module itself, so it is not referenced here.
sparse_train_all = csr_matrix(interactions_train_all.values)
sparse_test = csr_matrix(interactions_test.values)

In [None]:
# Data shape validation: print the shape of every split and of the matrices
# derived from it so that mismatches are visible before training.
print('user_top_games:', user_top_games.shape)
print('train_val:', train_val.shape)
print('test:', test.shape)
print('interactions_train_all:', interactions_train_all.shape)
print('interactions_test:', interactions_test.shape)
print('sparse_train_all:', sparse_train_all.shape)
print('sparse_test:', sparse_test.shape)

# Feature data preparation

In [None]:
#----------------------------------------------------------
# create item list
#----------------------------------------------------------
def create_item_list(
    dataset_df,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False):
    """Collect the names of the item features that will be used.

    Args:
        dataset_df: frame whose 'genres' column holds an iterable of genre
            names per row (only read when _genres is True).
        _genres: include every distinct genre, in order of first appearance.
        _playhour_forever, _playhour_2weeks, _price, _age: append the feature
            name of the same spelling (without the leading underscore).

    Returns:
        A list of feature names: genres first, then the enabled scalar
        features in the order listed above.
    """
    names = []
    if _genres == True:
        for _, row in dataset_df.iterrows():
            for genre in row.genres:
                if genre not in names:
                    names.append(genre)

    optional_features = (
        ('playhour_forever', _playhour_forever),
        ('playhour_2weeks', _playhour_2weeks),
        ('price', _price),
        ('age', _age),
    )
    names.extend(
        name for name, enabled in optional_features if enabled == True)
    return names

#----------------------------------------------------------
# create item feature
#----------------------------------------------------------
def create_item_feature(
    item_df,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False):
    """Build (item, {feature: weight}) pairs for every row of item_df.

    Each genre of a row gets weight 1; every enabled scalar feature
    ('playhour_forever', 'playhour_2weeks', 'price', 'age') gets the row's
    value of the same-named column.

    Returns:
        (df, diclist) where diclist is the list of (item, feature dict) tuples
        and df is a one-column frame ('features') holding the same tuples.
    """
    diclist = []

    for _, row in item_df.iterrows():
        weights = {}
        if _genres == True:
            for genre in row.genres:
                weights[genre] = 1
        if _playhour_forever == True:
            weights['playhour_forever'] = row.playhour_forever
        if _playhour_2weeks == True:
            weights['playhour_2weeks'] = row.playhour_2weeks
        if _price == True:
            weights['price'] = row.price
        if _age == True:
            weights['age'] = row.age

        diclist.append((row['item'], weights))

    df = pd.DataFrame({'features': diclist})
    return df, diclist

#----------------------------------------------------------
# normalize item feature values
#----------------------------------------------------------
def normalize_item_values(
    dataset_df,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False):
    """L1-normalize the selected scalar feature columns.

    The enabled columns ('playhour_forever', 'playhour_2weeks', 'price',
    'age', in that order) are passed to sklearn's preprocessing.normalize
    with norm='l1' and no explicit axis.
    NOTE(review): sklearn's default axis normalizes each row (sample), not
    each column (feature) - confirm this is the intended direction.

    Returns:
        The normalized values as returned by preprocessing.normalize.
    """
    candidates = (
        ('playhour_forever', _playhour_forever),
        ('playhour_2weeks', _playhour_2weeks),
        ('price', _price),
        ('age', _age),
    )
    selected = [name for name, enabled in candidates if enabled == True]
    return preprocessing.normalize(dataset_df[selected].values, norm='l1')

#----------------------------------------------------------
# convert a value to a binary type
#----------------------------------------------------------
def convert2binary(df):
    """Replace every column of df, in place, with 1 where the value is > 0 and 0 elsewhere.

    The same (mutated) frame is returned.
    """
    for name in df:
        is_positive = df[name] > 0
        df[name] = is_positive.astype(int)
    return df

# Train Models

In [None]:
def run_model(model_feat, train_data, test_data, item_feat=None, user_feat=None):
    """Print and return the mean train and test AUC of a fitted model.

    Args:
        model_feat: fitted model, passed to auc_score.
        train_data: training interactions.
        test_data: test interactions.
        item_feat: item feature matrix used when the model was fitted, or None.
        user_feat: user feature matrix used when the model was fitted, or None.

    Returns:
        (train_auc, test_auc)
    """
    train_auc = auc_score(
        model_feat,
        train_data,
        item_features=item_feat,
        user_features=user_feat).mean()
    print('AUC: train %.3f.' % (train_auc))

    # The training interactions are handed to the test evaluation as
    # train_interactions. User features must be user_feat here as well: the
    # item matrix was previously passed in their place.
    test_auc = auc_score(
        model_feat,
        test_data,
        train_interactions=train_data,
        item_features=item_feat,
        user_features=user_feat).mean()
    print('AUC: test %.3f.' % (test_auc))
    return train_auc, test_auc

## Model: Collaborative Filtering

### collaborative filtering - bpr

In [None]:
# Pure collaborative filtering with the BPR loss (no item or user features).
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_bpr = lfm.LightFM(
    no_components=30,
    learning_schedule='adagrad',
    loss='bpr',
    learning_rate=0.05,
    rho=0.95,
    epsilon=1e-05,
    item_alpha=0.001,
    user_alpha=0.001,
    random_state=1337)
start_time = time.time()
model_bpr.fit(sparse_train_all, epochs=15, num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the BPR collaborative-filtering model (no side features).
run_model(model_bpr, sparse_train_all, sparse_test)

### collaborative filtering - warp

In [None]:
# Pure collaborative filtering with the WARP loss (no item or user features).
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_warp = lfm.LightFM(
    no_components=50,
    learning_schedule='adagrad',
    loss='warp',
    learning_rate=0.05,
    k=5,
    n=10,
    rho=0.82,
    epsilon=1e-06,
    item_alpha=0.0005,
    user_alpha=0.0001,
    max_sampled=10,
    random_state=1337)
start_time = time.time()
model_warp.fit(sparse_train_all, epochs=15, num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the WARP collaborative-filtering model (no side features).
run_model(model_warp, sparse_train_all, sparse_test)

## Model: Hybrid (collaborative filtering + content-based)

### hybrid model 1
  - loss function: bpr
  - item features: genres, playhour_forever, playhour_2weeks, price, release_date
  - user features: none
  - feature normalization: lightfm library

In [None]:
#----------------------------------------------------------
# generate item list: genres plus all scalar item features
#----------------------------------------------------------
top_items = top_items.reset_index(drop=True)
item_feature_list = create_item_list(
    top_items,
    _genres=True,
    _playhour_forever=True,
    _playhour_2weeks=True,
    _price=True,
    _age=True)

#----------------------------------------------------------
# create dataset (no user features are registered)
#----------------------------------------------------------
# NOTE(review): users and items are passed as sets, so the order in which the
# dataset sees them is arbitrary - confirm that the row order of the feature
# matrix built below matches the item columns of sparse_train_all.
dataset = Dataset(user_identity_features=True, item_identity_features=True)
dataset.fit(
    users=set(user_top_games['user'].values),
    items=set(top_items['item'].values),
    user_features=None,
    item_features=item_feature_list)

#----------------------------------------------------------
# item features generation
#----------------------------------------------------------
item_feature, item_diclist = create_item_feature(
    top_items,
    _genres=True,
    _playhour_forever=True,
    _playhour_2weeks=True,
    _price=True,
    _age=True)
lfm_item_features = dataset.build_item_features(
    item_feature['features'].values)

lfm_item_features_df = pd.DataFrame(lfm_item_features.todense())

#----------------------------------------------------------
# scale every row by its own sum
#----------------------------------------------------------
lfm_features_all_m = lfm_item_features_df.values
lfm_features_m = lfm_features_all_m / lfm_features_all_m.sum(
    axis=1, keepdims=True)

#----------------------------------------------------------
# create sparse matrices; `csr_matrix` is the name imported from scipy.sparse
# in the first cell (the notebook never imports a `sparse` module itself)
#----------------------------------------------------------
sparse_features_all = csr_matrix(lfm_item_features_df.values)
sparse_features_all_m = csr_matrix(lfm_features_m)

In [None]:
# Hybrid model: BPR loss with the row-scaled item feature matrix built above.
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_hybrid_bpr = lfm.LightFM(
    no_components=30,
    learning_schedule='adagrad',
    loss='bpr',
    learning_rate=0.05,
    rho=0.95,
    epsilon=1e-05,
    item_alpha=0.001,
    user_alpha=0.001,
    random_state=1337)
start_time = time.time()
model_hybrid_bpr.fit(
    sparse_train_all,
    item_features=sparse_features_all_m,
    user_features=None,
    epochs=15,
    num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the hybrid BPR model, evaluated with the same item
# feature matrix it was fitted with.
run_model(
    model_hybrid_bpr,
    sparse_train_all,
    sparse_test,
    item_feat=sparse_features_all_m)

### hybrid model 2
  - loss function: warp
  - item features: genres, playhour_forever, playhour_2weeks, price, release_date
  - user features: none
  - feature normalization: lightfm library

In [None]:
# Hybrid model: WARP loss with the item feature matrix currently bound to
# sparse_features_all_m.
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_hybrid_warp = lfm.LightFM(
    no_components=50,
    learning_schedule='adagrad',
    loss='warp',
    learning_rate=0.05,
    k=5,
    n=10,
    rho=0.82,
    epsilon=1e-06,
    item_alpha=0.0005,
    user_alpha=0.0001,
    max_sampled=10,
    random_state=1337)
start_time = time.time()
model_hybrid_warp.fit(
    sparse_train_all,
    item_features=sparse_features_all_m,
    user_features=None,
    epochs=15,
    num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the hybrid WARP model, evaluated with the same item
# feature matrix it was fitted with.
run_model(
    model_hybrid_warp,
    sparse_train_all,
    sparse_test,
    item_feat=sparse_features_all_m)

### hybrid model 3
  - loss function: bpr
  - item features: genres 
  - user features: none
  - feature normalization: sklearn library

In [None]:
#----------------------------------------------------------
# generate item list: genres only
#----------------------------------------------------------
top_items = top_items.reset_index(drop=True)
item_feature_list = create_item_list(
    top_items,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False)

#----------------------------------------------------------
# create dataset (no user features are registered)
#----------------------------------------------------------
# NOTE(review): users and items are passed as sets, so the order in which the
# dataset sees them is arbitrary - confirm that the row order of the feature
# matrix built below matches the item columns of sparse_train_all.
dataset = Dataset(user_identity_features=True, item_identity_features=True)
dataset.fit(
    users=set(user_top_games['user'].values),
    items=set(top_items['item'].values),
    user_features=None,
    item_features=item_feature_list)

#----------------------------------------------------------
# item features generation
#----------------------------------------------------------
item_feature, item_diclist = create_item_feature(
    top_items,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False)

lfm_item_features = dataset.build_item_features(
    item_feature['features'].values)

lfm_item_features_df = pd.DataFrame(lfm_item_features.todense())

#----------------------------------------------------------
# L1-normalize every row with sklearn
#----------------------------------------------------------
lfm_features_all_m = lfm_item_features_df.values
lfm_features_m = preprocessing.normalize(lfm_features_all_m, norm='l1', axis=1)

#----------------------------------------------------------
# create sparse matrices; `csr_matrix` is the name imported from scipy.sparse
# in the first cell (the notebook never imports a `sparse` module itself)
#----------------------------------------------------------
sparse_features_all = csr_matrix(lfm_item_features_df.values)
sparse_features_all_m = csr_matrix(lfm_features_m)

In [None]:
# Hybrid model: BPR loss with the genre-only, L1-normalized item features.
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_hybrid_bpr = lfm.LightFM(
    no_components=30,
    learning_schedule='adagrad',
    loss='bpr',
    learning_rate=0.05,
    rho=0.95,
    epsilon=1e-05,
    item_alpha=0.001,
    user_alpha=0.001,
    random_state=1337)
start_time = time.time()
model_hybrid_bpr.fit(
    sparse_train_all,
    item_features=sparse_features_all_m,
    user_features=None,
    epochs=15,
    num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the genre-only hybrid BPR model, evaluated with the same
# item feature matrix it was fitted with.
run_model(
    model_hybrid_bpr,
    sparse_train_all,
    sparse_test,
    item_feat=sparse_features_all_m)

### hybrid model 4
  - loss function: bpr
  - item features: genres, playhour_forever, playhour_2weeks, price, release_date
  - user features: none
  - feature normalization: sklearn library - feature by feature normalization

In [None]:
#----------------------------------------------------------
# generate item lists
#   item_feature_list: genres + scalar features
#   item_list1:        genres only (registered with the dataset below)
#   item_list2:        scalar features only
#----------------------------------------------------------
top_items = top_items.reset_index(drop=True)

item_feature_list = create_item_list(
    top_items,
    _genres=True,
    _playhour_forever=True,
    _playhour_2weeks=True,
    _price=True,
    _age=True)

item_list1 = create_item_list(
    top_items,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False)

item_list2 = create_item_list(
    top_items,
    _genres=False,
    _playhour_forever=True,
    _playhour_2weeks=True,
    _price=True,
    _age=True)

#----------------------------------------------------------
# create dataset: only the genre features are registered
#----------------------------------------------------------
dataset = Dataset(user_identity_features=True, item_identity_features=True)
dataset.fit(
    users=set(user_top_games['user'].values),
    items=set(top_items['item'].values),
    user_features=None,
    item_features=item_list1)

#----------------------------------------------------------
# item features generation
#----------------------------------------------------------
# Part 1: genre (and identity) features from the dataset, reduced to 0/1.
item1_feature, item1_diclist = create_item_feature(
    top_items,
    _genres=True,
    _playhour_forever=False,
    _playhour_2weeks=False,
    _price=False,
    _age=False)
lfm_item1_features = dataset.build_item_features(
    item1_feature['features'].values)

lfm_item1_features_df = pd.DataFrame(lfm_item1_features.todense())
lfm_item1_features_df = convert2binary(lfm_item1_features_df)

# Part 2: scalar features, normalized with sklearn, in top_items row order.
item_norm = normalize_item_values(
    top_items,
    _playhour_forever=True,
    _playhour_2weeks=True,
    _price=True,
    _age=True)
lfm_item2_features_df = pd.DataFrame(item_norm)

# NOTE(review): the two parts are joined side by side on their positional
# index. Part 1 rows follow the dataset's item order (items were passed as a
# set), part 2 rows follow top_items - confirm both describe the same item in
# each row and match the item columns of sparse_train_all.
lfm_features_all = pd.concat([lfm_item1_features_df, lfm_item2_features_df],
                             axis=1,
                             sort=False)

#----------------------------------------------------------
# scale every row by its own sum
#----------------------------------------------------------
lfm_features_all_m = lfm_features_all.values
lfm_features_m = lfm_features_all_m / lfm_features_all_m.sum(
    axis=1, keepdims=True)

#----------------------------------------------------------
# create sparse matrices; `csr_matrix` is the name imported from scipy.sparse
# in the first cell (the notebook never imports a `sparse` module itself)
#----------------------------------------------------------
sparse_features_all = csr_matrix(lfm_features_all.values)
sparse_features_all_m = csr_matrix(lfm_features_m)

In [None]:
# Hybrid model: BPR loss with the combined genre + normalized scalar features.
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_hybrid_bpr = lfm.LightFM(
    no_components=30,
    learning_schedule='adagrad',
    loss='bpr',
    learning_rate=0.05,
    rho=0.95,
    epsilon=1e-05,
    item_alpha=0.001,
    user_alpha=0.001,
    random_state=1337)
start_time = time.time()
model_hybrid_bpr.fit(
    sparse_train_all,
    item_features=sparse_features_all_m,
    user_features=None,
    epochs=15,
    num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the hybrid BPR model with combined features, evaluated
# with the same item feature matrix it was fitted with.
run_model(
    model_hybrid_bpr,
    sparse_train_all,
    sparse_test,
    item_feat=sparse_features_all_m)

### hybrid model 5
  - loss function: warp
  - item features: genres, playhour_forever, playhour_2weeks, price, release_date
  - user features: none
  - feature normalization: sklearn library - feature by feature normalization

In [None]:
# Hybrid model: WARP loss with the item feature matrix currently bound to
# sparse_features_all_m.
# The estimator class is taken from `lfm` (imported via
# `from lightfm import lightfm as lfm`): the first cell binds the bare name
# `LightFM` to the lightfm package, which is not callable.
model_hybrid_warp = lfm.LightFM(
    no_components=50,
    learning_schedule='adagrad',
    loss='warp',
    learning_rate=0.05,
    k=5,
    n=10,
    rho=0.82,
    epsilon=1e-06,
    item_alpha=0.0005,
    user_alpha=0.0001,
    max_sampled=10,
    random_state=1337)
start_time = time.time()
model_hybrid_warp.fit(
    sparse_train_all,
    item_features=sparse_features_all_m,
    user_features=None,
    epochs=15,
    num_threads=8)
end_time = time.time()
print('Time taken for model train: {}'.format(str(end_time - start_time)))

In [None]:
# Train/test AUC of the hybrid WARP model with combined features, evaluated
# with the same item feature matrix it was fitted with.
run_model(
    model_hybrid_warp,
    sparse_train_all,
    sparse_test,
    item_feat=sparse_features_all_m)