In [1]:
import json
import pickle as pkl
import operator
import time
from collections import Counter
from itertools import product
import random


import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import reciprocal_rank

%run '../lib/cookbook/recsys.py'
%run '../lib/cookbook/generic_preprocessing.py'
%run '../lib/utility.py'

from IPython.display import HTML ## Setting display options for Ipython Notebook

# Data Preprocessing

In [2]:
def filter_by_hours_played(path,minutes_played=None,percentile=None):
    """Load a users/items JSON dump and flatten it to one row per owned item.

    Parameters
    ----------
    path : str
        Path to a JSON file containing a list of user records. Each record
        is expected to carry an ``items`` list plus the ``items_count``,
        ``steam_id``, ``user_id`` and ``user_url`` fields.
    minutes_played : number, optional
        When given, keep only rows whose ``playtime_forever`` is strictly
        greater than this value.
    percentile : float, optional
        When given, compute this quantile of ``playtime_forever`` for every
        ``item_name`` and keep only rows strictly above their game's quantile.

    Returns
    -------
    pandas.DataFrame
        The flattened (and optionally filtered) item rows. When
        ``percentile`` is used, the merge renames the playtime column to
        ``playtime_forever_x`` and adds the per-game threshold as
        ``playtime_forever_y``; that schema is kept as-is for existing callers.
    """
    with open(path) as f:
        user_records = json.load(f)

    # json_normalize lives at the top level of pandas since 1.0; importing it
    # from pandas.io.json is deprecated. Fall back only on older releases.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    parsed_items = normalize(data=user_records,
                             record_path='items',
                             meta=['items_count','steam_id','user_id','user_url'])

    if minutes_played is not None:
        parsed_items = parsed_items[parsed_items.playtime_forever > minutes_played]

    if percentile is not None:
        # One threshold per game title, joined back onto every row of that game.
        quantiles = parsed_items.groupby('item_name')['playtime_forever'].quantile(q=percentile)
        parsed_items = pd.merge(parsed_items, pd.DataFrame(quantiles), how='left', left_on='item_name', right_index=True)
        # After the merge: _x is the row's own playtime, _y is the game's quantile.
        parsed_items = parsed_items.loc[parsed_items['playtime_forever_x'] > parsed_items['playtime_forever_y']]

    return parsed_items

In [3]:
def isNaN(num):
    """Return True when ``num`` is NaN (the only value not equal to itself)."""
    return num != num


def cleanup_price(price):
    """Normalise a raw Steam price entry to a whole-number price.

    Parameters
    ----------
    price : str, number or None
        Raw value of a game's ``price`` field.

    Returns
    -------
    int or None
        * ``0`` for strings mentioning "Free" or "Demo" anywhere in the text,
          and for missing prices (NaN or None);
        * ``None`` for any other string (the price cannot be determined);
        * the rounded value for numeric prices.
    """
    if isinstance(price, str):
        # Substring test instead of ``find(...) > 0``: the latter skipped
        # matches at index 0, e.g. "Free to Play" or "Demo".
        if "Free" in price or "Demo" in price:
            return 0
        # Any other text carries no usable price.
        return None
    # Missing prices are treated as free; None would otherwise crash round().
    if price is None or isNaN(price):
        return 0
    return round(price)

In [4]:
def build_list(df, games, free_games=False):
    """Select the (user, game) ownership pairs for the games of interest.

    Parameters
    ----------
    df : pandas.DataFrame
        Item rows with at least ``item_id``, ``user_id`` and ``item_name``.
    games : iterable of dict
        Game metadata records; ``id`` and ``price`` keys are read when present.
    free_games : bool, default False
        When true, every game that has an ``id`` is kept. When false, only
        games that have both ``id`` and ``price`` and whose cleaned-up price
        is greater than zero are kept.

    Returns
    -------
    (list, list)
        ``user_items`` - ``[user_id, item_name]`` pairs from ``df`` whose
        ``item_id`` is among the selected ids - and ``game_ids``, the
        selected ids in metadata order.
    """
    if free_games:
        game_ids = [game['id'] for game in games if 'id' in game]
    else:
        game_ids = []
        for game in games:
            if 'price' in game and 'id' in game:
                price = cleanup_price(game['price'])
                # cleanup_price returns None when the price text is unusable.
                if price is not None and price > 0:
                    game_ids.append(game['id'])

    # Shared tail: both modes filter the ownership rows the same way.
    owned_rows = df.loc[df['item_id'].isin(game_ids)]
    user_items = owned_rows[['user_id','item_name']].values.tolist()
    return user_items, game_ids

In [5]:
# Quick end-to-end check of the helpers on the unfiltered data.
test = filter_by_hours_played('../data/fixed/australian_users_items_fixed.json')
# Use a context manager so the metadata file handle is closed after parsing.
with open('../data/fixed/steam_games_fixed.json','r') as steam_games_file:
    steam_games = json.load(steam_games_file)
# test2 receives the [user_id, item_name] pairs, test3 the selected game ids.
test2, test3 = build_list(test, steam_games, free_games=True)

In [6]:
# Build the user x game ownership matrix
def build_df(user_item):
    """Pivot ``[user, item]`` pairs into a 0/1 ownership matrix.

    Rows are users, columns are items; a cell is 1 when the pair occurs in
    ``user_item`` (duplicates are collapsed first) and 0 otherwise.
    """
    pairs = pd.DataFrame(user_item, columns=['user','item'])
    ownership = (
        pairs
        .drop_duplicates(['user','item'])
        .assign(own=1)
        .pivot(index='user', columns='item', values='own')
    )
    # Pairs that never occurred come out of the pivot as NaN.
    return ownership.fillna(0)

In [7]:
## Keep only the most-owned games

def filter_top_n(user_item_df, n=1000):
    """Return the ``n`` most-owned games in long (user, item, rating) form.

    Column sums of ``user_item_df`` rank the games; the ``n`` largest are
    kept and the matrix is stacked so each row is one user/game cell, with
    the cell value stored under ``rating``.
    """
    ownership_totals = user_item_df.sum()
    most_owned = ownership_totals.nlargest(n).index
    long_form = user_item_df[most_owned].stack().reset_index()
    # stack() leaves the value column labelled 0.
    return long_form.rename(columns={0:'rating'})

In [8]:
def list_games(user_item_df):
    """Return a one-column frame (``item``) of the distinct games.

    The first occurrence of each game is kept, together with its original
    row label from ``user_item_df``.
    """
    distinct_items = user_item_df.item.drop_duplicates()
    return distinct_items.to_frame(name='item')

In [9]:
def filter_top_n_for_filtered(user_item_df, games, n=10000):
    """Align a filtered ownership matrix to a reference game list, then stack it.

    Parameters
    ----------
    user_item_df : pandas.DataFrame
        User x game ownership matrix (possibly missing some reference games).
    games : pandas.DataFrame
        Frame whose ``item`` column lists the reference games.
    n : int, default 10000
        Upper bound handed to ``filter_top_n``.

    Returns
    -------
    pandas.DataFrame
        Long-form (user, item, rating) frame restricted to the reference
        games; games absent from ``user_item_df`` get a 0.0 rating for
        every user.
    """
    wanted_items = games['item'].tolist()
    # reindex builds a new frame: missing games are added as all-0.0 columns
    # in one step and the caller's matrix is left untouched (the previous
    # column-by-column insertion mutated the argument in place).
    aligned = user_item_df.reindex(columns=wanted_items, fill_value=0.0)
    return filter_top_n(aligned, n)

In [10]:
# Three views of the same ownership data: played more than 30 minutes,
# above each game's 10th playtime percentile, and unfiltered.
filtered_hours_items = filter_by_hours_played('../data/fixed/australian_users_items_fixed.json', minutes_played=30)
filtered_percentile_items = filter_by_hours_played('../data/fixed/australian_users_items_fixed.json', percentile=0.10)
full_aussie_items = filter_by_hours_played('../data/fixed/australian_users_items_fixed.json')

# Context managers close the JSON files once they have been parsed.
with open('../data/fixed/australian_user_reviews_fixed.json','r') as reviews_file:
    aussie_reviews = json.load(reviews_file)
with open('../data/fixed/steam_games_fixed.json','r') as steam_games_file:
    steam_games = json.load(steam_games_file)

# Top-20 recommended titles saved earlier to a .csv file
# (the original note said "movies"; presumably game titles - TODO confirm).
top20_lst = pd.read_csv('../data/fixed/top20_user_titleslist.csv',index_col=None )

In [16]:
# Turn the list of Steam game records into a DataFrame (one row per record)
meta_df = pd.DataFrame(data=steam_games)

# Data Extraction - Find the id and title of every game in each selected genre of the Steam metadata

In [52]:

# Bucket the Steam metadata by five popular genres.
# Each bucket collects (id, app_name) tuples for the games whose `genres`
# entry contains the genre label.
genre_buckets = {
    'Action': [],
    'Sports': [],
    'RPG': [],
    'Simulation': [],
    'Casual': [],
}

# Walk the three columns together in a single positional pass instead of
# doing label-based Series lookups for every row and every genre.
for game_id, app_name, genres in zip(meta_df.id, meta_df.app_name, meta_df.genres):
    # A float here is the NaN placeholder for "no genres recorded".
    if type(genres) is float:
        continue
    for genre_label, bucket in genre_buckets.items():
        if genre_label in genres:
            bucket.append((game_id, app_name))

action = genre_buckets['Action']
sport = genre_buckets['Sports']
rpg = genre_buckets['RPG']
simulation = genre_buckets['Simulation']
casual = genre_buckets['Casual']


In [53]:
# Steam ids per genre, with NaN ids dropped (NaN is the only value that is
# not equal to itself, hence the `x == x` test).
tmp1 = [entry[0] for entry in action if entry[0] == entry[0]]          # action
tmp2 = [entry[0] for entry in sport if entry[0] == entry[0]]           # sport
tmp3 = [entry[0] for entry in rpg if entry[0] == entry[0]]             # rpg
tmp4 = [entry[0] for entry in simulation if entry[0] == entry[0]]      # simulation
tmp5 = [entry[0] for entry in casual if entry[0] == entry[0]]          # casual

# Names of the owned items whose item_id belongs to each genre.
action_games = full_aussie_items['item_name'][full_aussie_items.item_id.isin(tmp1)]
sport_games = full_aussie_items['item_name'][full_aussie_items.item_id.isin(tmp2)]
rpg_games = full_aussie_items['item_name'][full_aussie_items.item_id.isin(tmp3)]
simulation_games = full_aussie_items['item_name'][full_aussie_items.item_id.isin(tmp4)]
casual_games = full_aussie_items['item_name'][full_aussie_items.item_id.isin(tmp5)]

# Store game names into pickle files; `with` closes (and flushes) each file.
with open('../data/preprocessed_data/all_games/action_games.pkl', 'wb') as pickle_file:
    pkl.dump(action_games, pickle_file)

with open('../data/preprocessed_data/all_games/sport_games.pkl', 'wb') as pickle_file:
    pkl.dump(sport_games, pickle_file)

with open('../data/preprocessed_data/all_games/rpg_games.pkl', 'wb') as pickle_file:
    pkl.dump(rpg_games, pickle_file)

with open('../data/preprocessed_data/all_games/simulation_games.pkl', 'wb') as pickle_file:
    pkl.dump(simulation_games, pickle_file)

with open('../data/preprocessed_data/all_games/casual_games.pkl', 'wb') as pickle_file:
    pkl.dump(casual_games, pickle_file)

In [61]:
# Additional categories, matched against the `tags` column:
# Strategy, Adventure, Indie, Racing, Singleplayer, Classic, VR, Sci-fi
game_type = ['Strategy', 'Adventure', 'Indie', 'Racing', 'Singleplayer', 'Classic', 'VR', 'Sci-fi']

# Read the metadata columns once, keeping only rows that have tags
# (a float in `tags` is the NaN placeholder for "no tags recorded").
tagged_rows = [
    (game_id, app_name, tags)
    for game_id, app_name, tags in zip(meta_df.id, meta_df.app_name, meta_df.tags)
    if type(tags) is not float
]

# One list of (id, app_name) tuples per entry of game_type, in the same order.
games = [
    [(game_id, app_name) for game_id, app_name, tags in tagged_rows if tag in tags]
    for tag in game_type
]

# Ids only, with NaN ids filtered out (NaN != NaN).
p_games = [
    [game_id for game_id, _app_name in tagged if game_id == game_id]
    for tagged in games
]

# Names of the owned items whose item_id carries each tag.
all_genres = [
    full_aussie_items['item_name'][full_aussie_items.item_id.isin(ids)]
    for ids in p_games
]
 



In [62]:
# Store each tag's game names in its own pickle file.
# game_type and all_genres are parallel lists, so walk them together.
for item, tag_game_names in zip(game_type, all_genres):
    param = '../data/preprocessed_data/all_games/'+ item + '_games.pkl'
    # `with` guarantees the file is flushed and closed after the dump.
    with open(param, 'wb') as pickle_file:
        pkl.dump(tag_game_names, pickle_file)

### All Games

In [30]:
# build_list returns (user_items, game_ids); build_df needs only the
# [user, item] pairs, so take element 0 rather than passing the whole tuple.
filtered_hours_user_item = build_list(filtered_hours_items, steam_games, free_games=True)[0]
filtered_percentile_user_item = build_list(filtered_percentile_items, steam_games, free_games=True)[0]
full_user_item = build_list(full_aussie_items, steam_games, free_games=True)[0]

# User x game ownership matrices for the three playtime filters.
filtered_hours_user_item_df = build_df(filtered_hours_user_item)
filtered_percentile_user_item_df = build_df(filtered_percentile_user_item)
full_user_item_df = build_df(full_user_item)

In [31]:
# Rank games on the unfiltered matrix and keep the 1000 most owned.
user_top_games = filter_top_n(full_user_item_df, n=1000)
# `games` is the reference list of those titles.
games = list_games(user_top_games)
# Restrict both filtered matrices to the same reference titles.
user_top_games_filtered_hours, user_top_games_filtered_percentile = (
    filter_top_n_for_filtered(filtered_matrix, games)
    for filtered_matrix in (filtered_hours_user_item_df, filtered_percentile_user_item_df)
)

In [32]:
# Persist the "all games" results; `with` closes each file after the dump
# instead of leaving the handle open.
for pickle_path, payload in (
    ('../data/preprocessed_data/all_games/user_top_games.pkl', user_top_games),
    ('../data/preprocessed_data/all_games/user_top_games_filtered_hours.pkl', user_top_games_filtered_hours),
    ('../data/preprocessed_data/all_games/user_top_games_filtered_percentile.pkl', user_top_games_filtered_percentile),
    ('../data/preprocessed_data/all_games/games.pkl', games),
):
    with open(pickle_path, 'wb') as pickle_file:
        pkl.dump(payload, pickle_file)

### No Free Games

In [33]:
# build_list returns (user_items, game_ids); build_df needs only the
# [user, item] pairs, so take element 0 rather than passing the whole tuple.
# free_games=False keeps only games with a positive cleaned-up price.
filtered_hours_user_item = build_list(filtered_hours_items, steam_games, free_games=False)[0]
filtered_percentile_user_item = build_list(filtered_percentile_items, steam_games, free_games=False)[0]
full_user_item = build_list(full_aussie_items, steam_games, free_games=False)[0]

# User x game ownership matrices for the three playtime filters.
filtered_hours_user_item_df = build_df(filtered_hours_user_item)
filtered_percentile_user_item_df = build_df(filtered_percentile_user_item)
full_user_item_df = build_df(full_user_item)

In [34]:
# Rank the paid games on the unfiltered matrix and keep the 1000 most owned.
user_top_games = filter_top_n(full_user_item_df, n=1000)
# `games` is the reference list of those titles.
games = list_games(user_top_games)
# Restrict both filtered matrices to the same reference titles.
user_top_games_filtered_hours, user_top_games_filtered_percentile = (
    filter_top_n_for_filtered(filtered_matrix, games)
    for filtered_matrix in (filtered_hours_user_item_df, filtered_percentile_user_item_df)
)

In [35]:
# Preview the first five rows of the long-form (user, item, rating) frame
user_top_games_filtered_hours.head(n=5)

Unnamed: 0,user,item,rating
0,--000--,Counter-Strike: Global Offensive,1.0
1,--000--,Garry's Mod,1.0
2,--000--,Left 4 Dead 2,1.0
3,--000--,Terraria,1.0
4,--000--,Portal 2,0.0


In [44]:
# Persist the "no free games" results; `with` closes each file after the
# dump instead of leaving the handle open.
for pickle_path, payload in (
    ('../data/preprocessed_data/no_free_games/user_top_games.pkl', user_top_games),
    ('../data/preprocessed_data/no_free_games/user_top_games_filtered_hours.pkl', user_top_games_filtered_hours),
    ('../data/preprocessed_data/no_free_games/user_top_games_filtered_percentile.pkl', user_top_games_filtered_percentile),
    ('../data/preprocessed_data/no_free_games/games.pkl', games),
):
    with open(pickle_path, 'wb') as pickle_file:
        pkl.dump(payload, pickle_file)