# Model Output Data Process
  - Process the model output data into a format that can be used directly as the input data for a dashboard
  - Process game title images to add to a dashboard

In [None]:
import json
import csv
import pickle
import operator
import random
import math
import time
import datetime
from collections import Counter
from itertools import product
from PIL import Image
from os import listdir
from os.path import splitext

import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

from IPython.display import display as dp

## dataset preprocess

In [None]:
def build_list(user_item_data, _id=False):
    """Flatten per-user item records into ``(user_id, item)`` pairs.

    Args:
        user_item_data: iterable of dicts, each with a ``'user_id'`` key and an
            ``'items'`` list whose entries carry ``'item_id'`` and
            ``'item_name'``.
        _id: when equal to ``True`` each pair holds the item id, otherwise the
            item name.

    Returns:
        A list of ``(user_id, item_id_or_name)`` tuples in input order.
    """
    # Explicit comparison kept on purpose: only True (or 1) selects the ids.
    field = 'item_id' if _id == True else 'item_name'
    return [(user['user_id'], item[field])
            for user in user_item_data
            for item in user['items']]


def build_df(user_item):
    """Pivot ``(user, item)`` pairs into a user x item ownership matrix.

    Args:
        user_item: sequence of ``(user, item)`` pairs; duplicates are ignored.

    Returns:
        A dataframe indexed by user with one column per item, holding 1 where
        the pair was present and 0 elsewhere.
    """
    pairs = pd.DataFrame(user_item, columns=['user', 'item'])
    owned = pairs.drop_duplicates(['user', 'item']).assign(own=1)
    matrix = owned.pivot(index='user', columns='item', values='own')
    return matrix.fillna(0)


#------------------------------------
# utilities
#------------------------------------
def isNaN(num):
    """Return the result of ``num != num``, which is true only for NaN-like values."""
    differs_from_itself = num != num
    return differs_from_itself


#------------------------------------
# genres
#------------------------------------
def cleanup_genre(genres):
    """Return ``'Unknown'`` for a missing (NaN) genre value, else the value unchanged."""
    return 'Unknown' if isNaN(genres) else genres


def generate_genre_mapping(genres):
    """Map each genre to its 1-based position in ``genres``.

    Args:
        genres: a sized sequence of hashable genre labels.

    Returns:
        A dict ``{genre: position}``; for repeated labels the last position wins.
    """
    count = len(genres)
    return {genre: number
            for genre, number in zip(genres, range(1, count + 1))}


def cleanup_genre_mapping(genre):
    """Look up the numeric code of ``genre``; unknown genres map to 0.

    NOTE(review): reads the module-level ``game_genre_map``, which is not
    defined in the visible part of this file -- presumably the result of
    ``generate_genre_mapping``; confirm before calling.
    """
    unknown_code = 0
    return game_genre_map.get(genre, unknown_code)


#------------------------------------
# release_date
#------------------------------------
def cleanup_year(dates):
    """Extract the leading year field from a ``YYYY-MM-DD`` style release date.

    Args:
        dates: the raw release date, normally a string; ``None`` and NaN mean
            "missing".

    Returns:
        ``0`` for a missing date, otherwise the text before the first ``'-'``
        as a string (a value without ``'-'`` is returned whole).
    """
    # NaN is the only value that is not equal to itself.
    if dates is None or dates != dates:
        return 0
    # str() lets non-string date values be handled instead of crashing on .split.
    return str(dates).split('-', 1)[0]


#------------------------------------
# metascore
#------------------------------------
def cleanup_metascore(score):
    """Return 0 for a missing metascore (``'NA'`` or NaN), else the score unchanged."""
    missing = score == 'NA' or isNaN(score)
    return 0 if missing else score


#------------------------------------
# price
#------------------------------------
def cleanup_price(price):
    """Normalise a raw price value to a whole number.

    Args:
        price: a number, a string, ``None`` or NaN.

    Returns:
        The price rounded with ``round``. ``0`` is returned for missing values
        (``None``/NaN), infinities, and strings that are not numbers (labels
        such as ``"Free to Play"`` or ``"Third-party"``), so every input yields
        an explicit number rather than an implicit ``None``.
    """
    if isinstance(price, str):
        try:
            price = float(price)
        except ValueError:
            # Any non-numeric label ("Free", "Demo", ...) carries no price.
            return 0
    # NaN is the only value not equal to itself; round() cannot handle
    # NaN or infinity.
    if price is None or price != price or math.isinf(price):
        return 0
    return round(price)


#------------------------------------
# round up a value to the next multiple of 10
#------------------------------------
def roundup(x):
    """Round ``x`` up to a multiple of 10 (exact multiples are returned unchanged)."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)


#------------------------------------
# generate games data from a raw game datasets
#------------------------------------
def build_games_df(df, _price=True, _metascore=True, _date=True):
    """Build a cleaned game-feature table from the raw games dataframe.

    Keeps the ``id``, ``title``, ``url``, ``genres``, ``price``,
    ``release_date`` and ``metascore`` columns (``reindex`` creates any that
    are absent as NaN), renames ``id`` to ``item`` and stores it as the string
    form of the integer id.

    Args:
        df: raw games dataframe.
        _price: when true, clean ``price`` with ``cleanup_price`` and cast it
            to int; otherwise drop the column.
        _metascore: when true, clean ``metascore`` with ``cleanup_metascore``;
            otherwise drop the column.
        _date: when true, add a ``year`` column derived from ``release_date``
            with ``cleanup_year``. ``release_date`` itself is always dropped.

    Returns:
        A new dataframe; rows that still contain NaN in any remaining column
        are removed.
    """
    game_feat = df.reindex(columns=[
        'id', 'title', 'url', 'genres', 'price', 'release_date', 'metascore'
    ]).rename(columns={'id': 'item'})
    # Missing ids become 0 first so the int cast cannot fail on NaN.
    game_feat['item'] = game_feat['item'].fillna(0).astype(int).astype(str)

    # metascore
    if _metascore:
        game_feat['metascore'] = game_feat['metascore'].apply(
            cleanup_metascore)
    else:
        game_feat = game_feat.drop(['metascore'], axis=1)

    # price
    if _price:
        price = game_feat['price'].apply(cleanup_price).fillna(0)
        # Assign the result back instead of calling replace(..., inplace=True)
        # on the selected column: that chained in-place call does not update
        # the parent frame when pandas Copy-on-Write is active.
        game_feat['price'] = price.replace(np.inf, 0).astype(int)
    else:
        game_feat = game_feat.drop(['price'], axis=1)

    # release date
    if _date:
        game_feat['year'] = game_feat['release_date'].apply(cleanup_year)
    game_feat = game_feat.drop(['release_date'], axis=1)

    return game_feat.dropna()


def filter_common_games(df1, df2):
    """Restrict two tables to the games present in both.

    Args:
        df1: dataframe whose *columns* are item ids.
        df2: dataframe with an ``item`` column of item ids.

    Returns:
        ``(df1_f, df2_f)``: ``df1`` limited to the common item columns (in
        sorted order) and the rows of ``df2`` whose ``item`` is common.
        The number of common items is printed.
    """
    # Only the intersection needs sorting; pre-sorting both inputs is not
    # required for a set intersection.
    common_item = sorted(set(df1.columns).intersection(df2['item'].values))
    print('common item size:', len(common_item))
    df1_f = df1[common_item]
    df2_f = df2[df2['item'].isin(common_item)]
    return df1_f, df2_f


def filter_top_n(user_item_df, n=500):
    """Keep the ``n`` columns with the largest column sums, in long form.

    Args:
        user_item_df: user x item matrix.
        n: number of columns to keep.

    Returns:
        The selected columns stacked into one row per (row label, column
        label) pair, with the stacked values in a ``rating`` column.
    """
    column_totals = user_item_df.sum()
    top_columns = column_totals.nlargest(n).index
    long_form = user_item_df[top_columns].stack().reset_index()
    return long_form.rename(columns={0: 'rating'})


def filter_top_n_items(df, top_list):
    """Return the rows of ``df`` whose ``item`` value appears in ``top_list``."""
    is_top = df['item'].isin(top_list)
    return df[is_top]


def list_games(user_item_df):
    """Return a one-column dataframe (``item``) of the distinct items.

    The index of the first occurrence of each item is kept.
    """
    distinct_items = user_item_df.item.drop_duplicates()
    return pd.DataFrame({'item': distinct_items})


def create_headerimg_url(item_id):
    """Build the header-image URL for ``item_id`` on the Steam CDN host."""
    prefix = 'https://steamcdn-a.akamaihd.net/steam/apps/'
    suffix = '/header.jpg'
    return ''.join((prefix, str(item_id), suffix))

In [None]:
# Load the raw user/item records and the games table.
# The JSON file is opened in a context manager so its handle is closed as soon
# as parsing finishes instead of being left to the garbage collector.
with open('../data/fixed/australian_users_items_fixed.json', 'r') as users_items_file:
    aussie_items = json.load(users_items_file)
steam_games_df = pd.read_json('../data/fixed/steam_games_fixed.json', orient='columns')

In [None]:
# Flatten the raw records into (user_id, item_id) pairs, pivot them into a
# user x item ownership matrix, and build the cleaned game-feature table.
user_item = build_list(aussie_items, _id=True)
user_item_df = build_df(user_item)
steam_items_df = build_games_df(steam_games_df)

In [None]:
# Restrict the ownership matrix and the game-feature table to shared games.
user_item_df, steam_items_df = filter_common_games(user_item_df, steam_items_df)

# Long-form table of the n_games columns with the largest ownership sums.
n_games = 1000
user_top_games = filter_top_n(user_item_df, n_games)

# Feature rows of those games; the filter only needs to run once.
games = list_games(user_top_games)
top_items = filter_top_n_items(steam_items_df, games['item'].values)
top_items = top_items.reset_index(drop=True)
top_items['hdr_img'] = top_items['item'].apply(create_headerimg_url)

In [None]:
# Renumber the rows, drop the store URL column and attach the header-image URL.
steam_items_df = steam_items_df.reset_index(drop=True).drop(['url'], axis=1)
steam_items_df['hdr_img'] = steam_items_df['item'].apply(create_headerimg_url)

In [None]:
# Save the feature rows of the top games. to_csv is called without
# index=False, so the dataframe index is written as the first column.
foutput = '../data/top_1000_games.csv'
top_items.to_csv(foutput)

# Save the feature rows of all common games (index column included as well).
foutput = '../data/all_common_games.csv'
steam_items_df.to_csv(foutput)

## model output preprocess

In [None]:
def build_game_id_name(user_item_data):
    """Collect the distinct ``(item_id, item_name)`` pairs owned by any user.

    Args:
        user_item_data: iterable of dicts with an ``'items'`` list whose
            entries carry ``'item_id'`` and ``'item_name'``.

    Returns:
        A dataframe with columns ``item`` and ``title``, one row per distinct
        pair; the index is not renumbered after de-duplication.
    """
    col = ['item', 'title']
    pairs = [(item['item_id'], item['item_name'])
             for user in user_item_data
             for item in user['items']]
    return pd.DataFrame(pairs, columns=col).drop_duplicates(col)

def replace_title2id(title):
    """Translate a game title to its item id via ``game_title2id_dic``.

    A title that is not in the dictionary is returned unchanged.
    """
    fallback = title
    return game_title2id_dic.get(title, fallback)

def replace_id2title(title):
    """Translate an item id to its game title via ``game_id2title_dic``.

    Despite its name, the ``title`` argument is the key looked up in the
    id-to-title dictionary; a key that is not found is returned unchanged.
    """
    fallback = title
    return game_id2title_dic.get(title, fallback)

def replace_title2url(title):
    """Translate a game title to its header-image URL via ``game_title2url_dic``.

    A title that is not in the dictionary is returned unchanged.
    """
    fallback = title
    return game_title2url_dic.get(title, fallback)

def replace_id2url(title):
    """Translate an item id to its header-image URL via ``game_id2url_dic``.

    Despite its name, the ``title`` argument is the key looked up in the
    id-to-URL dictionary; a key that is not found is returned unchanged.
    """
    fallback = title
    return game_id2url_dic.get(title, fallback)

def game_name2id(df):
    """Replace titles with item ids in every column whose name starts with ``Rank``.

    The dataframe is modified in place and also returned.
    """
    for column in df:
        if not column.startswith("Rank"):
            continue
        df[column] = df[column].apply(replace_title2id)
    return df

def game_name2url(df):
    """Replace titles with image URLs in every column whose name starts with ``Rank``.

    The dataframe is modified in place and also returned.
    """
    for column in df:
        if not column.startswith("Rank"):
            continue
        df[column] = df[column].apply(replace_title2url)
    return df

def game_id2url(df):
    """Replace item ids with image URLs in every column whose name starts with ``Rank``.

    The dataframe is modified in place and also returned.
    """
    for column in df:
        if not column.startswith("Rank"):
            continue
        df[column] = df[column].apply(replace_id2url)
    return df

def create_game_id_dic(df):
    """Return a ``{title: item}`` dict built from the ``title`` and ``item`` columns."""
    ids = df.item.values
    titles = df.title.values
    return pd.Series(ids, index=titles).to_dict()

def create_game_title_dic(df):
    """Return an ``{item: title}`` dict built from the ``item`` and ``title`` columns."""
    titles = df.title.values
    ids = df.item.values
    return pd.Series(titles, index=ids).to_dict()
    
def create_game_url_dic(df):
    """Return a ``{title: hdr_img}`` dict built from the ``title`` and ``hdr_img`` columns."""
    urls = df.hdr_img.values
    titles = df.title.values
    return pd.Series(urls, index=titles).to_dict()

def create_game_id2url_dic(df):
    """Return an ``{item: hdr_img}`` dict built from the ``item`` and ``hdr_img`` columns."""
    urls = df.hdr_img.values
    ids = df.item.values
    return pd.Series(urls, index=ids).to_dict()

def convert_result(game_dic, fresult, foutput, _csv=True, _title2url=False, _id2url=False):
    """Translate the ``Rank*`` columns of a result file and save it as CSV.

    NOTE(review): a second ``convert_result`` with a different signature is
    defined later in this file and replaces this one once that cell has run.

    Args:
        game_dic: not used by the body; kept so existing calls keep working.
        fresult: path of the result file to read.
        foutput: path of the CSV file to write (without the index).
        _csv: read ``fresult`` as CSV when true, otherwise as JSON that is
            flattened with ``json_normalize`` and has NaN replaced by 0.
        _title2url: translate titles to image URLs.
        _id2url: translate item ids to image URLs (only checked when
            ``_title2url`` is false). With both flags false, titles are
            translated to item ids.

    Returns:
        The translated dataframe.
    """
    # read result
    if _csv:
        result_df = pd.read_csv(fresult, index_col=None)
    else:
        # Context manager so the file handle is closed right after parsing.
        with open(fresult, 'r') as result_file:
            data = json.load(result_file)
        result_df = json_normalize(data)
        result_df.fillna(0, inplace=True)

    if _title2url:
        result = game_name2url(result_df)
    elif _id2url:
        result = game_id2url(result_df)
    else:
        result = game_name2id(result_df)

    # save result
    result.to_csv(foutput, index=False)
    return result

In [None]:
#----------------------------------------------------------
# build and save the item id / title / header-image table
#----------------------------------------------------------
f_common_games = '../data/all_common_games.csv'
f_all_games = '../data/item_title_url.csv'

id_name_df = build_game_id_name(aussie_items)
id_name_df = id_name_df.reset_index(drop=True)
id_name_df['hdr_img'] = id_name_df['item'].apply(create_headerimg_url)

# The table is written before it is read back below, so the cell also works
# on a first run when the CSV does not exist yet and never reads a stale copy.
foutput = f_all_games
id_name_df.to_csv(foutput, index=False)

#----------------------------------------------------------
# read the saved table and create dictionaries for future use
#----------------------------------------------------------
games_df = pd.read_csv(f_all_games, index_col=None)

game_id2title_dic = create_game_title_dic(games_df)
game_title2id_dic = create_game_id_dic(games_df)
game_title2url_dic = create_game_url_dic(games_df)
game_id2url_dic = create_game_id2url_dic(games_df)

## Generate input data files for the dashboard

In [None]:
def convert_result(fresult, foutput, _csv=False, _user_item=True, _debug=False):
    """Reshape a wide recommendation table into one row per recommendation.

    Each input row holds one subject (a user or a main item) and its ranked
    recommendations in the remaining columns. The output has one row per
    (subject, rank) pair and is written to ``foutput`` as CSV without index.

    Args:
        fresult: path of the model output. JSON input is read with
            ``orient='columns'`` and transposed so every key becomes a row.
            NOTE(review): for CSV input the subject column is the row number
            produced by ``reset_index`` -- confirm that is intended.
        foutput: path of the CSV file to write.
        _csv: read ``fresult`` as CSV when true, as JSON otherwise.
        _user_item: when true the output columns are ``user``, ``rank``,
            ``title``, ``item``, ``imgurl``; otherwise ``main_item``,
            ``main_title``, ``main_imgurl``, ``rank``, ``rec_item``,
            ``rec_title``, ``rec_imgurl``.
        _debug: print the output shape and its first two rows.

    The number of ranks is taken from the number of ranking columns rather
    than assumed to be 20, and empty (NaN) recommendations are dropped only
    after each title has been paired with its subject, so rows stay aligned.
    """
    if _csv:
        result_df = pd.read_csv(fresult, index_col=None)
    else:  # json format
        result_df = pd.read_json(fresult, orient='columns').T

    col = 'Users' if _user_item else 'Items'
    result_df = result_df.reset_index().rename(columns={'index': col})

    subjects = result_df[col].values
    rankings = result_df.drop([col], axis=1)
    n_rows, n_ranks = rankings.shape

    if _user_item:
        subject_col, title_col = 'user', 'title'
    else:
        subject_col, title_col = 'main_title', 'rec_title'

    # Row-major flattening: subject i is repeated n_ranks times next to its
    # ranks 1..n_ranks and the titles taken from row i.
    output = pd.DataFrame(
        {
            subject_col: np.repeat(subjects, n_ranks),
            'rank': np.tile(np.arange(1, n_ranks + 1), n_rows),
            title_col: rankings.values.ravel(),
        },
        columns=[subject_col, 'rank', title_col])
    output = output.dropna(subset=[title_col]).reset_index(drop=True)

    if _user_item:
        # item id
        output['item'] = output['title'].apply(replace_title2id)
        # img url
        output['imgurl'] = output['item'].apply(replace_id2url)
    else:
        # main item id
        output['main_item'] = output['main_title'].apply(replace_title2id)
        # main imgurl
        output['main_imgurl'] = output['main_item'].apply(replace_id2url)
        # recommended item id
        output['rec_item'] = output['rec_title'].apply(replace_title2id)
        # recommended imgurl
        output['rec_imgurl'] = output['rec_item'].apply(replace_id2url)
        # reindex columns
        output = output.reindex([
            'main_item', 'main_title', 'main_imgurl', 'rank', 'rec_item',
            'rec_title', 'rec_imgurl'], axis=1)

    if _debug:
        print(output.shape)
        # A bare output.head(2) inside a function shows nothing; print it.
        print(output.head(2))

    # save the output
    output.to_csv(foutput, index=False)

### generate user to item recommendations result

In [None]:
# Convert the user-to-item model output (JSON) into the long-form CSV.
fresult = '../outputs/user_item_dict.json'
foutput = '../outputs/top20_user_items_result.csv'

convert_result(fresult, foutput, _user_item=True)

### generate item to item recommendations result

In [None]:
# Convert the item-to-item model output (JSON) into the long-form CSV.
fresult = '../outputs/item_item_dict.json'
foutput = '../outputs/top20_item_items_result.csv'

convert_result(fresult, foutput, _user_item=False)

## Filter users and items - optional

### filter the first 50 users
 - For the recommender model demo, we picked the first 50 users to build the dashboard
 - This keeps the dashboard lightweight: it reduces its size and lets it run faster on local machines

In [None]:
def slice_n_users(fresult, foutput, num_users, _debug=False, n_per_user=20):
    """Copy the rows of the first ``num_users`` users from one CSV to another.

    The input is assumed to hold a fixed number of consecutive rows per user,
    so the first ``num_users * n_per_user`` rows are kept.

    Args:
        fresult: path of the CSV file to read.
        foutput: path of the CSV file to write (without the index).
        num_users: number of leading users to keep; values below 0 keep none.
        _debug: print the shape and display the last three rows of the slice.
        n_per_user: rows per user in the input (20 by default).
    """
    output_df = pd.read_csv(fresult, index_col=None, low_memory=False)
    n_rows = max(num_users, 0) * n_per_user
    subset_df = output_df.iloc[:n_rows]
    subset_df.to_csv(foutput, index=False)
    if _debug:
        print(subset_df.shape)
        # display() renders the frame itself and returns None, so wrapping it
        # in print() would only add a stray "None" line.
        dp(subset_df.tail(3))
    
# Keep only the rows of the first num_users users of the user-item result.
fresult = '../outputs/top20_user_items_result.csv'
foutput = '../outputs/top20_50user_items_result.csv'
num_users = 50

slice_n_users(fresult, foutput, num_users)

### filter user-item recommendations list for the first 50 users

In [None]:
# Read the sliced user-item result and collect its distinct item values, sorted.
fresult = '../outputs/top20_50user_items_result.csv'

output_50_df = pd.read_csv(fresult, index_col=None, low_memory=False)
itemlist = sorted(output_50_df['item'].unique())

# fresult is reused here as the *output* path: the sorted item list is written
# as a single CSV row with every field quoted.
fresult = '../outputs/top20_user_item_50users_result.csv'
with open(fresult, 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(itemlist)

### filter item-item recommendations list for the first 50 users

In [None]:
fresult = '../outputs/top20_item_items_result.csv'
item_df = pd.read_csv(fresult, index_col=None, low_memory=False)

# Recommended items of every row whose main item is in itemlist. A vectorised
# isin() replaces the per-row iterrows() loop with its linear list lookup.
reclist = item_df.loc[item_df['main_item'].isin(itemlist), 'rec_item'].tolist()

reclist_unique = sorted(set(reclist))

# fresult is reused as the output path: one CSV row, every field quoted.
fresult = '../outputs/top20_item_item_50users_result.csv'
with open(fresult, 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(reclist_unique)

## Image size reduction for a dashboard
 - Some game title images are too big, which increases the dashboard size and slows down performance
 - Big images are reduced so that the dashboard can run on a local machine

In [None]:
def image_resize(source_path, target_path, size=(500, 500)):
    """Resize every image in ``source_path`` and save it under ``target_path``.

    Args:
        source_path: directory whose entries are opened as images.
        target_path: directory that receives the resized files; each file
            keeps its original name.
        size: ``(width, height)`` passed to ``Image.resize``; the image is
            resized to exactly this size.

    Entries that raise ``OSError`` while being opened, resized or saved are
    reported with a printed message and skipped.
    """
    # Local import keeps this block self-contained.
    from os.path import join

    for name in listdir(source_path):
        # join() works whether or not the directory paths end with a separator.
        source_file = join(source_path, name)
        target_file = join(target_path, name)
        try:
            # The context manager closes the source image file after saving.
            with Image.open(source_file) as im:
                im.resize(size).save(target_file)
        except OSError:
            print('Cannot convert %s' % name)