In [2]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

In [169]:
# Load the ratings data that is later handed to Surprise via Dataset.load_from_df.
ratings_df  = pd.read_csv('../data/ratings.csv')

In [170]:
# Count the distinct items in the data to help decide on a value for k.
# nunique() does the counting inside pandas instead of materialising every
# value into a Python set, and it ignores missing ids rather than counting them.
ratings_df['item_id'].nunique()

10974

In [65]:
# Tell Surprise that ratings are on a 1-5 scale, then wrap the ratings
# DataFrame in a Surprise Dataset.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df=ratings_df, reader=reader)

# Holdout method

In [None]:
# I tried grid search, but because of the size of the dataset it took far too long to run, so instead I tried
# a few different values of k and min_k with the holdout method.

In [6]:
# 80/20 holdout split. The seed is fixed so that runs with different k / min_k
# are evaluated on the same split and their RMSE values are comparable.
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

In [11]:
# Similarity configuration shared by the KNN models below: Pearson correlation
# instead of Surprise's default (MSD), computed between items.
sim_options = dict(
    name='pearson',
    user_based=False,  # False -> item-based CF (similarities between items, not users)
)

In [9]:
# Basic KNN collaborative filtering, configured by sim_options.
knn = KNNBasic(
    k=45,      # maximum number of neighbours used for a prediction
    min_k=15,  # minimum number of neighbours required for a prediction
    sim_options=sim_options,
)

# Fit on the training part of the holdout split ...
knn.fit(trainingSet)

# ... and predict the held-out ratings.
predictions_knn = knn.test(testSet)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [11]:
# RMSE of the holdout predictions; verbose=True prints the value in addition to returning it.
accuracy.rmse(predictions_knn, verbose=True) 

RMSE: 0.9613


0.9612744685383321

In [12]:
# Preview only the first few predictions; displaying the whole list floods the notebook output.
predictions_knn[:10]

[Prediction(uid='76561198084408440', iid=219740, r_ui=4.0, est=2.12871144775864, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='mcconnorhey', iid=263740, r_ui=1.0, est=1.6977014350098436, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197999617946', iid=67000, r_ui=1.0, est=1.4038124062393806, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561198015496188', iid=730, r_ui=3.0, est=3.3803069819728035, details={'actual_k': 19, 'was_impossible': False}),
 Prediction(uid='76561198039237769', iid=321040, r_ui=1.0, est=1.7682856579836368, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='Iskaroth', iid=6000, r_ui=1.0, est=1.0951443834316756, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='itsfilfyd', iid=203290, r_ui=1.0, est=1.849939578126255, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='Rabid_Ghost', iid=238010, r_ui=2.0, est=2.666179694644127, details={'a

# Recommendation

In [6]:
# Load the per-user item data (one row per user, including the list of items they own).
ui_df = pd.read_json("../data/australian_users_items_fixed.json")

In [7]:
# Preview the first rows to check the columns that were loaded.
ui_df.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [145]:
# Lookup from the string form of each user id to the id as stored in ui_df.
# Iterating the column directly avoids building a row Series twice per user,
# which is what the positional .iloc loop did.
user_dict = {str(user_id): user_id for user_id in ui_df['user_id']}

In [8]:
# Load the game metadata (ids, titles, genres, ...), used to turn item ids into titles.
game_df = pd.read_json("../data/steam_games_fixed.json")

In [10]:
# Preview the first rows to check the columns that were loaded.
game_df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140.0,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980.0,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290.0,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400.0,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570.0,,,


In [143]:
# Lookup from game id to title. The id column is shown as floats in the preview
# of game_df, so the keys look like '761140.0'; getGameRecommendations builds
# its ids in that same format.
# Zipping the two columns avoids building a row Series twice per game, which is
# what the positional .iloc loop did.
game_dict = {
    str(game_id): title
    for game_id, title in zip(game_df['id'], game_df['title'])
}

In [12]:
# Use every available rating for the final model (no holdout here).
trainset = data.build_full_trainset()

# Same KNN configuration as in the holdout experiment above.
algo = KNNBasic(
    k=45,
    min_k=15,
    sim_options=sim_options,
)
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [46]:
# Anti-testset: the (user, item) pairs that have no rating in trainset, i.e. the
# candidates to recommend. Per the Surprise docs the "true" rating of these rows is
# a fill value (the global mean by default), not an observed rating.
testset = trainset.build_anti_testset()

In [47]:
# The full anti-testset is too large to score, so only its first 1,097,400 rows
# are kept. That number is 10,974 (unique items in ratings.csv) x 100 (users to test).
# The slice actually covers 103 users, because the number of unrated items per
# user is not exactly 10,974.
N_UNIQUE_ITEMS = 10974
N_TARGET_USERS = 100
testset = testset[:N_UNIQUE_ITEMS * N_TARGET_USERS]

In [48]:
# Estimate a rating for every (user, item) pair kept in the truncated anti-testset.
predictions = algo.test(testset)

In [49]:
# Preview only the first few predictions; the full list has over a million entries.
predictions[:10]

[Prediction(uid='76561197970982479', iid=80, r_ui=1.9174700894758845, est=2.017179030453206, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=100, r_ui=1.9174700894758845, est=1.8513455121478133, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=3910, r_ui=1.9174700894758845, est=2.1425310753014, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=4000, r_ui=1.9174700894758845, est=2.109236173693135, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=6880, r_ui=1.9174700894758845, est=2.095955834897413, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=2200, r_ui=1.9174700894758845, est=1.5653684455253292, details={'actual_k': 45, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=2270, r_ui=1.9174700894758845, est=1.8209608815461253, details={'actual_k': 45, 

In [146]:
from collections import defaultdict

def getGameRecommendations(topN=3, preds=None):
    """Return the topN highest-estimated items for every user.

    Parameters
    ----------
    topN : int
        Number of recommendations to keep per user.
    preds : iterable of 5-tuples, optional
        Predictions as (uid, iid, true_r, est, details). When omitted, the
        notebook-level ``predictions`` variable is used. Pass the predictions
        explicitly to avoid depending on whichever cell last rebound that name.

    Returns
    -------
    defaultdict(list)
        Maps uid -> list of (item_id_string, est), sorted by est in descending
        order and truncated to topN entries.
    """
    if preds is None:
        # Backward-compatible fallback to the notebook-level variable.
        preds = predictions

    top_recs = defaultdict(list)
    for uid, iid, _true_r, est, _details in preds:
        # game_dict keys are built from float ids (e.g. '761140.0'), so the item
        # id is rendered in that same format here.
        top_recs[uid].append((str(iid) + '.0', est))

    for uid, user_ratings in top_recs.items():
        # The sort is stable, so equally-rated items keep their original order.
        user_ratings.sort(key=lambda rec: rec[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs

In [147]:
# Keep the three best-scoring games for each user in the scored sample.
recommendations = getGameRecommendations(topN=3)

In [171]:
def getGameName(game_id):
    """Return the title for game_id, or game_id itself when no title is available.

    game_id is expected in the same string format as the keys of game_dict.
    The id is returned unchanged both when the game is absent from game_dict and
    when it is present with a missing (None/NaN) title, so callers always get
    something displayable.
    """
    title = game_dict.get(game_id)
    if title is None or pd.isna(title):
        return game_id
    return title

In [172]:
def getGameRecommendationsForUser(userId, recommendations):
    """Return the recommended games for one user as (game name, estimated rating) pairs.

    Parameters
    ----------
    userId : str
        User id; must be a key of user_dict.
    recommendations : mapping
        Maps user id -> list of (game_id, est), as returned by
        getGameRecommendations.

    Returns
    -------
    list of (name, est), or None (after printing a message) when userId is not
    in user_dict. A known user without any recommendations gets an empty list.
    """
    if userId not in user_dict:
        print("User id is not present")
        return
    u_id = user_dict[userId]
    # .get instead of indexing: looking up a user must not insert an empty entry
    # into a defaultdict, and a plain dict is accepted as well.
    recommended_games = recommendations.get(u_id, [])
    return [(getGameName(game[0]), game[1]) for game in recommended_games]

In [173]:
# Example: top recommendations for a user identified by a profile name.
getGameRecommendationsForUser('js41637',recommendations)

[('Velocity®Ultra', 2.2278198534650135),
 ('Clergy Splode', 2.2246582730946436),
 ('Spaceport Hope', 2.2235641572979112)]

In [174]:
# In this case the second recommended game is not present in the game dataframe,
# so its id is shown instead of a title.
getGameRecommendationsForUser('76561197970982479',recommendations)

[('The Witcher 2: Assassins of Kings Enhanced Edition', 2.6942477536576575),
 ('49400.0', 2.681631012168471),
 ('Freespace 2', 2.669784283361961)]

# Precision at k

In [164]:
def precision_recall_at_k(predictions, k, threshold):
    """Compute per-user precision@k and recall@k.

    An item counts as relevant when its true rating is >= threshold, and as
    recommended when it is among the user's k highest estimates and its
    estimate is >= threshold.

    Returns a pair of dicts (precisions, recalls), both keyed by user id.
    A metric whose denominator is zero is reported as 0.
    """
    # Group (estimate, true rating) pairs by user.
    per_user = defaultdict(list)
    for pred in predictions:
        uid, _iid, actual, estimated, _details = pred
        per_user[uid].append((estimated, actual))

    precisions, recalls = {}, {}
    for uid, pairs in per_user.items():
        # Highest estimates first; only the first k are candidates for recommendation.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over all of the user's predictions.
        relevant_total = sum(1 for _, actual in ranked if actual >= threshold)

        # Items in the top k that are actually recommended.
        recommended = sum(1 for estimated, _ in top_k if estimated >= threshold)

        # Items in the top k that are both relevant and recommended.
        hits = sum(
            1
            for estimated, actual in top_k
            if actual >= threshold and estimated >= threshold
        )

        # Precision@k: share of recommended items that are relevant.
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that are recommended.
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [167]:
from surprise.model_selection import KFold

# 5-fold evaluation of precision@3 / recall@3 with a relevance threshold of 2.5.
# Fold-local names and a separate model instance are used so that this cell does
# not overwrite the notebook-level trainset / testset / predictions or refit the
# `algo` trained on the full data, which the recommendation cells depend on.
kf = KFold(n_splits=5)
cv_algo = KNNBasic(sim_options=sim_options, k=45, min_k=15)

for fold_trainset, fold_testset in kf.split(data):
    cv_algo.fit(fold_trainset)
    fold_predictions = cv_algo.test(fold_testset)
    precisions, recalls = precision_recall_at_k(fold_predictions, k=3, threshold=2.5)

    # Precision and recall are averaged over all users of the fold.
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

Computing the pearson similarity matrix...
Done computing similarity matrix.
0.2928258967629029
0.13262963709330464
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.2931022408963577
0.13427521319588087
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.29523509655751434
0.13528467397819263
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.2916054307509264
0.13399216444944043
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.2989326334208216
0.1336469577514448
