In [3]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

In [4]:
ratings_df  = pd.read_csv('../data/ratings.csv')

In [5]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(ratings_df,reader) #load dataset into Surprise datastructure Dataset

In [6]:
# See how many unique items are there in the data to decide what should be the biggest candidate k 

len(set(ratings_df['item_id']))

10974

# Grid Search 

In [7]:
param_grid = {'k': [5, 10, 25, 50, 100, 150],
              'sim_options': {'name': ['pearson', 'cosine','pearson_baseline'],
                              'min_support': [1, 5, 10, 25, 50, 75],   #the minimum number of common items needed between users to consider them for similarity. For the item-based approach, this corresponds to the minimum number of common users for two items.
                              'user_based': [False]}
              }

In [8]:
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5) 

In [16]:
gs.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

In [None]:
# best RMSE score
print(gs.best_score['rmse'])

In [None]:
print(gs.best_params['rmse'])

In [None]:
knn = gs.best_estimator['rmse']
knn.fit(data.build_full_trainset())

# Holdout method

In [10]:
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

In [21]:
sim_options = {
    'name': 'pearson', #similarity measure default is MSD
    'user_based': False #user-based CF
}

In [13]:
#KNN
knn = KNNBasic(sim_options=sim_options,k=25,min_k=15) #neighbours=3, other parameters set as above
knn.fit(trainingSet) #fit model to the training set
predictions_knn = knn.test(testSet) #predict for test set values

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [14]:
accuracy.rmse(predictions_knn, verbose=True) 

RMSE: 1.3555


1.3554619872924982

In [15]:
predictions_knn

[Prediction(uid='elgerjp', iid=96100, r_ui=2.0, est=2.040556944631146, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='76561197970147684', iid=252950, r_ui=4.0, est=1.6234746341670279, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='LongLiveAlex', iid=205790, r_ui=0.0, est=1.429257304975274, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid='76561198082801138', iid=201230, r_ui=0.0, est=1, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='InTranze', iid=399790, r_ui=2.0, est=2.081530008756818, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='76561198048334813', iid=113020, r_ui=0.0, est=1.1522665086942405, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='Sickosno', iid=263280, r_ui=4.0, est=1.2060579760266028, details={'actual_k': 25, 'was_impossible': False}),
 Prediction(uid='76561198062315054', iid=205790, r_ui=0.0, est=1, details={'actual_k': 25, 

# Recommendation

In [7]:
ui_df = pd.read_json("../data/australian_users_items_fixed.json")

In [8]:
ui_df.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [9]:
user_dict = {}
for i in range(len(ui_df)):
    user_dict[ui_df.iloc[i].user_id] = ui_df.iloc[i].steam_id

In [10]:
game_df = pd.read_json("../data/steam_games_fixed.json")

In [11]:
game_df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140.0,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980.0,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290.0,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400.0,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570.0,,,


In [12]:
game_dict = {}
for i in range(len(game_df)):
    game_dict[game_df.iloc[i].id] = game_df.iloc[i].app_name

In [22]:
# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it. Follow methodology provided previously
algo = KNNBasic(sim_options=sim_options,k=25,min_k=15)
algo.fit(trainset)

# Find missing values and predict
anti_test_set = trainset.build_anti_testset() 

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [None]:
#predictions = []
#for i in range(len(anti_test_set)):
#    predict = algo.predict(anti_test_set[i][0], anti_test_set[i][1])
#    predictions.append(predict) 



In [None]:
predictions = algo.test(anti_test_set)

In [None]:
from collections import defaultdict

def getGameRecommendations(topN=3):
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions: 
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs 

In [None]:
recommendations = getGameRecommendations(3)

In [None]:
def getGameName(game_id):
    if game_id not in game_dict:
        return ""
    m = game_dict[game_id].split('[')
    temp = m[1].split(']')
    return temp[0]

In [None]:
def getGameRecommendationsForUser(userId, recommendations):
    if userId not in user_dict:
        print("User id is not present")
        return
    u_id = user_dict[userId]
    recommended_games = recommendations[u_id]
    game_list = []
    for game in recommended_games:
        game_list.append((getGameName(game[0]),game[1]))
    return game_list 

In [None]:
getGameRecommendationsForUser('76561197970982479',recommendations)

# Precision at k

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:
kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=2.5)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))