In [1]:
import warnings
import re
import math
import json
import glob
import random
from collections import Counter, defaultdict
import statistics

import pathlib
import tqdm
from pathlib import Path
import numpy as np
from joblib import Parallel, delayed
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, csr_array, lil_array, lil_matrix, save_npz, load_npz

from playtime_functions import *

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [2]:
TEST_SIZE = 0.3

# Prepare data

In [3]:
games_path = 'games_jan2024.csv' #'games_clean.csv'
interactions_path = 'steam_game_libraries_v1.npz'

In [4]:
# load dataset with interactions
steamid_appid_playtime_arr = np.load(interactions_path)["arr_0"]

steamids = steamid_appid_playtime_arr[:,0]
appids = steamid_appid_playtime_arr[:,1]
hours = steamid_appid_playtime_arr[:,2] / 60

In [5]:
# games information
game_info_df = pd.read_csv(games_path)
game_info_df["TotalReviews"] = game_info_df["Positive"] + game_info_df["Negative"]

# remove trash games
game_info_df = game_info_df.query("TotalReviews >= 50")
popular_games = set(game_info_df['AppID'])
len(popular_games)

23297

In [6]:
similar_games_df = pd.read_csv("steam_game_similarities.csv")

# Create sparse interaction matrix

In [7]:
MIN_REVIEWS = 5
MAX_REVIEWS = 250
SUBSAMPLE = 1000
GAMES_FILTER = 250

steamids_num_reviews = sorted(Counter(steamids).items(), key=lambda item: item[1], reverse=True)

valid_steamids_list = [steamid for steamid, num_reviews in steamids_num_reviews \
                      if MIN_REVIEWS <= num_reviews <= MAX_REVIEWS]

valid_steamids_list = valid_steamids_list[:SUBSAMPLE]
valid_steamids = set(valid_steamids_list)

len(valid_steamids)

1000

In [8]:
# remove all users who have too little (< 5) or too many (> 250) reviews
# and all unpopular games
appids_filtered, steamids_filtered, hours_filtered = [], [], []
for i in tqdm.tqdm(range(len(steamids))):
    if steamids[i] in valid_steamids and appids[i] in popular_games:
        appids_filtered.append(appids[i])
        steamids_filtered.append(steamids[i])
        hours_filtered.append(hours[i])
        
appids, steamids, hours = appids_filtered, steamids_filtered, hours_filtered

100%|████████████████████████| 492846002/492846002 [01:01<00:00, 8022832.63it/s]


In [9]:
STEAMID_TO_INDEX = {steamid: i for i, steamid in enumerate(set(steamids))}
APPID_TO_INDEX = {appid: i for i, appid in enumerate(set(appids))}

INDEX_TO_STEAMID = {v: k for k, v in STEAMID_TO_INDEX.items()}
INDEX_TO_APPID = {v: k for k, v in APPID_TO_INDEX.items()}

remapped_steamids = [STEAMID_TO_INDEX[steamid] for steamid in steamids]
remapped_appids = [APPID_TO_INDEX[appid] for appid in appids]

In [10]:
steamid_appid_playtime_df = pd.DataFrame({'steamid': remapped_steamids,
                                          'appid': remapped_appids,
                                          'playtime_hours': hours})

# delete rows with zero playtime
steamid_appid_playtime_df = steamid_appid_playtime_df[steamid_appid_playtime_df['playtime_hours'] > 0]

In [11]:
popular_appid = steamid_appid_playtime_df.value_counts('appid').index[:GAMES_FILTER]
steamid_appid_playtime_df = steamid_appid_playtime_df[steamid_appid_playtime_df['appid'].isin(popular_appid)]
steamid_appid_playtime_df

Unnamed: 0,steamid,appid,playtime_hours
0,508,6,8.550000
6,508,32,8.650000
8,508,34,1.483333
10,508,95,27.616667
11,508,98,9.333333
...,...,...,...
223426,919,9405,67.733333
223431,919,1664,87.400000
223458,919,1496,16.383333
223468,919,9716,2.566667


In [12]:
# remove outliers by playtime for each game
# center the value on the average for the game

df_without_outliers = pd.DataFrame(columns=steamid_appid_playtime_df.columns)
for appid in tqdm.tqdm(remapped_appids):
    group_data = steamid_appid_playtime_df[steamid_appid_playtime_df['appid'] == appid]

    q99 = group_data['playtime_hours'].quantile(0.99)
    mean = group_data['playtime_hours'].mean()

    non_outliers = group_data[group_data['playtime_hours'] < q99]
    non_outliers['playtime_hours'] -= mean
    df_without_outliers = pd.concat([df_without_outliers, non_outliers])

# center the value on the average for the user
df_centered = pd.DataFrame(columns=steamid_appid_playtime_df.columns)
for steamid in tqdm.tqdm(remapped_steamids):
    group_data = steamid_appid_playtime_df[steamid_appid_playtime_df['steamid'] == steamid]

    mean = group_data['playtime_hours'].mean()

    group_data['playtime_hours'] -= mean
    df_centered = pd.concat([df_centered, group_data])

# center the value on the average in general
df_centered['playtime_hours'] -= steamid_appid_playtime_df['playtime_hours'].mean()

100%|█████████████████████████████████| 223473/223473 [5:40:50<00:00, 10.93it/s]
100%|█████████████████████████████████| 223473/223473 [4:22:25<00:00, 14.19it/s]


In [13]:
# determine the minimum number of games a user needs 
# to play in order to be included in the sample
for i in range(1,10):
    if int(i * TEST_SIZE) >= 1:
        CRITERION = i
        break

In [14]:
# create sparse interaction matrix
hours_final = df_centered['playtime_hours']
steamids_final = df_centered['steamid']
appids_final = df_centered['appid']

sparse_interaction_matrix = csr_matrix((hours_final, (steamids_final, appids_final)),
                                       shape=(max(steamids_final)+1, max(appids_final)+1))

nonzero_rows = sparse_interaction_matrix.getnnz(axis=1) >= CRITERION
sparse_interaction_matrix = sparse_interaction_matrix[nonzero_rows]

display(sparse_interaction_matrix, Counter(sparse_interaction_matrix.data))

<929x11967 sparse matrix of type '<class 'numpy.float64'>'
	with 57983 stored elements in Compressed Sparse Row format>

Counter({-50136.92614657324: 20,
         -43376.7729255022: 6,
         -33384.66784487353: 6,
         -28199.286432204426: 5,
         -43609.11614859634: 5,
         -25331.579496411545: 5,
         -30766.932397031716: 5,
         -31910.6612137275: 5,
         -57280.07977957959: 5,
         -43278.621628561516: 5,
         -43602.64146983172: 4,
         -35527.19145449245: 4,
         -36181.076343751025: 4,
         -88972.88051475213: 4,
         -24327.09260838026: 4,
         -56107.59935670775: 4,
         -30952.15319699723: 4,
         -34047.05696553778: 4,
         -72916.60440801409: 4,
         -58163.83097830188: 4,
         -44757.64402302076: 4,
         -30305.293328576092: 4,
         -24006.123025530655: 4,
         -42412.34237010692: 4,
         -56131.79809414569: 4,
         -39873.27317953922: 4,
         -67252.80426561808: 4,
         -39540.43736507866: 3,
         -36211.507083011384: 3,
         -36260.740416344684: 3,
         -33904.28134610991: 3,


In [15]:
# # Сохранение разреженной матрицы в файл
# save_npz('/home/artermiloff/Datasets/Steam/sparse_interaction_matrix.npz', sparse_interaction_matrix)

# Create train/test split

In [16]:
train_interaction_matrix, test_interactions = train_test_split(sparse_interaction_matrix, 
                                                               test_size=TEST_SIZE,
                                                               index_to_appid=INDEX_TO_APPID,
                                                               seed=42)

100%|██████████████████████████████████████| 929/929 [00:00<00:00, 18362.26it/s]


In [17]:
train_interaction_matrix

<929x11967 sparse matrix of type '<class 'numpy.float64'>'
	with 41011 stored elements in Compressed Sparse Row format>

# Algorithm Evaluation

In [18]:
K_VALUES = [1, 5, 10, 25, 50, 100]
N_JOBS = -1

In [19]:
game_info_df['categories'] = [row.split(',') if row is not np.nan else np.nan 
                              for row in game_info_df['Categories']]
game_info_df['genres'] = [row.split(',') if row is not np.nan else np.nan 
                          for row in game_info_df['Genres']]

In [20]:
random.seed(42)
NUM_SUBSAMPLE = 500 #43_000
metrics_steamids_sample = [it for it in random.sample(valid_steamids_list, NUM_SUBSAMPLE
                                              ) if STEAMID_TO_INDEX[it] in test_interactions]
metrics_steamids_full = [it for it in valid_steamids_list if STEAMID_TO_INDEX[it] in test_interactions]

In [21]:
recommendations_user_user_cf = Parallel(N_JOBS, verbose=1)(delayed(user_user_collaborative_filtering)(
    train_interaction_matrix, STEAMID_TO_INDEX[steamid], k_neighbors=100
) for steamid in metrics_steamids_sample)
recommendations_user_user_cf = [
    [INDEX_TO_APPID[ind] for ind in user_rec] for user_rec in recommendations_user_user_cf
]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 465 out of 465 | elapsed:    1.4s finished


In [22]:
recommendations_item_item_cf = Parallel(N_JOBS, verbose=1)(delayed(item_item_collaborative_filtering)(
    similar_games_df, 
    [INDEX_TO_APPID[i] for i in train_interaction_matrix[STEAMID_TO_INDEX[steamid], :].indices], 200
) for steamid in metrics_steamids_full)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 929 out of 929 | elapsed:    0.9s finished


In [23]:
recommendations_new = list()
for steamid in metrics_steamids_sample:
    recommendations_new.append(get_new_recommendations(game_info_df, train_interaction_matrix, 
                                                       STEAMID_TO_INDEX[steamid], INDEX_TO_APPID, 200))

In [26]:
recommendations_popular = list()
for steamid in metrics_steamids_sample:
    recommendations_popular.append(get_popular_recommendations(game_info_df, train_interaction_matrix, 
                                                           STEAMID_TO_INDEX[steamid], INDEX_TO_APPID, 200))

In [27]:
for alg_name, recommendations_list, steamids_list in [
    ("User-based Collaborative Filtering", recommendations_user_user_cf, metrics_steamids_sample),
    ("Item-based Collaborative Filtering", recommendations_item_item_cf, metrics_steamids_full),
    ("Popular Non-Personalized", recommendations_popular, metrics_steamids_full),
    ("New Non-Personalized", recommendations_new, metrics_steamids_full),
]:
    print(alg_name)
    ap_dict, ndcg_dict, re_dict, pr_dict = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)
    for steamid, predicted in tqdm.tqdm(zip(steamids_list, recommendations_list)):
        actual = test_interactions[STEAMID_TO_INDEX[steamid]]

        for k in K_VALUES:
            re_dict[k].append(recall_at_k(actual, predicted, k))
            pr_dict[k].append(precision_at_k(actual, predicted, k))
            ap_dict[k].append(average_precision_at_k(actual, predicted, k))
            ndcg_dict[k].append(ndcg_at_k(actual, predicted, k))
            
    for k in K_VALUES:
        print("K = ", k)
        print("mP@K\t", round(np.mean(ap_dict[k]), 3))
        print("mR@K\t", round(np.mean(ndcg_dict[k]), 3))
        print("mAP@K\t", round(np.mean(ap_dict[k]), 3))
        print("nDCG@K\t", round(np.mean(ndcg_dict[k]), 3))
    print()

User-based Collaborative Filtering


465it [00:00, 9198.30it/s]


K =  1
mP@K	 0.006
mR@K	 0.006
mAP@K	 0.006
nDCG@K	 0.006
K =  5
mP@K	 0.004
mR@K	 0.009
mAP@K	 0.004
nDCG@K	 0.009
K =  10
mP@K	 0.003
mR@K	 0.01
mAP@K	 0.003
nDCG@K	 0.01
K =  25
mP@K	 0.002
mR@K	 0.012
mAP@K	 0.002
nDCG@K	 0.012
K =  50
mP@K	 0.002
mR@K	 0.012
mAP@K	 0.002
nDCG@K	 0.012
K =  100
mP@K	 0.002
mR@K	 0.012
mAP@K	 0.002
nDCG@K	 0.012

Item-based Collaborative Filtering


929it [00:00, 10763.39it/s]


K =  1
mP@K	 0.025
mR@K	 0.025
mAP@K	 0.025
nDCG@K	 0.025
K =  5
mP@K	 0.031
mR@K	 0.071
mAP@K	 0.031
nDCG@K	 0.071
K =  10
mP@K	 0.034
mR@K	 0.104
mAP@K	 0.034
nDCG@K	 0.104
K =  25
mP@K	 0.032
mR@K	 0.116
mAP@K	 0.032
nDCG@K	 0.116
K =  50
mP@K	 0.032
mR@K	 0.116
mAP@K	 0.032
nDCG@K	 0.116
K =  100
mP@K	 0.032
mR@K	 0.116
mAP@K	 0.032
nDCG@K	 0.116

Popular Non-Personalized


465it [00:00, 14870.36it/s]


K =  1
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  5
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  10
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  25
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  50
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  100
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0

New Non-Personalized


465it [00:00, 14578.58it/s]

K =  1
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  5
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  10
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  25
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  50
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0
K =  100
mP@K	 0.0
mR@K	 0.0
mAP@K	 0.0
nDCG@K	 0.0




