# Recommendation Systems for Speedrun.com

# Recommendation by User ID using Bipartite Graph

This method of recommendation works by finding other users that have played the same game as a target user. The games that the other users have played are ranked by occurrence. Games that the target user has not played but the other users have played are then recommended in order.

In [3]:
import pandas as pd
import csv
import networkx as nx
import numpy as np

from collections import defaultdict, Counter
from operator import itemgetter
from datetime import datetime
from tqdm import tqdm

import scipy
import random

In [53]:
user_prefs_filename = "../data/users/user_preferences_with_metadata.csv"
user_prefs_df = pd.read_csv(user_prefs_filename)
user_prefs_df = user_prefs_df[(user_prefs_df['signup_date'].notna()) & (user_prefs_df['signup_date'] != "Null")]
user_prefs_df['signup_date'] = pd.to_datetime(user_prefs_df['signup_date'], format='%Y-%m-%dT%H:%M:%SZ')
user_prefs_df['signup_date'] = pd.to_datetime(user_prefs_df['signup_date'].dt.strftime('%Y-%m-%d'))
user_prefs_df = user_prefs_df[(user_prefs_df['signup_date'] < '2023-01-01')]

In [54]:
user_prefs_df.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,user,signup_date,location,num_games,games
count,335322,335322,335322,335322.0,335322
unique,335322,,257,,88806
top,j5wzz2qj,,us,,k6q4rqzd
freq,1,,101439,,5131
mean,,2020-06-28 13:40:53.271780608,,1.994465,
min,,2014-01-06 00:00:00,,1.0,
25%,,2019-09-16 00:00:00,,1.0,
50%,,2021-01-04 00:00:00,,1.0,
75%,,2021-09-25 00:00:00,,2.0,
max,,2022-12-31 00:00:00,,2059.0,


In [55]:
exploded_games_df = user_prefs_df.copy()
exploded_games_df['games'] = exploded_games_df['games'].str.split(',')
exploded_games_df = exploded_games_df.explode('games').rename(columns = {'games': 'game_id', 'user':'user_id'})[['user_id', 'game_id']]

In [56]:
exploded_games_df.describe()

Unnamed: 0,user_id,game_id
count,668788,668788
unique,335322,31425
top,kj9521v8,k6q4rqzd
freq,2059,6979


In [57]:
bipartite_graph = nx.Graph()

# Users have a bipartite value of 0, games have a bipartite value of 1.
bipartite_graph.add_nodes_from(set(exploded_games_df['user_id'].values), bipartite=0)
bipartite_graph.add_nodes_from(set(exploded_games_df['game_id'].values), bipartite=1)
bipartite_graph.add_edges_from([(user, game) for user, game in zip(exploded_games_df['user_id'], exploded_games_df['game_id'])])

In [58]:
nx.is_bipartite(bipartite_graph)

True

In [59]:
del user_prefs_filename, user_prefs_df, exploded_games_df

# Overlapping Set Similarity with Limiting the Graph

There are two methods of limiting the number of user-item interactions in our bipartite graph. We can either use the mean and standard deviation of the `num_games` column, or limit based on the integer number of games played by a given user. For example, we can either use three standard deviations of the mean to have a cutoff value of `24.2 (3 s.f.)`, or we can use the value of `2` for users that have played only one game.

Using the method of standard deviations, we get a very similar output to the unlimited user-item interaction bipartite graph. We get popular games recommended most of the time. If we use the second approach, we get (anecdotally) more precise recommendations. However, the second method does not scale well, since we need to construct a different graph for each number of games played by each user. In reality, this isn't as bad as we think. Out of the 335,322 total users in our sample we can cover 306,371 users, or 91.4% (3 s.f.) of them with three graphs. 

In [60]:
def clean_user_preferences_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df[(df['signup_date'].notna()) & (df['signup_date'] != "Null")]
    df['signup_date'] = pd.to_datetime(df['signup_date'], format='%Y-%m-%dT%H:%M:%SZ')
    df['signup_date'] = pd.to_datetime(df['signup_date'].dt.strftime('%Y-%m-%d'))
    df = df[(df['signup_date'] < '2023-01-01')]
    return df

def limit_number_games_user_preferences_df(df: pd.DataFrame, num_games: int) -> pd.DataFrame:
    return df[(df['num_games'] <= num_games)]

def explode_games_played(df: pd.DataFrame) -> pd.DataFrame:
    df['games'] = df['games'].str.split(',')
    return df.explode('games').rename(columns = {'games': 'game_id', 'user':'user_id'})

def recommendation_graph_for_n_games_played(df: pd.DataFrame, n: int) -> tuple[pd.DataFrame, nx.Graph]:
    df = clean_user_preferences_df(df)
    df = limit_number_games_user_preferences_df(df, n+1)
    df = explode_games_played(df)
    bipartite_graph = nx.Graph()
    bipartite_graph.add_nodes_from(set(df['user_id'].values), bipartite=0)
    bipartite_graph.add_nodes_from(set(df['game_id'].values), bipartite=1)
    bipartite_graph.add_edges_from([(user, game) for user, game in zip(df['user_id'], df['game_id'])])
    return df, bipartite_graph

In [61]:
user_prefs_df = pd.read_csv('../data/users/user_preferences_with_metadata.csv')
user_prefs_df, bipartite_graph = recommendation_graph_for_n_games_played(user_prefs_df, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['signup_date'] = pd.to_datetime(df['signup_date'], format='%Y-%m-%dT%H:%M:%SZ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['signup_date'] = pd.to_datetime(df['signup_date'].dt.strftime('%Y-%m-%d'))


In [62]:
def user_similarity(bipartite_graph: nx.Graph, total_item_nodes: int, user_a_id: str, user_b_id: str) -> float:
    assert bipartite_graph.nodes[user_a_id]['bipartite'] == 0
    assert bipartite_graph.nodes[user_b_id]['bipartite'] == 0

    a_neighbours = bipartite_graph.neighbors(user_a_id)
    b_neighbours = bipartite_graph.neighbors(user_b_id)
    shared_nodes = set(a_neighbours).intersection(b_neighbours)

    return len(shared_nodes) / total_item_nodes

def most_similar_users(bipartite_graph: nx.Graph, user_id: str) -> tuple[list[str], float]:
    all_users = set([user for user, value in bipartite_graph.nodes(data=True) if value['bipartite'] == 0])
    all_users.remove(user_id)

    total_item_nodes = 0
    for _, values in bipartite_graph.nodes(data=True):
        if values['bipartite'] == 1: total_item_nodes += 1

    similarities = defaultdict(float)
    for user in all_users:
        similarities[user] = user_similarity(bipartite_graph, total_item_nodes, user_id, user)

    max_similarity = max(similarities.values())
    return [user for user, similarity in similarities.items() if similarity == max_similarity], max_similarity

def recommend_games(bipartite_graph: nx.Graph, user_id: str) -> list[str]:
    similar_users, _ = most_similar_users(bipartite_graph, user_id)
    other_games = [game for user in similar_users for game in bipartite_graph.neighbors(user)]
    game_rankings = Counter(other_games)

    already_played_games = set(bipartite_graph.neighbors(user_id))

    try:
        [game_rankings.pop(game) for game in already_played_games]
    except KeyError:
        # If no other users in data set have played this game.
        pass

    ranked_games_in_order, _ = list(zip(*sorted(game_rankings.items(), key=itemgetter(1), reverse=True)))
    
    return ranked_games_in_order

In [63]:
games_metadata_df = pd.read_csv('../data/games/metadata/all_games.csv').rename(columns={'game_id': 'id'})

In [64]:
user = "x355n6qj"

played_games = list(bipartite_graph.neighbors(user))
print(f"Played games: {games_metadata_df[(games_metadata_df['id'].isin(played_games))].game_name.values}")

recommended_games = recommend_games(bipartite_graph, user)
print(f"Recommended games: {games_metadata_df.set_index('id').loc(axis=0)[recommended_games].game_name.values[:20]}")

Played games: ['Hello Neighbor' 'Super Mario Odyssey']
Recommended games: ["Baldi's Basics Category Extensions" 'Google Quick Draw'
 'Snipperclips: Cut it out  together!' 'Hello Neighbor 2' 'Cuphead'
 'Super Mario Sunshine' 'Marble Saga: Kororinpa'
 'The Legend of Zelda: The Wind Waker HD' 'Clicker Heroes'
 'Minecraft: Java Edition' 'Island Saver'
 'Super Mario Odyssey Category Extensions']


In [65]:
del games_metadata_df, played_games, recommended_games, user_prefs_df, bipartite_graph, user

# Recommendation Using a Game Similarity Matrix

This recommendation method works by creating a matrix of how users have rated different games. We construct this matrix by rating a game `1` if a user has played it, and `0` if not. We then normalise these values by making the sum of ratings by each user equal to `1`. This is also called [scaling to a unit length](https://en.wikipedia.org/wiki/Feature_scaling#Scaling_to_unit_length). We take the dot product of the matrix and the transposed matrix, and this gives us the similarity between each item in the data set. The method is taken from [here](https://towardsdatascience.com/recommender-systems-item-customer-collaborative-filtering-ff0c8f41ae8a). **This method does not scale very well**.

In [2]:
user_prefs_df = pd.read_csv('../data/users/user_preferences_with_metadata.csv')

In [3]:
def format_user_prefs_df_to_ratings(df: pd.DataFrame, number_users=-1) -> pd.DataFrame:
    tmp_df = df.copy()[:number_users]
    tmp_df['games'] = tmp_df['games'].str.split(',')
    tmp_df = tmp_df.explode('games').rename(columns = {'games': 'game_id', 'user':'user_id'})
    tmp_df['rating'] = 1
    return tmp_df[['user_id', 'game_id', 'rating']]

def construct_similarity_matrix(df: pd.DataFrame) -> pd.DataFrame:
    tmp_df = df.copy()
    tmp_df = pd.pivot_table(tmp_df, values='rating', index='user_id', columns='game_id')
    tmp_df = tmp_df.fillna(0.0)
    normalised_tmp_df = tmp_df / np.sqrt(np.square(tmp_df).sum(axis=0))
    return normalised_tmp_df.transpose().dot(normalised_tmp_df)

def construct_similar_games_df(df: pd.DataFrame, number_games: int) -> pd.DataFrame:
    similar_games_df = pd.DataFrame(index=df.columns, columns=range(0, number_games + 1))
    for i in range(0,len(df.columns)): 
        similar_games_df.iloc[i,:number_games+1] = df.iloc[0:,i].sort_values(ascending=False)[:number_games+1].index
    return similar_games_df.loc[:,1:]

def find_similar_games(df: pd.DataFrame, top_n_games=10):
    similarity_matrix = construct_similarity_matrix(df)
    similar_games_df = construct_similar_games_df(similarity_matrix, top_n_games)
    return similar_games_df

In [4]:
similar_games_df = pd.DataFrame()
generate = False

if generate:
    ratings_df = format_user_prefs_df_to_ratings(user_prefs_df, number_users=70000)
    similar_games_df = find_similar_games(ratings_df, 20)
    similar_games_df.to_csv('./test.csv')
else:
    similar_games_df = pd.read_csv('../data/users/similarity_recommendations_70000_users.csv').set_index('game_id')

In [5]:
len(similar_games_df.index)

28541

In [5]:
games_metadata_df = pd.read_csv('../data/games/metadata/all_games.csv').rename(columns={'game_id': 'id'})

In [7]:
game_id = "kdkmzmx1"
print(f"Played Game: {games_metadata_df.set_index('id').loc(axis=0)[game_id].game_name}")
recommended_games = similar_games_df.loc[game_id].values
print(f"Recommended games: {games_metadata_df.set_index('id').loc(axis=0)[recommended_games].game_name.values}")

Played Game: LEGO Harry Potter: Years 5-7 (DS/PSP)
Recommended games: ['LEGO Indiana Jones 2: The Adventure Continues (DS)'
 'LEGO Star Wars II: The Original Trilogy (PSP)' 'Escape PS1 Hagrid'
 'LEGO Star Wars III: The Clone Wars (DS)'
 'LEGO Pirates of the Caribbean: The Video Game (DS)' 'Harry Obby'
 'Demon Shift' 'Jumpix 2' 'Infinity Inc.' 'Kim Possible 3: Team Possible'
 'Hannah Montana' 'Zubo' 'Multiple Handheld LEGO Games'
 'LEGO Harry Potter: Years 1-4 (DS/PSP)' 'LEGO Batman: The Videogame (DS)'
 'LEGO Star Wars II: The Original Trilogy (DS)'
 'LEGO Indiana Jones: The Original Adventures (DS)'
 'LEGO Harry Potter Category Extensions'
 'LEGO Star Wars II: The Original Trilogy (GBA)'
 'LEGO Star Wars: The Complete Saga (DS)']


In [8]:
del game_id, generate, recommended_games, similar_games_df, user_prefs_df, games_metadata_df

# Recommendation using Node2Vec Embeddings

The idea behind using node2vec embeddings for recommendation is to predict future links that don't already exist. We can prove that this works for individual games recommendation by removing selected edges and using cosine similarity of embeddings to predict which edges should exist given this graph. We carry this on further by creating a pipeline to predict games to play when they are completely disconnected.

In [74]:
def generate_network_filter(filename: str, disallowed_games=None) -> dict[str, bool]:
    with open(filename, 'r', encoding='utf-8') as openfile:
        csv_reader = csv.reader(openfile)
        next(csv_reader)
        filter_map = defaultdict(bool)
        for row in csv_reader:
            # Check if the created/release date is after 2023, if it is then we can ignore it in the network.
            release_date = datetime.strptime(row[3], "%Y-%m-%d")
            if row[4] == "None":
                # This is a completely random date before the final date.
                row[4] = "2017-10-22T05:21:29Z"
            created_date = datetime.strptime(row[4], "%Y-%m-%dT%H:%M:%SZ")

            if disallowed_games == None:
                disallowed_games = ["y65797de"]

            if created_date < datetime(2023, 1, 1) and release_date < datetime(2023, 1, 1) and row[0] not in disallowed_games:
                filter_map[row[0]] = True

    return filter_map

def get_weighted_edges_from_csv(filename: str, filter=None) -> list[tuple[str, str, int]]:
    with open(filename, 'r', encoding='utf-8') as openfile:
        csv_reader = csv.reader(openfile)
        next(csv_reader)

        edges = list()
        for row in csv_reader:
            if filter is None:
                edges.append(tuple([row[0], row[1], int(row[2])]))
                continue

            if not filter.get(row[0]) or not filter.get(row[1]):
                continue

            edges.append(tuple([row[0], row[1], int(row[2])]))

    return edges

def create_weighted_graph(graph_filename: str, filter_filename: str):
    filter_map = generate_network_filter(filter_filename)
    edgelist = get_weighted_edges_from_csv(graph_filename, filter_map)
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(edgelist)
    return graph

def generate_adjacency_matrix(graph: nx.DiGraph): 
    return nx.adjacency_matrix(graph, list(graph.nodes()))

def generate_non_existing_edges(adj_graph: scipy.sparse.csr_matrix, node_list: list[str], number_samples: int):
    non_existing_edges = []
    offset = 0
    for i in tqdm(range(adj_graph.shape[0])):
        for j in range(offset, adj_graph.shape[1]):
            if i != j:
                if adj_graph[i, j] == 0:
                    non_existing_edges.extend([node_list[i], node_list[j]])

        offset += 1
    return sorted(random.sample(non_existing_edges, k=number_samples))

def get_removable_edges(graph: nx.DiGraph):
    number_conected_components = nx.number_weakly_connected_components(graph)
    number_nodes = len(graph.nodes())
    tmp_graph = graph.copy()
    removable_edges = []
    for i in tqdm(list(tmp_graph.edges())):
        tmp_graph.remove_edge(i)

        if nx.number_weakly_connected_components(tmp_graph) == number_conected_components and \
            len(tmp_graph.nopdes()) == number_nodes:
            removable_edges.append(i)
            continue

        tmp_graph.add_edge(i)
    return removable_edges

In [75]:
adj_graph = None
games_graph = create_weighted_graph('../data/too_big/all_games_filtered.csv', '../data/games/metadata/all_games.csv')
generate = False

if generate:
    adj_graph = generate_adjacency_matrix(games_graph)
    scipy.sparse.save_npz('../data/too_big/all_games_adjacency_matrix.npz', adj_graph)
else:
    adj_graph = scipy.sparse.load_npz('../data/too_big/all_games_adjacency_matrix.npz')
print(adj_graph.shape)

(30433, 30433)


In [None]:
non_existing_edges = generate_non_existing_edges(adj_graph, list(games_graph.nodes()), 4000)
non_existing_edges_df = pd.DataFrame(data=non_existing_edges, columns=['source', 'target'])
non_existing_edges_df['connection'] = 0

In [None]:
removable_edges = get_removable_edges(games_graph)
removable_edges_df = pd.DataFrame(data=removable_edges, columns=['source', 'target'])
removable_edges_df['connection'] = 1

In [None]:
dataset = non_existing_edges_df.append(removable_edges_df, ignore_index=True)
graph_without_edges = games_graph.copy().remove_edges_from(removable_edges)