Authors:
Björn Lindgren,
Fredrik Askeroth

Source code used in the thesis: "Enhancing Board Game Recommendations: Leveraging K-nearest neighbors in Collaborative Filtering"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_absolute_error
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

### Data preprocessing

In [None]:
# Load the raw ratings and the game metadata, then attach each game's name
# to its ratings. The left join keeps every rating row, even when the game
# is missing from games.csv (its 'Name' is then null).
df_ratings = pd.read_csv("dataset/user_ratings.csv")
df_games = pd.read_csv("dataset/games.csv")
df = df_ratings.merge(df_games[['BGGId', 'Name']], how='left', on='BGGId')

In [None]:
df

In [None]:
null_counts = df.isnull().sum()
print(null_counts)

In [None]:
df = df.dropna()

In [None]:
df

In [None]:
#Check if any rows are duplicates
duplicates = df.duplicated().value_counts()
print(duplicates)

Remove duplicate ratings for the same game by the same user (caused, for example, by different versions of the game)

In [None]:
# Collapse repeated (Username, BGGId) ratings into a single row per pair:
# 1. Compute the mean rating of every (Username, BGGId) pair.
# 2. Merge that mean back onto the rows as 'Rating_mean'.
# 3. Keep exactly one row per pair. keep='first' retains one representative
#    row; keep=False would discard every row of a duplicated pair, so the
#    mean computed in step 1 would never be used for those pairs.
# 4. Replace the original 'Rating' with the mean and move it to the second
#    column.
mean_ratings = df.groupby(['Username', 'BGGId'])['Rating'].mean()
df_removed_duplicates = df.merge(mean_ratings,
                                 on=['Username', 'BGGId'],
                                 suffixes=('', '_mean'))
df_unique = df_removed_duplicates.drop_duplicates(
    subset=['Username', 'BGGId'], keep='first')
df_dropped = df_unique.drop(columns=['Rating']).rename(
    columns={'Rating_mean': 'Rating'})
column_order = ['BGGId', 'Rating', 'Username', 'Name']
df_dropped = df_dropped.reindex(columns=column_order)


Calculate each user's average rating, then add columns for the average rating and the mean-adjusted rating

In [None]:
# Mean-centre every rating on its user's own average:
#   Rating_avg      - mean of the user's 'Rating' values in df_dropped
#   Adjusted_Rating - 'Rating' minus 'Rating_avg'
user_avg_ratings = (
    df_dropped.groupby('Username')['Rating']
    .mean()
    .reset_index()
)
df_adjusted_mean = df_dropped.merge(
    user_avg_ratings, on='Username', suffixes=('', '_avg'))
df_adjusted_mean['Adjusted_Rating'] = df_adjusted_mean['Rating'].sub(
    df_adjusted_mean['Rating_avg'])
column_order = [
    'BGGId',
    'Rating',
    'Rating_avg',
    'Adjusted_Rating',
    'Username',
    'Name',
]
df_adjusted_mean = df_adjusted_mean.reindex(columns=column_order)

In [None]:
df_adjusted_mean

Filter dataset

In [None]:
# Group by 'Username' and count the number of rows for each group
grouped_user_counts = df_adjusted_mean.groupby('Username').size()
grouped_user_counts

In [None]:
# Group by 'BGGId' and count the number of rows for each group
grouped_counts = df_adjusted_mean.groupby('BGGId').size()
grouped_counts

In [None]:
# Drop, in place, every rating that belongs to a game with fewer than
# `game_ratings` ratings.
game_ratings = 3000
rating_counter = df_adjusted_mean['BGGId'].value_counts()
filtered_out = rating_counter.index[
    (rating_counter < game_ratings).to_numpy()]
games_mask = df_adjusted_mean['BGGId'].isin(filtered_out)
df_adjusted_mean.drop(
    index=df_adjusted_mean.index[games_mask.to_numpy()], inplace=True)

In [None]:
df_adjusted_mean

In [None]:
# Drop, in place, every rating that belongs to a user with fewer than
# `user_ratings` remaining ratings.
user_ratings = 100
rating_counter = df_adjusted_mean['Username'].value_counts()
filtered_out = rating_counter.index[
    (rating_counter < user_ratings).to_numpy()]
user_mask = df_adjusted_mean['Username'].isin(filtered_out)
df_adjusted_mean.drop(
    index=df_adjusted_mean.index[user_mask.to_numpy()], inplace=True)

In [None]:
df_adjusted_mean

In [None]:
# Group by 'Username' and count the number of rows for each group
grouped_user_counts = df_adjusted_mean.groupby('Username').size()
grouped_user_counts

In [None]:
# Group by 'BGGId' and count the number of rows for each group
grouped_counts = df_adjusted_mean.groupby('BGGId').size()
grouped_counts

In [None]:
grouped_by_username = df_adjusted_mean.groupby('Username').size().sort_values(ascending=False)
grouped_by_username

In [None]:
df_adjusted_mean

In [None]:
# Lookup table: username -> that user's 'Rating_avg' value.
average_ratings = (
    df_adjusted_mean.set_index('Username')['Rating_avg'].to_dict()
)

Splitting the dataset into training and test set

In [None]:
def user_split(data, random_state, test_size=0.2):
    """Split ratings into train and test sets separately for every user.

    The rows of each 'Username' group are split on their own with
    ``train_test_split``, so every user contributes ``test_size`` of their
    rows to the test set and the rest to the training set.

    Args:
        data: DataFrame with a 'Username' column.
        random_state: Seed passed to every per-user split.
        test_size: Fraction of each user's rows placed in the test set.

    Returns:
        Tuple ``(train, test)`` of DataFrames with fresh 0..n-1 indices.
    """
    per_user_splits = [
        train_test_split(
            user_rows,
            test_size=test_size,
            train_size=1-test_size,
            random_state=random_state,
        )
        for _, user_rows in data.groupby('Username')
    ]
    train_parts = [parts[0] for parts in per_user_splits]
    test_parts = [parts[1] for parts in per_user_splits]

    train = pd.concat(train_parts, ignore_index=True)
    test = pd.concat(test_parts, ignore_index=True)
    return train, test

In [None]:
# Two specific random states were used when splitting the dataset, for
# reproducibility. The active call uses random_state=0; the commented-out
# call below is the alternative split with random_state=1.
train_set, test_set = user_split(data=df_adjusted_mean, 
                                 random_state=0, test_size=0.2)
# train_set, test_set = user_split(data=df_adjusted_mean, 
#                                 random_state=1, test_size=0.2)

In [None]:
train_set

In [None]:
test_set

Create data structures for efficient computations

In [None]:
# Used to calculate MAE.
# Nested lookup for the test set: username -> {BGGId: rating}.
actual_ratings_test_set = {}
for username, bggid, rating in zip(test_set['Username'],
                                   test_set['BGGId'],
                                   test_set['Rating']):
    actual_ratings_test_set.setdefault(username, {})[bggid] = rating

In [None]:
# Used to calculate predicted rating.
# Nested lookup for the training set: username -> {BGGId: rating}.
actual_ratings_train_set = {}
for username, bggid, rating in zip(train_set['Username'],
                                   train_set['BGGId'],
                                   train_set['Rating']):
    actual_ratings_train_set.setdefault(username, {})[bggid] = rating

In [None]:
# Used to calculate MAE.
# username -> list of BGGIds the user rated in the test set (row order).
rated_items_test_set = {}
for username, bggid in zip(test_set['Username'], test_set['BGGId']):
    rated_items_test_set.setdefault(username, []).append(bggid)

In [None]:
# Used to calculate predicted rating.
# username -> list of BGGIds the user rated in the training set (row order).
rated_items_train_set = {}
for username, bggid in zip(train_set['Username'], train_set['BGGId']):
    rated_items_train_set.setdefault(username, []).append(bggid)

In [None]:
# Used in precision and recall calculations.
# username -> list of BGGIds the user rated >= 7 in the test set.
# Every test-set user gets an entry: users without any rating >= 7 map to an
# empty list, so a later relevant_items[user] lookup never raises KeyError.
relevant_items_df = test_set.loc[test_set['Rating'] >= 7]
relevant_items = {username: [] for username in test_set['Username'].unique()}

for username, bggid in relevant_items_df.groupby('Username')['BGGId']:
    relevant_items[username] = list(bggid)

In [None]:
# Used to recommend items for evaluation.
# username -> list of BGGIds that appear in the user's test set.
items_test_set = {
    username: list(bggid)
    for username, bggid in test_set.groupby('Username')['BGGId']
}

### Similarity Calculation

In [None]:
# Pivot the training set into user-item matrices: rows represent users
# ('Username') and columns represent items ('BGGId').
# - user_item_matrix holds the raw 'Rating'; missing entries are filled with 0.
# - user_item_matrix_with_nan holds 'Adjusted_Rating'; missing entries stay NaN.
user_item_matrix = train_set.pivot(index='Username', 
                                   columns='BGGId', 
                                   values='Rating').fillna(0)
user_item_matrix_with_nan = train_set.pivot(index='Username', 
                                            columns='BGGId', 
                                            values='Adjusted_Rating')

In [None]:
# Used to recommend items.
# username -> list of BGGIds whose entry in the user's row of
# user_item_matrix_with_nan is NaN.
not_rated_items_train_set = {
    username: row.index[row.isna().to_numpy()].tolist()
    for username, row in user_item_matrix_with_nan.iterrows()
}


In [None]:
# Flip both user-item matrices so that rows are items and columns are users.
item_user_matrix = user_item_matrix.transpose()
item_user_matrix_adjusted = user_item_matrix_with_nan.transpose()

In [None]:
def calculate_adjusted_cosine_similarity(matrix_transposed):
    """Return the pairwise cosine similarity between the rows of a matrix.

    For every pair of rows (i, j) the cosine similarity is computed only over
    the columns where both rows have a value (are not NaN). Pairs without any
    shared column, and pairs where either restricted vector has zero norm,
    get similarity 0.

    All pairs are computed at once with sparse matrix products instead of a
    Python loop over every pair:

    * dot(i, j)      = (Z @ Z.T)[i, j]       with Z = values, NaN replaced by 0
    * norm_i(j) ** 2 = (Z**2 @ B.T)[i, j]    with B = 1 where a value exists
    * similarity     = dot / sqrt(norm_i(j)**2 * norm_j(i)**2)

    Args:
        matrix_transposed: DataFrame whose rows are compared with each other;
            missing entries are NaN.

    Returns:
        Symmetric ``numpy.ndarray`` of shape (n_rows, n_rows) with values
        clipped to [-1, 1].
    """
    values = np.asarray(matrix_transposed.values, dtype=float)
    observed = ~np.isnan(values)

    # Z: values with missing entries as (implicit) zeros; B: observation mask.
    ratings = csr_matrix(np.where(observed, values, 0.0))
    mask = csr_matrix(observed.astype(float))

    dot_products = (ratings @ ratings.T).toarray()
    # squared_norms[i, j] = squared norm of row i over the columns row j has.
    squared_norms = (ratings.multiply(ratings).tocsr() @ mask.T).toarray()
    denominators = np.sqrt(squared_norms * squared_norms.T)

    similarity_matrix = np.zeros_like(dot_products, dtype=float)
    np.divide(dot_products, denominators, out=similarity_matrix,
              where=denominators > 0)
    # Rounding can push a value marginally outside [-1, 1]; clip so that
    # 1 - similarity never becomes a (tiny) negative distance.
    np.clip(similarity_matrix, -1.0, 1.0, out=similarity_matrix)

    return similarity_matrix

In [None]:
# Calculate item similarity between the rows (items) of
# item_user_matrix_adjusted.
# Set the similarity of each item to itself to -5, so that it is lower than
# the similarity to every other item. This is done on the NumPy array before
# it is wrapped in a DataFrame: writing through DataFrame.values is not
# guaranteed to modify the frame (it may be a copy or a read-only view).
# Create a DataFrame with the item ids as both rows and columns.
# Convert similarities to distances (distance = 1 - similarity).
item_similarity = calculate_adjusted_cosine_similarity(
    item_user_matrix_adjusted)
np.fill_diagonal(item_similarity, -5)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix_adjusted.index,
    columns=item_user_matrix_adjusted.index)
distance_matrix = 1 - item_similarity_df

### General functions

In [None]:
def find_nearest_neighbors(distance_matrix, BGGId, nearest_neighbors_model):
    """Query the neighbors model with one item's column of distances.

    Args:
        distance_matrix: Object indexable by ``BGGId`` that yields the item's
            distances to the items the model was fitted on.
        BGGId: Identifier of the item to look up.
        nearest_neighbors_model: Fitted model exposing ``kneighbors``.

    Returns:
        Tuple ``(distances, neighbor_indices)`` as returned by ``kneighbors``
        for a single query row.
    """
    query_row = distance_matrix[BGGId]
    result = nearest_neighbors_model.kneighbors(
        [query_row], return_distance=True)
    return result[0], result[1]

In [None]:
def find_rated_neighbors(neighbor_distances, neighbor_indices, rated_items):
    """Keep only the neighbors that appear in ``rated_items``.

    Args:
        neighbor_distances: Distances for one query (first row is used).
        neighbor_indices: Positional row indices into ``item_user_matrix``
            for one query (first row is used).
        rated_items: Collection of item ids to keep.

    Returns:
        Tuple ``(rated_neighbors_distances, rated_neighbors)`` of parallel
        lists, in the original neighbor order.
    """
    labels = item_user_matrix.index[neighbor_indices[0]].tolist()
    kept = [
        (distance, label)
        for distance, label in zip(neighbor_distances[0], labels)
        if label in rated_items
    ]
    rated_neighbors_distances = [distance for distance, _ in kept]
    rated_neighbors = [label for _, label in kept]
    return rated_neighbors_distances, rated_neighbors

In [None]:
def get_predicted_rating(item, username, distance_matrix, nearest_neighbors_model):
    """Predict a user's rating for an item from the user's rated neighbors.

    The prediction is the weighted average of the user's training ratings for
    those nearest neighbors of ``item`` that the user has rated, with
    ``weight = 1 - distance``.

    Falls back to the user's average rating when no neighbor has been rated
    by the user, or when the weights sum to zero (a weight is non-positive
    whenever the distance is >= 1, so weights can cancel out); dividing by a
    zero weight sum would otherwise produce nan/inf.

    Args:
        item: Id of the item to predict a rating for.
        username: User to predict for.
        distance_matrix: Item distance lookup passed to the neighbor search.
        nearest_neighbors_model: Fitted model exposing ``kneighbors``.

    Returns:
        The predicted rating.
    """
    rated_items = rated_items_train_set[username]
    distances, neighbor_indices = find_nearest_neighbors(
        distance_matrix, item, nearest_neighbors_model)
    rated_neighbors_distances, rated_neighbors = find_rated_neighbors(
        distances, neighbor_indices, rated_items)

    numerator = 0
    denominator = 0
    for neighbor, distance in zip(rated_neighbors, rated_neighbors_distances):
        weight = 1 - distance  # transform distance into similarity
        user_rating = actual_ratings_train_set[username][neighbor]
        numerator += weight * user_rating
        denominator += weight

    if denominator == 0:
        # No rated neighbors, or the weights cancelled each other out.
        return average_ratings[username]

    return numerator / denominator

In [None]:
# Main recommendation function
def recommend_items(distance_matrix, username, top_N,
                    nearest_neighbors_model):
    """Return the ``top_N`` items with the highest predicted rating.

    Candidates are the items listed for the user in
    ``not_rated_items_train_set``.

    Returns:
        List of item ids sorted by predicted rating, highest first.
    """
    candidates = list(not_rated_items_train_set[username])
    scores = [
        get_predicted_rating(candidate, username, distance_matrix,
                             nearest_neighbors_model)
        for candidate in candidates
    ]

    ranking = pd.DataFrame(scores, index=candidates,
                           columns=['Predicted rating'])
    ranking.index.name = 'BGGId'
    ranking = ranking.sort_values(by=['Predicted rating'], ascending=False)
    return ranking.head(top_N).index.tolist()

In [None]:
# Adjusted recommendation function for evaluation
def recommend_items_test_set(distance_matrix, username, top_N, nearest_neighbors_model):
    """Return the ``top_N`` test-set items with the highest predicted rating.

    Candidates are the items listed for the user in ``items_test_set``.

    Returns:
        List of item ids sorted by predicted rating, highest first.
    """
    candidates = list(items_test_set[username])
    scores = [
        get_predicted_rating(candidate, username, distance_matrix,
                             nearest_neighbors_model)
        for candidate in candidates
    ]

    ranking = pd.DataFrame(scores, index=candidates,
                           columns=['Predicted rating'])
    ranking.index.name = 'BGGId'
    ranking = ranking.sort_values(by=['Predicted rating'], ascending=False)
    return ranking.head(top_N).index.tolist()

### Evaluation

In [None]:
def calculate_precision(recommended_items, relevant_items):
    """Return the fraction of recommended items that are relevant.

    Args:
        recommended_items: Sequence of recommended item ids.
        relevant_items: Collection of relevant item ids.

    Returns:
        ``hits / len(recommended_items)``, or 0.0 when nothing was
        recommended (instead of raising ZeroDivisionError).
    """
    if len(recommended_items) == 0:
        return 0.0

    true_positives = sum(
        1 for item in recommended_items if item in relevant_items)
    return true_positives / len(recommended_items)


In [None]:
def average_precision(recommended_items, relevant_items):
    """Average the precision measured at every rank that holds a relevant item.

    At each position whose item is relevant, the precision is
    ``hits so far / rank``; the result is the mean of these values over the
    number of hits. Returns 0.0 when either input is empty or when there is
    no hit.
    """
    if not recommended_items or not relevant_items:
        return 0.0

    precisions_at_hits = []
    for rank, item in enumerate(recommended_items, start=1):
        if item not in relevant_items:
            continue
        hits_so_far = len(precisions_at_hits) + 1
        precisions_at_hits.append(hits_so_far / rank)

    if not precisions_at_hits:
        return 0.0

    return sum(precisions_at_hits) / len(precisions_at_hits)

In [None]:
def average_recall(recommended_items, relevant_items):
    """Average the recall measured at every rank that holds a relevant item.

    At each position whose item is relevant, the recall is
    ``hits so far / len(relevant_items)``; the result is the mean of these
    values over the number of hits. Returns 0.0 when either input is empty
    or when there is no hit.
    """
    if not recommended_items or not relevant_items:
        return 0.0

    total_relevant_items = len(relevant_items)
    recalls_at_hits = []
    for item in recommended_items:
        if item not in relevant_items:
            continue
        hits_so_far = len(recalls_at_hits) + 1
        recalls_at_hits.append(hits_so_far / total_relevant_items)

    if not recalls_at_hits:
        return 0.0

    return sum(recalls_at_hits) / len(recalls_at_hits)

In [None]:
def run_map_calculation(users, nearest_neighbors_model, top_N):
    """Return the mean average precision over ``users``.

    Each user's recommendations come from ``recommend_items`` and are scored
    against that user's entry in ``relevant_items``.
    """
    ap_scores = [
        average_precision(
            recommend_items(distance_matrix, user, top_N,
                            nearest_neighbors_model),
            relevant_items[user],
        )
        for user in users
    ]
    return np.sum(ap_scores) / len(users)

In [None]:
# Calculates MAP and MAR on recommendations based on all items
def run_map_mar_calculation(users, nearest_neighbors_model, top_N):
    """Return ``(MAP, MAR)`` over ``users`` using ``recommend_items``.

    Each user's recommendations are scored against that user's entry in
    ``relevant_items`` with ``average_precision`` and ``average_recall``.
    """
    per_user_scores = []
    for user in users:
        recommended = recommend_items(distance_matrix, user,
                                      top_N, nearest_neighbors_model)
        relevant = relevant_items[user]
        per_user_scores.append((
            average_precision(recommended, relevant),
            average_recall(recommended, relevant),
        ))

    ap_scores = [scores[0] for scores in per_user_scores]
    ar_scores = [scores[1] for scores in per_user_scores]
    mean_ap = np.sum(ap_scores) / len(users)
    mean_ar = np.sum(ar_scores) / len(users)
    return mean_ap, mean_ar

In [None]:
# Calculates MAP and MAR on recommendations based on items with known preferences
def run_map_mar_items_test_set(users, nearest_neighbors_model, top_N):
    """Return ``(MAP, MAR)`` over ``users`` using ``recommend_items_test_set``.

    Each user's recommendations are scored against that user's entry in
    ``relevant_items`` with ``average_precision`` and ``average_recall``.
    """
    per_user_scores = []
    for user in users:
        recommended = recommend_items_test_set(distance_matrix, user, top_N,
                                               nearest_neighbors_model)
        relevant = relevant_items[user]
        per_user_scores.append((
            average_precision(recommended, relevant),
            average_recall(recommended, relevant),
        ))

    ap_scores = [scores[0] for scores in per_user_scores]
    ar_scores = [scores[1] for scores in per_user_scores]
    mean_ap = np.sum(ap_scores) / len(users)
    mean_ar = np.sum(ar_scores) / len(users)
    return mean_ap, mean_ar

In [None]:
def calculate_recall(recommended_items, relevant_items):
    """Return the fraction of relevant items that were recommended.

    Args:
        recommended_items: Sequence of recommended item ids.
        relevant_items: Collection of relevant item ids.

    Returns:
        ``hits / len(relevant_items)``, or 0.0 when there are no relevant
        items (instead of raising ZeroDivisionError).
    """
    if len(relevant_items) == 0:
        return 0.0

    true_positives = sum(
        1 for item in recommended_items if item in relevant_items)
    return true_positives / len(relevant_items)

In [None]:
# Calculates precision and recall on recommendations based on all items
def run_precision_recall_calculation(users, nearest_neighbors_model, top_N):
    """Return ``(mean precision, mean recall)`` over ``users``.

    Recommendations come from ``recommend_items`` and are scored against each
    user's entry in ``relevant_items``.
    """
    per_user_scores = []
    for user in users:
        recommended = recommend_items(distance_matrix, user, top_N,
                                      nearest_neighbors_model)
        relevant = relevant_items[user]
        per_user_scores.append((
            calculate_precision(recommended, relevant),
            calculate_recall(recommended, relevant),
        ))

    precision_scores = [scores[0] for scores in per_user_scores]
    recall_scores = [scores[1] for scores in per_user_scores]
    mean_precision = np.sum(precision_scores) / len(users)
    mean_recall = np.sum(recall_scores) / len(users)
    return mean_precision, mean_recall

In [None]:
# Calculates precision and recall on recommendations based on items with known preferences
def run_precision_recall_items_test_set(users, nearest_neighbors_model, top_N):
    """Return ``(mean precision, mean recall)`` over ``users``.

    Recommendations come from ``recommend_items_test_set`` and are scored
    against each user's entry in ``relevant_items``.
    """
    per_user_scores = []
    for user in users:
        recommended = recommend_items_test_set(distance_matrix, user, top_N,
                                               nearest_neighbors_model)
        relevant = relevant_items[user]
        per_user_scores.append((
            calculate_precision(recommended, relevant),
            calculate_recall(recommended, relevant),
        ))

    precision_scores = [scores[0] for scores in per_user_scores]
    recall_scores = [scores[1] for scores in per_user_scores]
    mean_precision = np.sum(precision_scores) / len(users)
    mean_recall = np.sum(recall_scores) / len(users)
    return mean_precision, mean_recall

In [None]:
def calculate_mean_absolute_error(username, distance_matrix, nearest_neighbors_model):
    """Return the MAE between predicted and actual test ratings of one user.

    A rating is predicted for every item listed for the user in
    ``rated_items_test_set`` and compared with the value stored in
    ``actual_ratings_test_set``.
    """
    test_items = rated_items_test_set[username]

    predicted_ratings = [
        get_predicted_rating(item, username, distance_matrix,
                             nearest_neighbors_model)
        for item in test_items
    ]
    actual_ratings = [
        actual_ratings_test_set[username][item] for item in test_items
    ]

    return mean_absolute_error(actual_ratings, predicted_ratings)

In [None]:
def get_user_sample(random_seed, size):
    """Draw ``size`` distinct users from the index of ``user_item_matrix``.

    Seeds NumPy's global random generator with ``random_seed`` and keeps
    drawing random row positions until ``size`` different users have been
    collected, so the same seed always yields the same sample.

    Args:
        random_seed: Seed for ``np.random.seed``.
        size: Number of distinct users to draw.

    Returns:
        Set of user labels taken from ``user_item_matrix.index``.

    Raises:
        ValueError: If ``size`` exceeds the number of users; the sampling
            loop could otherwise never finish.
    """
    num_users = user_item_matrix.shape[0]
    if size > num_users:
        raise ValueError(
            f"Cannot sample {size} distinct users from {num_users} users")

    sample = set()
    np.random.seed(random_seed)
    while len(sample) < size:
        random_user_index = np.random.randint(0, num_users)
        random_user = user_item_matrix.index[random_user_index]
        sample.add(random_user)
    return sample
    

In [None]:
def run_mean_absolute_error_calculation(users,
                                        nearest_neighbors_model):
    """Return the per-user MAE averaged over ``users``."""
    per_user_mae = [
        calculate_mean_absolute_error(
            user, distance_matrix, nearest_neighbors_model)
        for user in users
    ]
    return np.sum(per_user_mae) / len(users)


### Experiments

In [None]:
# Used in all MAE experiments.
# Experiments are performed using random_seed 0 and 1, for two different
# splits of the dataset into training and test set.
user_sample = get_user_sample(1, 1000)
# user_sample = get_user_sample(0, 1000)
k_values = [*range(5, 55, 5), 100]
results_mae_user_sample_1 = {}
for k in k_values:
    # Fit a fresh model on the precomputed item distances for every k.
    nearest_neighbors_model = NearestNeighbors(metric='precomputed',
                                               n_neighbors=k)
    nearest_neighbors_model.fit(distance_matrix.values)
    result = run_mean_absolute_error_calculation(
        user_sample, nearest_neighbors_model)
    results_mae_user_sample_1[k] = result
    print("Mean absolute error with k = ",k ,"and num_users =",
          len(user_sample),":", result)

In [None]:
# Used in MAP and MAR experiment 1 and 2.
# Calculates MAP and MAR on recommendations based on all items.
# Experiments are performed using random_seed 0 and 1.
random_seed = 0
#random_seed = 1
user_sample = get_user_sample(random_seed, 1000)
k_values = [5, 10, 20, 30]
topN = 10
results_map_user_sample_0 = {}
results_mar_user_sample_0 = {}
for k in k_values:
    # Fit a fresh model on the precomputed item distances for every k.
    nearest_neighbors_model = NearestNeighbors(metric='precomputed',
                                               n_neighbors=k)
    nearest_neighbors_model.fit(distance_matrix.values)
    result_map, result_mar = run_map_mar_calculation(
        user_sample, nearest_neighbors_model, topN)
    results_map_user_sample_0[k] = result_map
    results_mar_user_sample_0[k] = result_mar
    print("Mean Average Precision with k = ",k , "and topN =",
          topN, "and num_users =", len(user_sample),":", result_map)
    print("Mean Average Recall with k = ",k ,"and num_users =",
          len(user_sample),":", result_mar)

In [None]:
# Used in MAP and MAR experiment 3 and 4.
# Calculates MAP and MAR on recommendations based on items with known
# preferences.
# Experiments are performed using random_seed 0 and 1.
random_seed = 0
#random_seed = 1
user_sample = get_user_sample(random_seed, 1000)
k_values = [5, 10, 20, 30]
topN = 10
results_map_user_sample_01 = {}
results_mar_user_sample_01 = {}
for k in k_values:
    # Fit a fresh model on the precomputed item distances for every k.
    nearest_neighbors_model = NearestNeighbors(metric='precomputed',
                                               n_neighbors=k)
    nearest_neighbors_model.fit(distance_matrix.values)
    result_map, result_mar = run_map_mar_items_test_set(
        user_sample, nearest_neighbors_model, topN)
    results_map_user_sample_01[k] = result_map
    results_mar_user_sample_01[k] = result_mar
    print("Mean Average Precision with k = ",k , "and topN =", topN, "and num_users =",
          len(user_sample),":", result_map)
    print("Mean Average Recall with k = ",k ,"and num_users =",
          len(user_sample),":", result_mar)