The first half of this notebook explores a board game recommender system that uses TF-IDF and standard scaling (StandardScaler), followed by cosine similarity. The second half explores a movie recommender system built with the Surprise library.

I used an LLM to generate the code. My goal with this work is to get an overview of how recommender systems in general, and the Surprise library in particular, work; to understand the code structure; and to develop a broader understanding of the topic.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from surprise.dump import dump, load
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity
import random
import os
import urllib.request
from collections import defaultdict

# Global plotting style for the notebook: seaborn's "whitegrid" style is set
# first, then matplotlib's "fivethirtyeight" style sheet is applied.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

### board game recommender system

In [None]:
# 1. Import dataset
# The CSV is semicolon-separated. The location defaults to the original
# absolute path but can be overridden with the BGG_DATASET_PATH environment
# variable so the notebook also runs on other machines.
BGG_DATASET_PATH = os.environ.get(
    'BGG_DATASET_PATH',
    '/Users/thyag/Desktop/codes/machine-learning-projects/datasets/bgg_dataset.csv',
)
games = pd.read_csv(BGG_DATASET_PATH, sep=';')

# 2. Convert IDs to numeric; anything unparseable becomes NaN
games['ID'] = pd.to_numeric(games['ID'], errors='coerce')

# 3. Fill missing IDs with unique random 6-digit integers
mask = games['ID'].isna()
num_missing = mask.sum()
existing_ids = set(games['ID'].dropna().astype(int))

# Use a local, seeded generator so the substitute IDs (and therefore the
# DataFrame index labels) are the same on every run, without touching the
# global `random` state.
id_rng = random.Random(0)
new_ids = []
while len(new_ids) < num_missing:
    candidate = id_rng.randint(100000, 999999)
    # Reject candidates that collide with a real or previously generated ID.
    if candidate not in existing_ids:
        new_ids.append(candidate)
        existing_ids.add(candidate)

games.loc[mask, 'ID'] = new_ids

# 4. Convert all IDs to int and set as index (the 'ID' column is kept too)
games['ID'] = games['ID'].astype(int)
games = games.set_index('ID', drop=False)

# 5. Convert 'Year Published' to numeric, filling NaNs with 0
games['Year Published'] = pd.to_numeric(games['Year Published'], errors='coerce').fillna(0).astype(int)

# 6. Replace comma with dot and convert to float for rating and complexity
#    (the file uses a decimal comma, so these columns are read as strings)
for col in ['Rating Average', 'Complexity Average']:
    games[col] = games[col].str.replace(',', '.', regex=False).astype(float)

In [None]:
# Summarise the cleaned DataFrame to sanity-check the conversions above.
games.info()

In [None]:
# Preview the first rows of the cleaned DataFrame.
games.head()

In [None]:
# create text features and numeric features

# Fill NaN values in the text columns *before* combining them: concatenating
# a string with NaN yields NaN, which would leave holes in the text feature.
games['Mechanics'] = games['Mechanics'].fillna('')
games['Domains'] = games['Domains'].fillna('')
games['combined_text'] = games['Mechanics'] + ', ' + games['Domains']
text_features = games['combined_text']

numeric_features = games[['Min Players', 'Max Players', 'Play Time', 'Min Age', 'Complexity Average']]
# StandardScaler passes NaN through, and cosine_similarity rejects NaN input,
# so replace any missing numeric value with its column median. This is a
# no-op when the columns have no missing values.
numeric_features = numeric_features.fillna(numeric_features.median())

# tf-idf for text features
tfidf = TfidfVectorizer()

# standard scaling for numeric features
scaler = StandardScaler()

# fit transformers
tfidf_matrix = tfidf.fit_transform(text_features)
scaled_numeric = scaler.fit_transform(numeric_features)

In [None]:
# Put the TF-IDF columns and the standardised numeric columns side by side,
# giving one combined feature row per game.
combined_features = hstack((tfidf_matrix, scaled_numeric))

# Cosine similarity between every pair of game rows.
similarity = cosine_similarity(combined_features)

In [None]:
# Function to recommend games based on a given game index

def recommender(game_index, top_n=5):
    """Return the games most similar to the game at a given row position.

    Args:
        game_index: Positional row index into ``similarity`` / ``games``
            (not the 'ID' label). Negative values count from the end.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the 'Name', 'Rating Average', 'Mechanics' and
        'Domains' columns of the ``top_n`` most similar games, ordered from
        most to least similar. The query game itself is never included.

    Raises:
        IndexError: If ``game_index`` is outside the similarity matrix.
        ValueError: If ``top_n`` is negative.
    """
    n_games = len(similarity)
    if not -n_games <= game_index < n_games:
        raise IndexError(
            f"game_index {game_index} is out of range for {n_games} games"
        )
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Normalise negative positions so the self-exclusion below compares
    # like with like; otherwise the query game would be recommended to itself.
    position = game_index % n_games

    scores = np.asarray(similarity[position]).ravel()
    # A stable sort on the negated scores gives descending similarity while
    # keeping tied games in their original row order.
    order = np.argsort(-scores, kind='stable')
    order = order[order != position]
    top_indices = order[:top_n]
    return games.iloc[top_indices][['Name', 'Rating Average', 'Mechanics', 'Domains']]

In [None]:
# Demo: recommendations for the game at row position 0, using the default top_n.
recommender(0)

### movie recommender system 

In [None]:
# Ensure u.item (movie id -> title metadata) is downloaded
# Fetched over HTTPS rather than plain HTTP so the file cannot be altered
# in transit.
item_url = 'https://files.grouplens.org/datasets/movielens/ml-100k/u.item'
item_local = 'u.item'
if not os.path.exists(item_local):
    urllib.request.urlretrieve(item_url, item_local)

# Load MovieLens 100k ratings
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

# Train SVD model
# The split above is seeded; seed the factorisation too so that repeated
# runs produce the same recommendations.
algo = SVD(random_state=0)
algo.fit(trainset)

# Predict for all unseen user-movie pairs
anti_testset = trainset.build_anti_testset()
predictions = algo.test(anti_testset)

# Organize predictions by user, keeping only (movie id, estimated rating)
top_n = defaultdict(list)
for uid, iid, _, est, _ in predictions:
    top_n[uid].append((iid, est))
# Keep the 10 highest estimated ratings per user
for uid in top_n:
    top_n[uid] = sorted(top_n[uid], key=lambda x: x[1], reverse=True)[:10]

# Load movie titles from u.item (pipe-separated, latin-1 encoded; only the
# first two columns are needed)
df_movies = pd.read_csv(
    item_local, sep='|', header=None, encoding='latin-1',
    usecols=[0, 1], names=['movie_id', 'title']
)
# The prediction ids are used as string keys below, so key the lookup by str
movie_id_to_title = dict(zip(df_movies.movie_id.astype(str), df_movies.title))

# Print top 10 recommendations for a user
user_id = '196'
# .get() avoids inserting an empty entry into the defaultdict when the user
# has no predictions.
recommended = top_n.get(user_id, [])
for movie_id, score in recommended:
    print(movie_id_to_title[movie_id], round(score, 2))
