### Libs

In [1]:
import pandas as pd
import numpy as np
import ast
import requests
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [2]:
# To work on google colab
# Connect
from google.colab import drive
drive.mount('/content/drive')

# Move to work folder location
%cd /content/drive/MyDrive/Capstone/llm_steam_bot/src/recommender

# Check location
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Capstone/llm_steam_bot/src/recommender
collaborative_matrix_based.ipynb    offline_eval_results_20250723_172221.csv
collaborative_memory_based.ipynb    offline_evaluation.py
content_based_recs.py		    Untitled0.ipynb
explore_offline_eval_results.ipynb


In [5]:
pd.set_option('display.max_colwidth', 150)

### Upload files and transform data

In [6]:
# To filter only the top 100 most played games, for first implementation and faster processing
#df_top_100_game_reviews = pd.read_csv('../../data/top_100_game_reviews.gz', compression='gzip')
df_top_100_game_details = pd.read_csv('../../data/top_100_game_details.csv')
# To filter the top 1000 most played games, for full implementation
#df_top_1000_game_reviews = pd.read_csv('../../data/top_100_game_reviews.gz', compression='gzip')
df_top_1000_game_details = pd.read_csv('../../data/top_1000_game_details.csv')
# Users data
df_users_profile = pd.read_csv('../../data/users_profile.csv')
df_users_friendlist = pd.read_csv('../../data/users_friendlist.csv')
df_users_owned_games = pd.read_csv('../../data/users_owned_games.csv') # users of the 1000 game reviews
df_users_recently_played_games = pd.read_csv('../../data/users_recently_played_games.csv')

In [7]:
# Add our steam ids for tests
key = "06273B6FCBD23D875865DFC7E0EAFC06"
list_team_steamids = [76561198080989870, 76561199062172023, 76561198164574454]
users_owned_games = []
for steamid in list_team_steamids:
    dict_users_owned_games = {}
    url = f"http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?key={key}&steamid={steamid}&include_appinfo=true&include_extended_appinfo=true&format=json"
    response = requests.get(url)
    dict_users_owned_games['user_steamid'] =  steamid
    try: # in case of no owned games
        dict_users_owned_games['user_owned_games'] =  response.json()['response']['games']
        users_owned_games.append(dict_users_owned_games)
    except:
        pass

# Concat in users df
users_owned_games
df_users_owned_games_team = pd.DataFrame(users_owned_games)
df_users_owned_games = pd.concat([df_users_owned_games,df_users_owned_games_team])

In [8]:
df_users_owned_games.tail(3)

Unnamed: 0,user_steamid,user_owned_games
0,76561198080989870,"[{'appid': 72850, 'name': 'The Elder Scrolls V: Skyrim', 'playtime_forever': 3889, 'img_icon_url': 'b9aca8a189abd8d6aaf09047dbb0f57582683e1c', 'ha..."
1,76561199062172023,"[{'appid': 322500, 'name': 'SUPERHOT', 'playtime_forever': 408, 'img_icon_url': '81839f0d50cb3e54c9aa7c69c04916f1e53d8c35', 'has_community_visible..."
2,76561198164574454,"[{'appid': 32470, 'name': 'STAR WARS™ Empire at War: Gold Pack', 'playtime_forever': 320, 'img_icon_url': '3f65d9be3af3083c07f1053dbf0b0653af7323b..."


In [9]:
# Transform the user_owned_games JSON-like column
def safe_parse_owned_games(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except Exception as e:
            return None
    return val  # If it's already a list/dict, just return as is

df_users_owned_games["parsed_owned_games"] = df_users_owned_games["user_owned_games"].apply(safe_parse_owned_games)

In [10]:
# Flatten the data into rows of (user, appid, playtime)
rows = []
for _, row in df_users_owned_games.iterrows():
    user_id = row["user_steamid"]
    for game in row["parsed_owned_games"]:
        appid = game.get("appid")
        playtime = game.get("playtime_forever", 0)
        if appid is not None:
            rows.append((user_id, appid, playtime))

In [11]:
# Create a DataFrame from the flattened rows
interaction_df = pd.DataFrame(rows, columns=["user_steamid", "appid", "playtime_forever"])

### Functions to calculate similarity and recommendation

In [36]:
# Get item similarity matrix based on defined threshold
def get_predicted_df_matrix_based(interaction, threshold=None):
    if threshold==None:
        interaction["interaction"] = interaction["playtime_forever"]
    # Binarize playtime as implicit feedback (1 if played > x minutes)
    elif threshold==600:
        interaction["interaction"] = (interaction["playtime_forever"] >= 600).astype(int)
    elif threshold==1200:
        interaction["interaction"] = (interaction["playtime_forever"] >= 1200).astype(int)
    elif threshold==1800:
        interaction["interaction"] = (interaction["playtime_forever"] >= 1800).astype(int)
    # Pivot to a user-item matrix
    user_item_matrix = interaction.pivot_table(index="user_steamid", columns="appid", values="interaction", fill_value=0)
    # Apply Truncated SVD to reduce dimensions
    n_components = 80  # Number of latent factors
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    # Matrix
    user_latent_matrix = svd.fit_transform(user_item_matrix)
    item_latent_matrix = svd.components_.T  # transpose to get items x latent dimensions
    # Compute predicted interaction matrix
    predicted_ratings = user_latent_matrix @ item_latent_matrix.T  # matrix multiplication
    # Convert predicted scores into a DataFrame for easy lookup
    predicted_df = pd.DataFrame(predicted_ratings,index=user_item_matrix.index,columns=user_item_matrix.columns)
    return predicted_df

def recommend_games_matrix_based(user_id, interaction, predicted, df_game_details, show_output=False, top_n=5):
    if user_id not in predicted.index:
        print(f"User {user_id} not found.")
        return []
    # Predicted scores for this user
    user_scores = predicted.loc[user_id].copy()
    # Remove already played games
    played_games = interaction[interaction["user_steamid"] == user_id]["appid"].unique()
    user_scores = user_scores.drop(labels=played_games, errors='ignore')
    # Get top N predictions
    top_recommendations = user_scores.sort_values(ascending=False).head(top_n).reset_index()
    top_recommendations.columns = ['appid', 'similarity_score']
    # Merge with game metadata
    top_recommendations = pd.merge(top_recommendations, df_game_details, on='appid')
    top_recommendations = top_recommendations[['appid', 'name', 'similarity_score']]

    if show_output==True:
      ### Prints for visualization
      print(f'-----------------------\nUser: {user_id}')
      # Print played games
      games = list(df_users_owned_games[df_users_owned_games['user_steamid']==user_id]['parsed_owned_games'])[0]
      games_sorted = sorted(games, key=lambda x: x['playtime_forever'], reverse=True)
      print('---\nUser most played games:')
      for game in games_sorted[:5]:
          print(f"- {game['name']}")
      # Print Recommendations
      print("---\nTop Recommended Games:")
      print(top_recommendations)
      print("\n")
    pass
    return top_recommendations


### First implementation: Filtering only top 100 most played games

Tests with different thresholds to consider a player liked a games:

- No threshold, raw played time
- More than 10 horas played (600 min)
- More than 20 horas played (1200 min)
- More than 30 horas played (1800 min)

In [13]:
# Filter only top 100 played games before sparse matrix for first evaluation and faster processing
top_100_games = list(df_top_100_game_details['appid'])
interaction_df_100 = interaction_df[interaction_df['appid'].isin(top_100_games)]

# Define threshold
threshold=None
threshold=1200
threshold=1600
threshold=1800

# Make predicted df
predicted_df = get_predicted_df_matrix_based(interaction_df_100, threshold=threshold)

# Get recomendations
recommendations = recommend_games_matrix_based(76561198080989870, interaction_df_100, predicted_df, df_top_100_game_details, show_output=True, top_n=5)
recommendations = recommend_games_matrix_based(76561198164574454, interaction_df_100, predicted_df, df_top_100_game_details, top_n=5)
recommendations = recommend_games_matrix_based(76561199062172023, interaction_df_100, predicted_df, df_top_100_game_details, top_n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction["interaction"] = (interaction["playtime_forever"] >= 1800).astype(int)


-----------------------
User: 76561198080989870
---
User most played games:
- Red Dead Redemption 2
- The Elder Scrolls V: Skyrim
- Counter-Strike 2
- Grand Theft Auto V Legacy
- The Elder Scrolls V: Skyrim Special Edition
---
Top Recommended Games:
     appid                name  similarity_score
0  1811260  EA SPORTS™ FIFA 23          0.046560
1   550650         Black Squad          0.016119
2  1665460          eFootball™          0.006864
3   291480     Warface: Clutch          0.005123
4       70           Half-Life          0.004724




### Full implementation: Filtering top 1000 most played games

Tests with different thresholds to consider a player liked a games:

- No threshold, raw played time
- More than 10 horas played (600 min)
- More than 20 horas played (1200 min)
- More than 30 horas played (1800 min)

In [None]:
# Filter top 1000 most played games, for full implementation
top_1000_games = list(df_top_1000_game_details['appid'])
interaction_df_1000 = interaction_df[interaction_df['appid'].isin(top_1000_games)]

# Define threshold
threshold=None
threshold=1200
threshold=1600
threshold=1800

# Make predicted df
predicted_df = get_predicted_df_matrix_based(interaction_df_1000, threshold=threshold)

# Get recomendations
recommendations = recommend_games_matrix_based(76561198080989870, interaction_df_1000, predicted_df, df_top_1000_game_details, show_output=True, top_n=5)
recommendations = recommend_games_matrix_based(76561198164574454, interaction_df_1000, predicted_df, df_top_1000_game_details, top_n=5)
recommendations = recommend_games_matrix_based(76561199062172023, interaction_df_1000, predicted_df, df_top_1000_game_details, top_n=5)

### Offline evaluation

In [15]:
from collections import defaultdict

In [16]:
# Split into train and test (for offline evaluation) and make train interaction_df
user_game_dict = defaultdict(list)
for _, row in interaction_df.iterrows():
    user_game_dict[row['user_steamid']].append(row['appid'])

train = {}
test = {}
rng = np.random.default_rng(42)

for user, games in user_game_dict.items():
    if len(games) < 2:
        continue  # Skip users with fewer than 2 relevant games
    games = rng.permutation(games)
    split = int(len(games) * 0.7)
    train[user] = list(games[:split])
    test[user] = list(games[split:])

print(f"Total users with train/test split: {len(train)}")

interaction_df_train = interaction_df[
    interaction_df["user_steamid"].isin(train) &
    interaction_df.apply(lambda row: row["appid"] in train.get(row["user_steamid"], []), axis=1)
]

Total users with train/test split: 4334


In [17]:
# Filter only top 100 played games before sparse matrix for first evaluation and faster processing (offline evaluation)
top_100_games = list(df_top_100_game_details['appid'])
interaction_df_100_train = interaction_df_train[interaction_df_train['appid'].isin(top_100_games)]

# Filter top 1000 most played games, for full implementation
top_1000_games = list(df_top_1000_game_details['appid'])
interaction_df_1000_train = interaction_df_train[interaction_df_train['appid'].isin(top_1000_games)]

#### Hit Rate@k

In [37]:
# Compute Hit Rate@k
# Compare for each user in test, if at least one played game is recommended

hit_rate = []
total_games = 100
#for interaction_df_train in [interaction_df_100_train, interaction_df_1000_train]: # 100 most played games and 1000 most played
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    predicted_df_train = get_predicted_df_matrix_based(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      hits = 0
      for user in test:
          true_items = set(test[user])
          try:
            recommended = list(recommend_games_matrix_based(user, interaction_df_train, predicted_df_train, df_top_game_details, top_n=k)['appid'])
          except:
            continue
            #print('error due to not fiding test user in train')
          if set(recommended) & true_items:
              hits += 1
      score = hits / len(test)
      hit_rate_dict = {}
      hit_rate_dict[f"total_games"] = total_games
      hit_rate_dict[f"threshold"] = threshold
      hit_rate_dict[f"k"] = k
      hit_rate_dict["hit_rate"] = score
      hit_rate.append(hit_rate_dict)
  total_games = 1000

# Convert to DataFrame
df_matrix_factorization_hit_rate = pd.DataFrame(hit_rate)
df_matrix_factorization_hit_rate["threshold"] = df_matrix_factorization_hit_rate["threshold"].astype(str)
# Save .csv
df_matrix_factorization_hit_rate.to_csv('../../data/df_matrix_factorization_hit_rate.csv', index=False)

starting for interaction_df_train
starting for threshold: None
starting for k: 1
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 5
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 10
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 20
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198

#### Precision@k

In [39]:
# Unlike Hit Rate, which gives 1 point if any relevant item is found, Precision rewards more for getting multiple relevant items right.
def precision_at_k(recommended_items, true_items, k):
    recommended_top_k = recommended_items[:k]
    if not recommended_top_k:
        return 0.0
    hits = len(set(recommended_top_k) & set(true_items))
    return hits / k

precision_results = []
total_games = 100
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    predicted_df_train = get_predicted_df_matrix_based(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      precisions = []
      for user in test:
          true_items = set(test[user])
          try:
              recs_df = recommend_games_matrix_based(user, interaction_df_train, predicted_df_train, df_top_game_details, top_n=k)
              recommended = list(recs_df["appid"])
          except:
              continue
          prec = precision_at_k(recommended, true_items, k)
          precisions.append(prec)
      precision_score = np.mean(precisions)
      precision_results.append({
          "total_games": total_games,
          "threshold": threshold,
          "k": k,
          "precision": precision_score
      })
  total_games = 1000

# Convert to DataFrame
df_matrix_factorization_precision = pd.DataFrame(precision_results)
df_matrix_factorization_precision["threshold"] = df_matrix_factorization_precision["threshold"].astype(str)
# Save .csv
df_matrix_factorization_precision.to_csv('../../data/df_matrix_factorization_precision.csv', index=False)

starting for interaction_df_train
starting for threshold: None
starting for k: 1
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 5
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 10
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 20
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198

#### NDCG@k

In [42]:
# evaluates not just what relevant items are recommended, but also where they appear in the ranked list. It gives higher scores for relevant items that appear earlier in the list
def ndcg_at_k(recommended_items, true_items, k):
    recommended_top_k = recommended_items[:k]
    dcg = 0.0
    for i, item in enumerate(recommended_top_k):
        if item in true_items:
            dcg += 1 / np.log2(i + 2)  # i+2 because log2(1) = 0
    # Ideal DCG
    ideal_hits = min(len(true_items), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0

ndcg_results = []
total_games = 100
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    predicted_df_train = get_predicted_df_matrix_based(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      ndcgs = []
      for user in test:
          true_items = set(test[user])
          try:
              recs_df = recommend_games_matrix_based(user, interaction_df_train, predicted_df_train, df_top_game_details, top_n=k)
              recommended = list(recs_df["appid"])
          except:
              continue
          ndcg = ndcg_at_k(recommended, true_items, k)
          ndcgs.append(ndcg)
      ndcg_score = np.mean(ndcgs)
      ndcg_results.append({
          "total_games": total_games,
          "threshold": threshold,
          "k": k,
          "ndcg": ndcg_score
      })
  total_games = 1000

# Convert to DataFrame
df_matrix_factorization_ndcg = pd.DataFrame(ndcg_results)
df_matrix_factorization_ndcg["threshold"] = df_matrix_factorization_ndcg["threshold"].astype(str)
# Save .csv
df_matrix_factorization_ndcg.to_csv('../../data/df_matrix_factorization_ndcg.csv', index=False)

starting for interaction_df_train
starting for threshold: None
starting for k: 1
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 5
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 10
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198022201098 not found.
User 76561198396866067 not found.
User 76561199154732495 not found.
starting for k: 20
User 76561199694195327 not found.
User 76561198088028945 not found.
User 76561199820126504 not found.
User 76561198152405530 not found.
User 76561198