### Libs

In [3]:
import pandas as pd
import numpy as np
import ast
import requests
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [4]:
pd.set_option('display.max_colwidth', 150)

In [5]:
# df_memory_based_ndcg = pd.read_csv('../../data/df_memory_based_ndcg.csv')
# df_memory_based_ndcg_lower_threshold = pd.read_csv('../../data/df_memory_based_ndcg_lower_threshold.csv')
# df_memory_based_precision = pd.read_csv('../../data/df_memory_based_precision.csv')
# df_memory_based_precision_lower_threshold = pd.read_csv('../../data/df_memory_based_precision_lower_threshold.csv')

### To run in Colab

In [2]:
# To work on google colab
# Connect
from google.colab import drive
drive.mount('/content/drive')

# Move to work folder location
%cd /content/drive/MyDrive/Capstone/llm_steam_bot/src/recommender

# Check location
!ls

ModuleNotFoundError: No module named 'google.colab'

### Upload files and transform data

In [6]:
# To filter only the top 100 most played games, for first implementation and faster processing
#df_top_100_game_reviews = pd.read_csv('../../data/top_100_game_reviews.gz', compression='gzip')
df_top_100_game_details = pd.read_csv('../../data/top_100_game_details.csv')
# To filter the top 1000 most played games, for full implementation
#df_top_1000_game_reviews = pd.read_csv('../../data/top_100_game_reviews.gz', compression='gzip')
df_top_1000_game_details = pd.read_csv('../../data/top_1000_game_details.csv')
# Users data
df_users_profile = pd.read_csv('../../data/users_profile.csv')
df_users_friendlist = pd.read_csv('../../data/users_friendlist.csv')
df_users_owned_games = pd.read_csv('../../data/users_owned_games.csv') # users of the 1000 game reviews
df_users_recently_played_games = pd.read_csv('../../data/users_recently_played_games.csv')

In [7]:
# Add our steam ids for tests
key = "06273B6FCBD23D875865DFC7E0EAFC06"
list_team_steamids = [76561198080989870, 76561199062172023, 76561198164574454]
users_owned_games = []
for steamid in list_team_steamids:
    dict_users_owned_games = {}
    url = f"http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?key={key}&steamid={steamid}&include_appinfo=true&include_extended_appinfo=true&format=json"
    response = requests.get(url)
    dict_users_owned_games['user_steamid'] =  steamid
    try: # in case of no owned games
        dict_users_owned_games['user_owned_games'] =  response.json()['response']['games']
        users_owned_games.append(dict_users_owned_games)
    except:
        pass

# Concat in users df
users_owned_games
df_users_owned_games_team = pd.DataFrame(users_owned_games)
df_users_owned_games = pd.concat([df_users_owned_games,df_users_owned_games_team])

In [9]:
# Transform the user_owned_games JSON-like column
def safe_parse_owned_games(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except Exception as e:
            return None
    return val  # If it's already a list/dict, just return as is

df_users_owned_games["parsed_owned_games"] = df_users_owned_games["user_owned_games"].apply(safe_parse_owned_games)

In [None]:
#df_users_owned_games.to_csv('../../data/df_users_owned_games.csv', index=True)

In [10]:
# Flatten the data into rows of (user, appid, playtime)
rows = []
for _, row in df_users_owned_games.iterrows():
    user_id = row["user_steamid"]
    for game in row["parsed_owned_games"]:
        appid = game.get("appid")
        playtime = game.get("playtime_forever", 0)
        if appid is not None:
            rows.append((user_id, appid, playtime))

In [11]:
# Create a DataFrame from the flattened rows
interaction_df = pd.DataFrame(rows, columns=["user_steamid", "appid", "playtime_forever"])

In [13]:
interaction_df.head()

Unnamed: 0,user_steamid,appid,playtime_forever
0,76561198974520522,10,1
1,76561198974520522,80,0
2,76561198974520522,100,434
3,76561198974520522,300,1
4,76561198974520522,20,82


In [None]:
#interaction_df.to_csv('../../data/df_memory_based_interaction_1000_games.csv', index=True)

### Functions to calculate similarity and recommendation

In [None]:
# Get item similarity memory based on defined threshold
def get_item_similarity(interaction, threshold=None):
    if threshold==None:
        interaction["interaction"] = interaction["playtime_forever"]
    # Binarize playtime as implicit feedback (1 if played > x minutes)
    else:
        interaction["interaction"] = (interaction["playtime_forever"] >= threshold).astype(int)
    # Pivot to a user-item matrix
    user_item_matrix = interaction.pivot_table(
        index="user_steamid", columns="appid", values="interaction", fill_value=0
    )
    # Compute item-item similarity using cosine similarity
    item_user_matrix = user_item_matrix.T  # Transpose to get items x users
    item_similarity = cosine_similarity(item_user_matrix)
    # Build item similarity DataFrame
    item_similarity_df = pd.DataFrame(
        item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index
    )
    return user_item_matrix, item_similarity_df

# Get recommendation for user
def recommend_games_for_user(user_id, interaction, user_item_matrix, item_similarity_df, df_top_n_game_details, df_users_owned_games, show_output=False, top_n=5):
    # Get the user’s interaction vector
    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not found.")
        return []
    # Filter played used games
    played_games = interaction[interaction["user_steamid"] == user_id]["appid"].unique()
    # Store recommendation scores
    scores = pd.Series(dtype=float)
    for game in played_games:
        similar_scores = item_similarity_df[game]
        scores = scores.add(similar_scores, fill_value=0)
    # Remove all games the user ever played, not just liked ones
    scores = scores.drop(labels=played_games, errors="ignore")
    # Get top N recommendations
    top_recommendations = scores.sort_values(ascending=False).head(top_n)
    top_recommendations = pd.DataFrame(top_recommendations).reset_index().rename(columns={0: 'similarity_score'})
    top_recommendations = pd.merge(top_recommendations, df_top_n_game_details, on='appid')
    top_recommendations = top_recommendations[['appid','name','similarity_score']]

    if show_output==True:
      print(f'-----------------------\nUser: {user_id}')
      # Print played games
      games = list(df_users_owned_games[df_users_owned_games['user_steamid']==user_id]['parsed_owned_games'])[0]
      games_sorted = sorted(games, key=lambda x: x['playtime_forever'], reverse=True)
      print('---\nUser most played games:')
      for game in games_sorted[:5]:
          print(f"- {game['name']}")
      # Print Recommendations
      print("---\nTop Recommended Games:")
      print(top_recommendations)
      print("\n")
    pass
    return top_recommendations

### First implementation: Filtering only top 100 most played games

**Tests with different thresholds to consider a player liked a games**:
- No threshold, raw played time
- More than 10 horas played (600 min)
- More than 20 horas played (1200 min)
- More than 30 horas played (1800 min)

In [16]:
# Filter only top 100 played games before sparse matrix for first evaluation and faster processing
top_100_games = list(df_top_100_game_details['appid'])
interaction_df_100 = interaction_df[interaction_df['appid'].isin(top_100_games)]

# Define threshold
threshold=None
threshold=1200
threshold=1600
threshold=1800

# Make user_item_matrix and similarity
user_item_matrix, item_similarity_df = get_item_similarity(interaction_df_100, threshold=threshold)

# Get recomendations
recommendations = recommend_games_for_user(76561198080989870, interaction_df_100, user_item_matrix, item_similarity_df, df_top_100_game_details, df_users_owned_games, top_n=5, show_output=True)
recommendations = recommend_games_for_user(76561198164574454, interaction_df_100, user_item_matrix, item_similarity_df, df_top_100_game_details, df_users_owned_games, top_n=5)
recommendations = recommend_games_for_user(76561199062172023, interaction_df_100, user_item_matrix, item_similarity_df, df_top_100_game_details, df_users_owned_games, top_n=5)

-----------------------
User: 76561198080989870
---
User most played games:
- Red Dead Redemption 2
- The Elder Scrolls V: Skyrim
- Counter-Strike 2
- Grand Theft Auto V Legacy
- The Elder Scrolls V: Skyrim Special Edition
---
Top Recommended Games:
     appid                               name  similarity_score
0   578080                PUBG: BATTLEGROUNDS          1.269749
1  1091500                     Cyberpunk 2077          1.265525
2     4000                        Garry's Mod          1.257913
3   359550  Tom Clancy's Rainbow Six® Siege X          1.243093
4   218620                           PAYDAY 2          1.184105




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction["interaction"] = (interaction["playtime_forever"] >= threshold).astype(int)


### Full implementation: Filtering top 1000 most played games

**Tests with different thresholds to consider a player liked a games**:
- No threshold, raw played time
- More than 10 horas played (600 min)
- More than 20 horas played (1200 min)
- More than 30 horas played (1800 min)

In [17]:
# Filter top 1000 most played games, for full implementation
top_1000_games = list(df_top_1000_game_details['appid'])
interaction_df_1000 = interaction_df[interaction_df['appid'].isin(top_1000_games)]

# Define threshold
threshold=None
threshold=1200
threshold=1600
threshold=1800

# Make user_item_matrix and similarity
user_item_matrix, item_similarity_df = get_item_similarity(interaction_df_1000, threshold=threshold)

# Get recomendations
recommendations = recommend_games_for_user(76561198080989870, interaction_df_1000, user_item_matrix, item_similarity_df, df_top_1000_game_details, df_users_owned_games, top_n=10, show_output=True)
recommendations = recommend_games_for_user(76561198164574454, interaction_df_1000, user_item_matrix, item_similarity_df, df_top_1000_game_details, df_users_owned_games, top_n=5)
recommendations = recommend_games_for_user(76561199062172023, interaction_df_1000, user_item_matrix, item_similarity_df, df_top_1000_game_details, df_users_owned_games, top_n=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction["interaction"] = (interaction["playtime_forever"] >= threshold).astype(int)


-----------------------
User: 76561198080989870
---
User most played games:
- Red Dead Redemption 2
- The Elder Scrolls V: Skyrim
- Counter-Strike 2
- Grand Theft Auto V Legacy
- The Elder Scrolls V: Skyrim Special Edition
---
Top Recommended Games:
    appid                               name  similarity_score
0  377160                          Fallout 4          1.990360
1    4000                        Garry's Mod          1.808666
2  105600                           Terraria          1.804034
3  218620                           PAYDAY 2          1.772381
4   49520                      Borderlands 2          1.768351
5  292030           The Witcher 3: Wild Hunt          1.727495
6  578080                PUBG: BATTLEGROUNDS          1.672340
7  359550  Tom Clancy's Rainbow Six® Siege X          1.651508
8  230410                           Warframe          1.646363
9     550                      Left 4 Dead 2          1.639346




In [None]:
# user_item_matrix.to_csv('../../data/memory_based_user_item_matrix_1000_games.csv', index=True)
# item_similarity_df.to_csv('../../data/df_memory_based_item_similarity_1000_games.csv', index=True)
# interaction_df_1000.to_csv('../../data/df_memory_based_interaction_1000_games.csv', index=True)

### Offline evaluation

In [18]:
from collections import defaultdict

In [19]:
# Split into train and test (for offline evaluation) and make train interaction_df
user_game_dict = defaultdict(list)
for _, row in interaction_df.iterrows():
    user_game_dict[row['user_steamid']].append(row['appid'])

train = {}
test = {}
rng = np.random.default_rng(42)

for user, games in user_game_dict.items():
    if len(games) < 2:
        continue  # Skip users with fewer than 2 relevant games
    games = rng.permutation(games)
    split = int(len(games) * 0.7)
    train[user] = list(games[:split])
    test[user] = list(games[split:])

print(f"Total users with train/test split: {len(train)}")


interaction_df_train = interaction_df[
    interaction_df["user_steamid"].isin(train) &
    interaction_df.apply(lambda row: row["appid"] in train.get(row["user_steamid"], []), axis=1)
]

Total users with train/test split: 4334


In [20]:
# Filter only top 100 played games before sparse matrix for first evaluation and faster processing (offline evaluation)
top_100_games = list(df_top_100_game_details['appid'])
interaction_df_100_train = interaction_df_train[interaction_df_train['appid'].isin(top_100_games)]

# Filter top 1000 most played games, for full implementation
top_1000_games = list(df_top_1000_game_details['appid'])
interaction_df_1000_train = interaction_df_train[interaction_df_train['appid'].isin(top_1000_games)]

#### Hit Rate@k

In [None]:
# Compare for each user in test, if at least one played game is recommended
hit_rate = []
total_games = 100
#for interaction_df_train in [interaction_df_100_train, interaction_df_1000_train]: # 100 most played games and 1000 most played
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 150, 300, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    user_item_matrix_train, item_similarity_df_train = get_item_similarity(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      hits = 0
      for user in test:
          true_items = set(test[user])
          try:
            recommended = list(recommend_games_for_user(user, interaction_df_train, user_item_matrix_train, item_similarity_df_train, df_top_game_details, df_users_owned_games, top_n=k)['appid'])
          except:
            continue
            #print('error due to not fiding test user in train')
          if set(recommended) & true_items:
              hits += 1
      score = hits / len(test)
      hit_rate_dict = {}
      hit_rate_dict[f"total_games"] = total_games
      hit_rate_dict[f"threshold"] = threshold
      hit_rate_dict[f"k"] = k
      hit_rate_dict["hit_rate"] = score
      hit_rate.append(hit_rate_dict)
  total_games = 1000

# Convert to DataFrame
df_memory_based_hit_rate = pd.DataFrame(hit_rate)
df_memory_based_hit_rate["threshold"] = df_memory_based_hit_rate["threshold"].astype(str)
# Save .csv
df_memory_based_hit_rate.to_csv('../../data/df_memory_based_hit_rate.csv', index=False)

#### Precision@k

In [22]:
# Unlike Hit Rate, which gives 1 point if any relevant item is found, Precision rewards more for getting multiple relevant items right.
def precision_at_k(recommended_items, true_items, k):
    recommended_top_k = recommended_items[:k]
    if not recommended_top_k:
        return 0.0
    hits = len(set(recommended_top_k) & set(true_items))
    return hits / k

In [None]:
precision_results = []
total_games = 100
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 150, 300, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    user_item_matrix_train, item_similarity_df_train = get_item_similarity(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      precisions = []
      for user in test:
          true_items = set(test[user])
          try:
              recs_df = recommend_games_for_user(user, interaction_df_train, user_item_matrix_train, item_similarity_df_train, df_top_game_details, df_users_owned_games, top_n=k)
              recommended = list(recs_df["appid"])
          except:
              continue
          prec = precision_at_k(recommended, true_items, k)
          precisions.append(prec)
      precision_score = np.mean(precisions)
      precision_results.append({
          "total_games": total_games,
          "threshold": threshold,
          "k": k,
          "precision": precision_score
      })
  total_games = 1000

# Convert to DataFrame
df_memory_based_precision = pd.DataFrame(precision_results)
df_memory_based_precision["threshold"] = df_memory_based_precision["threshold"].astype(str)
# Save .csv
df_memory_based_precision.to_csv('../../data/df_memory_based_precision.csv', index=False)

#### NDCG@k

In [24]:
# evaluates not just what relevant items are recommended, but also where they appear in the ranked list. It gives higher scores for relevant items that appear earlier in the list

In [25]:
def ndcg_at_k(recommended_items, true_items, k):
    recommended_top_k = recommended_items[:k]
    dcg = 0.0
    for i, item in enumerate(recommended_top_k):
        if item in true_items:
            dcg += 1 / np.log2(i + 2)  # i+2 because log2(1) = 0
    # Ideal DCG
    ideal_hits = min(len(true_items), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0

In [None]:
ndcg_results = []
total_games = 100
for interaction_df_train, df_top_game_details in [(interaction_df_100_train, df_top_100_game_details),(interaction_df_1000_train, df_top_1000_game_details)]: # 100 most played games and 1000 most played
  print("starting for interaction_df_train")
  for threshold in [None, 150, 300, 600, 1200, 1800]:
    print(f"starting for threshold: {threshold}")
    # Make similarity
    user_item_matrix_train, item_similarity_df_train = get_item_similarity(interaction_df_train, threshold=threshold)
    for k in [1,5,10,20]: # multiples k's
      print(f"starting for k: {k}")
      ndcgs = []
      for user in test:
          true_items = set(test[user])
          try:
              recs_df = recommend_games_for_user(user, interaction_df_train, user_item_matrix_train, item_similarity_df_train, df_top_game_details, df_users_owned_games, top_n=k)
              recommended = list(recs_df["appid"])
          except:
              continue
          ndcg = ndcg_at_k(recommended, true_items, k)
          ndcgs.append(ndcg)
      ndcg_score = np.mean(ndcgs)
      ndcg_results.append({
          "total_games": total_games,
          "threshold": threshold,
          "k": k,
          "ndcg": ndcg_score
      })
  total_games = 1000

# Convert to DataFrame
df_memory_based_ndcg = pd.DataFrame(ndcg_results)
df_memory_based_ndcg["threshold"] = df_memory_based_ndcg["threshold"].astype(str)
# Save .csv
df_memory_based_ndcg.to_csv('../../data/df_memory_based_ndcg.csv', index=False)

### Make static recommendation df to use in app

In [42]:
users = user_item_matrix.index.tolist()
df__memory_based_recommendations = pd.DataFrame(columns=['user_id','appid','name','similarity_score'])
#recommendations = []
for user in users:
    df_recommendation = recommend_games_for_user(user, interaction_df_1000, user_item_matrix, item_similarity_df, df_top_1000_game_details, df_users_owned_games, top_n=10, show_output=False)
    df_recommendation['user_id'] = user
    df__memory_based_recommendations = pd.concat([df__memory_based_recommendations, df_recommendation])

df_memory_based_recommendations = df__memory_based_recommendations.reset_index(drop=True)
df_memory_based_recommendations.to_csv('../../data/df_memory_based_recommendations.csv', index=False)

  df__memory_based_recommendations = pd.concat([df__memory_based_recommendations, df_recommendation])


In [43]:
df_memory_based_recommendations

Unnamed: 0,user_id,appid,name,similarity_score
0,76561197960281451,271590,Grand Theft Auto V Legacy,43.344413
1,76561197960281451,489830,The Elder Scrolls V: Skyrim Special Edition,38.050557
2,76561197960281451,553850,HELLDIVERS™ 2,35.324982
3,76561197960281451,20920,The Witcher 2: Assassins of Kings Enhanced Edition,35.109585
4,76561197960281451,374320,DARK SOULS™ III,34.993592
...,...,...,...,...
43335,76561199862430366,218620,PAYDAY 2,1.757663
43336,76561199862430366,4000,Garry's Mod,1.731645
43337,76561199862430366,22380,Fallout: New Vegas,1.661827
43338,76561199862430366,292030,The Witcher 3: Wild Hunt,1.661073


In [44]:
df_memory_based_recommendations = pd.read_csv("../../data/df_memory_based_recommendations.csv")