In [None]:
# ============================================================
# 🎮 Steam Game Recommender System
# Content-Based Filtering using TF-IDF & Cosine Similarity
# ============================================================

# ── STEP 1: Install & Import Libraries ──────────────────────
# kagglehub handles dataset downloading automatically
!pip install kagglehub -q

import kagglehub
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure pandas so wide frames print in full: no cap on the number of
# columns shown, a 500-character line width, and no wrapping of the frame
# representation. set_option accepts several (option, value) pairs at once.
pd.set_option(
    "display.max_columns", None,
    "display.width", 500,
    "display.expand_frame_repr", False,
)

print("‚úÖ Libraries loaded successfully!")


# ── STEP 2: Download & Load the Dataset ─────────────────────
# Uses kagglehub to automatically download the dataset.
# First time running: you'll be prompted to enter your Kaggle credentials.
# After that, the dataset is cached locally — no re-download needed.

# Download latest version of the dataset
path = kagglehub.dataset_download("fronkongames/steam-games-dataset")
print("üì¶ Path to dataset files:", path)

# Find the CSV inside the downloaded folder.
# os.listdir() yields entries in arbitrary order, so the matches are sorted to
# make the selected file deterministic when the folder holds several CSVs.
# The extension is compared case-insensitively so e.g. "GAMES.CSV" is found.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError("No CSV file found in the downloaded dataset folder.")

# The first CSV in sorted order is the one that gets loaded.
CSV_PATH = os.path.join(path, csv_files[0])
print(f"üìÑ Using file: {csv_files[0]}")

df = pd.read_csv(CSV_PATH)
print(f"‚úÖ Dataset loaded: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
print(df.head())
print()
df.info()


# ── STEP 3: Preprocess the Data ─────────────────────────────
# Columns carried through the rest of the notebook.
COLUMNS = ["Name", "About the game", "Tags", "Genres", "Price"]

# Select those columns, discard every row that has a missing value in any of
# them, then renumber the surviving rows 0..n-1 so that row labels and row
# positions coincide.
df_clean = df.loc[:, COLUMNS].dropna(axis="index", how="any").reset_index(drop=True)

print(f"\n‚úÖ After dropping nulls: {len(df_clean):,} games remaining")
print(df_clean.head())


# ── STEP 4: Build the TF-IDF Matrix ─────────────────────────
# One row per game, built from its "About the game" text with English stop
# words removed.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df_clean["About the game"])

print(f"\n‚úÖ TF-IDF matrix shape: {tfidf_matrix.shape}")


# ── STEP 5: Cosine Similarity (computed on demand) ──────────
# A dense all-pairs similarity matrix needs n_games ** 2 cells, so its memory
# grows quadratically with the catalogue; at the scale of this dataset
# (100k+ rows before cleaning) that can reach tens of gigabytes.
# NOTE(review): the exact post-cleaning row count is not visible here.
# The recommender below therefore computes a single similarity row per query
# instead. The shape printed here is the logical shape of the similarity
# relation; the full matrix itself is never materialised.
n_games = tfidf_matrix.shape[0]
print(f"‚úÖ Cosine similarity matrix shape: {(n_games, n_games)}")


# ── STEP 6: Recommender Function ────────────────────────────
# Build a reverse-lookup Series: game name → DataFrame index.
# When several rows share a name, the last occurrence is the one kept.
indices = pd.Series(df_clean.index, index=df_clean["Name"])
indices = indices[~indices.index.duplicated(keep="last")]


def steam_game_recommender(title: str, top_n: int = 10) -> pd.DataFrame:
    """
    Recommend games similar to `title` based on description similarity.

    Similarity is the cosine similarity between the TF-IDF vector of the
    queried game's description and that of every other game in `df_clean`.

    Parameters
    ----------
    title : str
        Exact game name as it appears in the dataset.
    top_n : int
        Number of recommendations to return (default 10).

    Returns
    -------
    pd.DataFrame
        Columns: Name, Genres, Price, similarity_score (rounded to 4
        decimals), ordered from most to least similar. The queried game
        itself is never included.

    Raises
    ------
    ValueError
        If `title` is not in the dataset (the message lists up to five names
        containing `title`, case-insensitively), or if `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be a non-negative integer.")

    if title not in indices:
        # regex=False: game titles frequently contain characters such as
        # "(", "[" or "+" that would otherwise be parsed as a regex pattern
        # and either raise or match the wrong names.
        name_matches = df_clean["Name"].str.contains(
            title, case=False, na=False, regex=False
        )
        close = df_clean.loc[name_matches, "Name"].tolist()
        suggestion = f"  Did you mean: {close[:5]}" if close else ""
        raise ValueError(f"'{title}' not found in dataset.{suggestion}")

    game_idx = indices[title]

    # Compare only the queried row against the whole catalogue. The slice
    # keeps the query two-dimensional, as cosine_similarity requires.
    query_row = tfidf_matrix[game_idx : game_idx + 1]
    scores = pd.Series(
        cosine_similarity(query_row, tfidf_matrix).ravel(),
        name="similarity_score",
    )

    # Remove the queried game by label instead of assuming it sorts first:
    # games with identical descriptions tie with it at the top score. A stable
    # sort keeps tied games in dataset order, so results are reproducible.
    ranked = scores.drop(index=game_idx).sort_values(ascending=False, kind="stable")
    top_scores = ranked.head(top_n)

    result = df_clean.loc[top_scores.index, ["Name", "Genres", "Price"]].copy()
    result["similarity_score"] = top_scores.to_numpy()
    result["similarity_score"] = result["similarity_score"].round(4)
    return result.reset_index(drop=True)


# ── STEP 7: Run Example Recommendations ─────────────────────
test_games = ["DayZ", "Street Fighter V", "DOOM"]

# Single source of truth for both the number of results requested and the
# count shown in the banner, so the two cannot drift apart.
TOP_N = 10

for game in test_games:
    # Only the recommender call is guarded; it raises ValueError when it
    # cannot serve the request.
    try:
        recs = steam_game_recommender(game, top_n=TOP_N)
    except ValueError as e:
        # Report the problem and carry on with the next example title.
        print(f"\n‚ö†Ô∏è  {e}")
        continue

    print(f"\n{'='*55}")
    print(f"  üéÆ Top {TOP_N} games similar to: {game}")
    print(f"{'='*55}")
    print(recs.to_string(index=True))


# ── STEP 8: Interactive Query (Optional) ────────────────────
# Uncomment and run in Colab for a live search:
#
# game_title = input("Enter a game title: ")
# try:
#     print(steam_game_recommender(game_title, top_n=10))
# except ValueError as e:
#     print(e)

‚úÖ Libraries loaded successfully!
Using Colab cache for faster access to the 'steam-games-dataset' dataset.
üì¶ Path to dataset files: /kaggle/input/steam-games-dataset
üìÑ Using file: games.csv
‚úÖ Dataset loaded: 122,611 rows √ó 39 columns
                                         AppID          Name Release date  Estimated owners  Peak CCU  Required age  Price  DiscountDLC count                                     About the game                         Supported languages Full audio languages Reviews                                       Header image                                      Website                           Support url                Support email  Windows    Mac  Linux  Metacritic score Metacritic url  User score  Positive  Negative  Score rank  Achievements  Recommendations                                              Notes  Average playtime forever  Average playtime two weeks  Median playtime forever  Median playtime two weeks            Developers            Publi