In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from IPython.display import display

In [2]:
# Read the Steam catalogue CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="steam.csv")

# Sanity check: report the dimensions, then preview the first five rows
print("Original shape:", df.shape)
display(df.head(n=5))

Original shape: (27075, 18)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [3]:
# Basic cleaning
# Restrict the catalogue to English-language titles (flag value 1) when that
# flag column is available
if "english" in df.columns:
    df = df.loc[df["english"] == 1].copy()

# One row per appid, then renumber the rows from 0 so index labels and row
# positions coincide
if "appid" in df.columns:
    df = df.drop_duplicates(subset="appid").reset_index(drop=True)

# Price must be numeric: unparseable or missing values become 0.0, and a
# dataset without a price column gets one filled with 0.0
if "price" not in df.columns:
    df["price"] = 0.0
else:
    df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0.0)

# Text fields feeding the content feature: blank out missing values, or create
# the column as empty strings when the dataset lacks it
text_cols = ["genres", "steamspy_tags", "categories", "developer", "publisher"]
for col in text_cols:
    df[col] = df[col].fillna("") if col in df.columns else ""

# Create a combined text feature for content-based similarity
def combine_features(row):
    """
    Build the combined text for one game.

    Reads the 'genres', 'steamspy_tags', 'categories', 'developer' and
    'publisher' entries of ``row`` (any mapping-like object indexable by those
    keys), converts each to ``str`` and joins them, in that order, with "; ".
    """
    fields = ("genres", "steamspy_tags", "categories", "developer", "publisher")
    return "; ".join(str(row[field]) for field in fields)

# Attach two derived columns:
#   content    - lower-cased combined text of every row (see combine_features)
#   name_clean - game name as a trimmed, lower-cased string (optional convenience)
df = df.assign(
    content=lambda frame: frame.apply(combine_features, axis=1).str.lower(),
    name_clean=lambda frame: frame["name"].astype(str).str.strip().str.lower(),
)

print("Cleaned shape:", df.shape)
display(df.head())

Cleaned shape: (26564, 20)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,content,name_clean
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19,action; action;fps;multiplayer; multi-player;o...,counter-strike
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,team fortress classic
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99,action; fps;world war ii;multiplayer; multi-pl...,day of defeat
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,deathmatch classic
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99,action; fps;action;sci-fi; single-player;multi...,half-life: opposing force


In [4]:
# Vectorise the combined text: one TF-IDF row per game, English stop words removed
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["content"])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Lookup table appid -> index label of that game's row in df
appid_to_index = {}
if "appid" not in df.columns:
    raise ValueError("Column 'appid' is required in the dataset for this system.")
appid_to_index = pd.Series(df.index, index=df["appid"]).to_dict()

TF-IDF matrix shape: (26564, 18297)


In [5]:
# Core similarity helper
def _get_similarity_series(idx):
    """
    Score every game against the game stored at row ``idx`` of ``tfidf_matrix``.

    The score is the dot product (``linear_kernel``) of the seed row with each
    row of the matrix; this equals cosine similarity when the TF-IDF rows are
    L2-normalised, which is TfidfVectorizer's documented default.

    Returns a pandas Series indexed like ``df``. The seed game itself is
    included in the result; callers drop it when they need to.
    """
    seed_vector = tfidf_matrix[idx]
    scores = linear_kernel(seed_vector, tfidf_matrix).ravel()
    return pd.Series(scores, index=df.index)

In [6]:
# Helper: apply genre and price filters
def _apply_filters(base_df, genre=None, max_price=None, min_positive_ratings=0):
    """
    Filter the given DataFrame by optional genre, max_price, and minimum positive ratings.
    Works on ANY DataFrame that has at least 'genres', 'price', 'positive_ratings'.
    """
    filtered = base_df.copy()

    # Filter by price
    if max_price is not None and "price" in filtered.columns:
        filtered = filtered[filtered["price"] <= max_price]

    # Filter by genre (substring match in 'genres' column, case-insensitive)
    if genre and "genres" in filtered.columns:
        genre_lower = str(genre).lower()
        filtered = filtered[
            filtered["genres"].astype(str).str.lower().str.contains(genre_lower, na=False)
        ]

    # Filter by popularity (positive_ratings)
    if min_positive_ratings > 0 and "positive_ratings" in filtered.columns:
        filtered = filtered[filtered["positive_ratings"] >= min_positive_ratings]

    return filtered


In [7]:
# Recommendation: similar games given appid
def recommend_similar_by_appid(
    appid,
    n_recommendations=10,
    genre=None,
    max_price=None,
    min_positive_ratings=0
):
    """
    Content-based recommendations seeded by one game.

    Looks up the row of ``appid``, scores every game against it, removes the
    seed game, applies the optional genre / max_price / min_positive_ratings
    filters, and ranks by similarity (ties broken by positive_ratings when
    that column exists), both descending.

    Returns a DataFrame of at most ``n_recommendations`` rows, renumbered from
    0, with 'similarity' as the first column followed by every column of ``df``.

    Raises ValueError when ``appid`` is not a key of ``appid_to_index``.
    """
    if appid not in appid_to_index:
        raise ValueError(f"AppID {appid} not found in the dataset.")
    seed_idx = appid_to_index[appid]

    # Similarity scores as a one-column frame, without the seed game's own row
    scored = _get_similarity_series(seed_idx).to_frame("similarity")
    scored = scored.drop(index=seed_idx, errors="ignore")

    # Bring in every dataset column (aligned on the shared index), then filter
    candidates = _apply_filters(
        scored.join(df, how="inner"),
        genre=genre,
        max_price=max_price,
        min_positive_ratings=min_positive_ratings
    )

    # Rank: similarity first, popularity as tie-breaker when available
    sort_keys = ["similarity"]
    if "positive_ratings" in candidates.columns:
        sort_keys.append("positive_ratings")
    ranked = candidates.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))

    # Keep the top N and discard the original row labels
    top = ranked.head(n_recommendations).reset_index(drop=True)

    # Column layout: similarity, then the dataset columns in their usual order
    wanted = ["similarity", *(c for c in df.columns if c != "similarity")]
    return top[[c for c in wanted if c in top.columns]]


In [8]:
# Search / filter games without a seed game
def filter_games(
    genre=None,
    max_price=None,
    min_positive_ratings=0,
    n_results=20
):
    """
    Browse the dataset by filters only; no seed game and no similarity score.

    Applies the optional genre / max_price / min_positive_ratings filters to a
    copy of ``df``, orders the matches by 'positive_ratings' (highest first)
    when that column exists, and returns the first ``n_results`` rows,
    renumbered from 0, with all dataset columns.
    """
    matches = _apply_filters(
        df.copy(),
        genre=genre,
        max_price=max_price,
        min_positive_ratings=min_positive_ratings
    )

    # Most popular first, when popularity is known
    if "positive_ratings" in matches.columns:
        matches = matches.sort_values(by="positive_ratings", ascending=False)

    return matches.head(n_results).reset_index(drop=True)




In [9]:
# Combined helper: seed game (by appid) + filters
def recommend_with_filters(
    appid=None,
    genre=None,
    max_price=None,
    min_positive_ratings=0,
    n_recommendations=10
):
    """
    Single entry point for both recommendation modes.

    * ``appid`` given: delegates to ``recommend_similar_by_appid`` (games
      similar to that seed, with the filters applied); the result carries a
      'similarity' column.
    * ``appid`` is None: delegates to ``filter_games`` (filter-only results,
      no similarity), with ``n_recommendations`` passed as ``n_results``.

    The genre, max_price and min_positive_ratings filters are forwarded
    unchanged in both modes.
    """
    shared_filters = dict(
        genre=genre,
        max_price=max_price,
        min_positive_ratings=min_positive_ratings
    )

    # No seed game: plain filtered browsing
    if appid is None:
        return filter_games(n_results=n_recommendations, **shared_filters)

    return recommend_similar_by_appid(
        appid=appid,
        n_recommendations=n_recommendations,
        **shared_filters
    )




In [10]:
# Example usage: each call below renders a table with full game details

# Example 1: games most similar to a seed appid (first row of df), no filters
example_appid = df["appid"].iat[0]
print(f"Example seed appid: {example_appid} - {df.loc[appid_to_index[example_appid], 'name']}")
recs1 = recommend_similar_by_appid(appid=example_appid, n_recommendations=5)
display(recs1)

# Example 2: same seed, restricted to the "Action" genre and a price cap of 10.0
recs2 = recommend_similar_by_appid(appid=example_appid, n_recommendations=5, genre="Action", max_price=10.0)
display(recs2)

# Example 3: no seed game, filtering by genre, price and popularity only
filtered = filter_games(genre="RPG", max_price=5.0, min_positive_ratings=100, n_results=10)
display(filtered)


Example seed appid: 10 - Counter-Strike


Unnamed: 0,similarity,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,...,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,content,name_clean
0,1.0,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,team fortress classic
1,1.0,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,deathmatch classic
2,0.98449,60,Ricochet,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Valve Anti-Ch...,...,Action;FPS;Multiplayer,0,2758,684,175,10,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,ricochet
3,0.958997,80,Counter-Strike: Condition Zero,2004-03-01,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,...,Action;FPS;Multiplayer,0,12120,1439,427,43,10000000-20000000,7.19,action; action;fps;multiplayer; single-player;...,counter-strike: condition zero
4,0.957503,360,Half-Life Deathmatch: Source,2006-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,...,Action;FPS;Multiplayer,0,1362,473,102,81,5000000-10000000,0.0,action; action;fps;multiplayer; multi-player;v...,half-life deathmatch: source


Unnamed: 0,similarity,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,...,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,content,name_clean
0,1.0,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,team fortress classic
1,1.0,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,deathmatch classic
2,0.98449,60,Ricochet,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Valve Anti-Ch...,...,Action;FPS;Multiplayer,0,2758,684,175,10,5000000-10000000,3.99,action; action;fps;multiplayer; multi-player;o...,ricochet
3,0.958997,80,Counter-Strike: Condition Zero,2004-03-01,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,...,Action;FPS;Multiplayer,0,12120,1439,427,43,10000000-20000000,7.19,action; action;fps;multiplayer; single-player;...,counter-strike: condition zero
4,0.957503,360,Half-Life Deathmatch: Source,2006-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,...,Action;FPS;Multiplayer,0,1362,473,102,81,5000000-10000000,0.0,action; action;fps;multiplayer; multi-player;v...,half-life deathmatch: source


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,content,name_clean
0,301520,Robocraft,2017-08-24,1,Freejam,Freejam,windows;mac;linux,0,Multi-player;Online Multi-Player;MMO;Online Co...,Action;Free to Play;Indie;Massively Multiplaye...,Free to Play;Robots;Building,43,80360,30987,1369,211,10000000-20000000,0.0,action;free to play;indie;massively multiplaye...,robocraft
1,238960,Path of Exile,2013-10-23,1,Grinding Gear Games,Grinding Gear Games,windows,0,Single-player;Multi-player;Online Multi-Player...,Action;Adventure;Free to Play;Indie;Massively ...,Free to Play;Action RPG;Hack and Slash,120,71593,6117,5263,492,10000000-20000000,0.0,action;adventure;free to play;indie;massively ...,path of exile
2,304050,Trove,2015-07-09,1,Trion Worlds,Trion Worlds,windows;mac,0,Online Multi-Player;MMO;Co-op;Steam Achievemen...,Action;Adventure;Casual;Free to Play;Massively...,Free to Play;Open World;Multiplayer,16,47897,13708,2602,184,5000000-10000000,0.0,action;adventure;casual;free to play;massively...,trove
3,113200,The Binding of Isaac,2011-09-28,1,Edmund McMillen and Florian Himsl,Edmund McMillen,windows;mac,0,Single-player;Steam Achievements;Steam Trading...,Action;Adventure;Indie;RPG,Rogue-like;Indie;Replay Value,99,43227,1923,1849,567,2000000-5000000,3.99,action;adventure;indie;rpg; rogue-like;indie;r...,the binding of isaac
4,363970,Clicker Heroes,2015-05-13,1,Playsaurus,Playsaurus,windows;mac,0,Single-player;Steam Achievements;Steam Trading...,Adventure;Casual;Free to Play;Indie;RPG;Simula...,Clicker;Free to Play;Casual,111,42530,4734,9742,463,5000000-10000000,0.0,adventure;casual;free to play;indie;rpg;simula...,clicker heroes
5,200210,Realm of the Mad God,2012-02-20,1,Wild Shadow Studios;Deca Games,Deca Games,windows;mac,0,Multi-player;MMO;Co-op;Cross-Platform Multipla...,Action;Free to Play;Indie;Massively Multiplaye...,Free to Play;Massively Multiplayer;Pixel Graphics,31,23414,4728,3531,64,2000000-5000000,0.0,action;free to play;indie;massively multiplaye...,realm of the mad god
6,104900,ORION: Prelude,2013-04-16,1,Spiral Game Studios,DANKIE,windows,0,Single-player;Multi-player;Online Multi-Player...,Action;Adventure;Indie;RPG,Dinosaurs;Action;FPS,251,22057,7558,263,98,2000000-5000000,0.79,action;adventure;indie;rpg; dinosaurs;action;f...,orion: prelude
7,588430,Fallout Shelter,2017-03-29,1,Bethesda Game Studios,Bethesda Softworks,windows,0,Single-player;Steam Achievements;Full controll...,Free to Play;RPG;Simulation,Free to Play;Survival;Base-Building,35,21481,3569,1116,93,2000000-5000000,0.0,free to play;rpg;simulation; free to play;surv...,fallout shelter
8,644560,Mirror,2018-04-19,1,KAGAMI WORKs,Paradise Project,windows;mac,0,Single-player;Steam Achievements;Steam Trading...,Adventure;Indie;RPG,Sexual Content;Nudity;Mature,114,21474,702,281,291,500000-1000000,1.69,adventure;indie;rpg; sexual content;nudity;mat...,mirror
9,280790,Creativerse,2017-05-08,1,Playful Corp.,Playful Corp.,windows;mac,0,Single-player;Multi-player;Online Multi-Player...,Action;Adventure;Casual;Free to Play;Indie;Mas...,Free to Play;Survival;Multiplayer,0,19114,4911,3292,138,2000000-5000000,0.0,action;adventure;casual;free to play;indie;mas...,creativerse


In [11]:
from ipywidgets import Text, FloatText, IntText, Button, HBox, VBox, Output
from IPython.display import display, clear_output

# ----- Widgets -----
# Free-text inputs: a blank field means "not provided"; the values are parsed
# and validated in the button callback.
appid_box = Text(description="AppID:", placeholder="leave empty for none", value="")
genre_box = Text(description="Genre:", placeholder="e.g. Action, RPG", value="")
max_price_box = Text(description="Max price:", placeholder="e.g. 10.0", value="")
min_pos_box = Text(description="Min +ve:", placeholder="e.g. 100", value="")

# Number of rows to return
n_box = IntText(description="Results:", value=10)

run_button = Button(
    tooltip="Click to get recommended games",
    button_style="primary",
    description="Get recommendations"
)

# Output area that the callback prints and displays into
out_area = Output()

# ----- Button callback -----
def on_run_button_clicked(b):
    """
    Click handler for ``run_button``: read the form, validate it, show results.

    Everything is printed or displayed inside ``out_area``, which is cleared at
    the start of each click. Blank AppID / Max price / Min +ve fields mean
    "not provided". Invalid input prints a message and stops without querying.

    Parameters
    ----------
    b : the widget that fired the event (unused).
    """
    with out_area:
        clear_output()

        # AppID: optional integer
        appid_text = appid_box.value.strip()
        appid_val = None
        if appid_text:
            try:
                appid_val = int(appid_text)
            except ValueError:  # only parse failures; never mask KeyboardInterrupt etc.
                print("Invalid AppID. It must be an integer.")
                return

        # Genre: optional free text
        genre_val = genre_box.value.strip() or None

        # Max price: optional number
        price_text = max_price_box.value.strip()
        max_price_val = None
        if price_text:
            try:
                max_price_val = float(price_text)
            except ValueError:
                print("Invalid price. Must be a number.")
                return
            # float() accepts "nan"; every `price <= nan` comparison is False,
            # which would silently return no games at all.
            if np.isnan(max_price_val):
                print("Invalid price. Must be a number.")
                return

        # Minimum positive ratings: optional integer, default 0 (no minimum)
        min_pos_text = min_pos_box.value.strip()
        min_pos_val = 0
        if min_pos_text:
            try:
                min_pos_val = int(min_pos_text)
            except ValueError:
                print("Invalid minimum positive rating value.")
                return

        # Result count: DataFrame.head() with a negative n returns all rows
        # except the last |n|, so a negative count would dump almost the whole
        # dataset; zero would always look like "nothing found".
        n_val = n_box.value if n_box.value is not None else 10
        if n_val < 1:
            print("Number of results must be at least 1.")
            return

        try:
            results = recommend_with_filters(
                appid=appid_val,
                genre=genre_val,
                max_price=max_price_val,
                min_positive_ratings=min_pos_val,
                n_recommendations=n_val
            )

            if results.empty:
                print("No games found matching your criteria.")
            else:
                display(results)

        except Exception as e:
            # Report failures (e.g. an AppID missing from the dataset) in the
            # output area rather than as a traceback.
            print("Error:", e)

# Register the click handler on the button
run_button.on_click(on_run_button_clicked)

# ----- Layout -----
# Rows, top to bottom: seed + genre, price + popularity, result count,
# the run button, and the output area
form = VBox(children=[
    HBox(children=[appid_box, genre_box]),
    HBox(children=[max_price_box, min_pos_box]),
    HBox(children=[n_box]),
    run_button,
    out_area,
])

display(form)


VBox(children=(HBox(children=(Text(value='', description='AppID:', placeholder='leave empty for none'), Text(v…