In [2]:
pip install dash --upgrade blinker --ignore-installed plotly pandas numpy scipy scikit-learn statsmodels seaborn matplotlib joblib pingouin

Collecting dash
  Downloading dash-3.0.2-py3-none-any.whl.metadata (10 kB)
Collecting blinker
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting plotly
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
 

In [2]:
import dash
from dash import dcc, html, Input, Output, Dash
import plotly.express as px
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from scipy.stats import ttest_ind
from scipy import sparse
from scipy.sparse import csr_matrix
import pingouin as pg
from statsmodels.tsa.seasonal import seasonal_decompose
from typing import Tuple, Union, List
from sklearn.metrics.pairwise import cosine_similarity

# --- Data Preprocessing ---
def _numeric_column(frame, name):
    """Return column `name` coerced to numbers (unparseable -> 0); all zeros if the column is absent."""
    if name not in frame.columns:
        return pd.Series(0.0, index=frame.index)
    return pd.to_numeric(frame[name], errors='coerce').fillna(0)


def _flag_column(frame, name):
    """Return column `name` as 0/1 integers; all zeros if the column is absent."""
    if name not in frame.columns:
        return pd.Series(0, index=frame.index, dtype=int)
    column = frame[name]
    if pd.api.types.is_bool_dtype(column):
        return column.fillna(False).astype(int)
    if pd.api.types.is_numeric_dtype(column):
        return column.fillna(0).astype(int)
    # Text or mixed column (e.g. "True"/"False" strings, or booleans with gaps).
    truthy = {'true', 't', 'yes', 'y', '1'}
    return column.map(lambda value: str(value).strip().lower() in truthy).astype(int)


def load_and_clean_data(file_path="games.csv"):
    """Read the games CSV and return a cleaned, model-ready DataFrame.

    Steps performed:
      * require the columns ``title``, ``rating`` and ``positive_ratio``;
      * parse ``date_release`` (NaT when missing or unparseable) and derive ``year``;
      * drop rows lacking a required value, de-duplicate on ``title`` and reset the
        index so row labels equal row positions;
      * add ``rating_encoded`` (label-encoded ``rating``);
      * coerce ``price_final``, ``discount``, ``positive_ratio`` and ``user_reviews``
        to numbers and ``win``/``mac``/``linux``/``steam_deck`` to 0/1 flags, creating
        any of these that the file lacks as all-zero columns;
      * standardise ``price_final``, ``discount`` and ``positive_ratio`` with
        ``StandardScaler`` -- after this call those columns hold z-scores, not raw
        dollars / percentages.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame, or an empty DataFrame if anything fails (the error
        is printed rather than raised so callers can test ``df.empty``).
    """
    try:
        df = pd.read_csv(file_path)

        # Validate before touching any optional column so the error is meaningful.
        required_cols = ['title', 'rating', 'positive_ratio']
        if not all(col in df.columns for col in required_cols):
            raise ValueError("Dataset missing required columns.")

        if 'date_release' in df.columns:
            df['date_release'] = pd.to_datetime(df['date_release'], errors='coerce')
        else:
            df['date_release'] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')

        # reset_index keeps labels aligned with positions after rows are dropped.
        df = (
            df.dropna(subset=required_cols)
              .drop_duplicates(subset=['title'])
              .reset_index(drop=True)
        )
        df['year'] = df['date_release'].dt.year

        # Ensure numeric and boolean types for recommendation matrix columns
        le_rating = LabelEncoder()
        df['rating_encoded'] = le_rating.fit_transform(df['rating'].astype(str))
        for col in ('price_final', 'discount', 'positive_ratio', 'user_reviews'):
            df[col] = _numeric_column(df, col)
        for col in ('win', 'mac', 'linux', 'steam_deck'):
            df[col] = _flag_column(df, col)

        scaler = StandardScaler()
        numerical_cols = ['price_final', 'discount', 'positive_ratio']
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols].astype(float))
        print(f"Data cleaned, shape: {df.shape}")
        print(f"Sample data: {df[['year', 'price_final', 'discount', 'win', 'mac', 'linux']].head()}")
        return df
    except Exception as e:
        print(f"Error during data cleaning: {str(e)}")
        return pd.DataFrame()

# --- Rating Analysis ---
def analyze_ratings(df, selected_title=None):
    """Summarise how games are distributed across rating categories.

    Args:
        df: DataFrame with at least ``rating`` and ``positive_ratio`` columns.
        selected_title: Optional game title; when it is present in ``df['title']``
            the rating category of that game is flagged in the counts table.

    Returns:
        A 3-tuple ``(rating_counts, avg_positive_by_rating, stats_summary)``:
          * ``rating_counts``: columns ``rating``, ``count`` and ``selected``
            (``'Selected'`` for the chosen game's category, else ``'Others'``);
          * ``avg_positive_by_rating``: Series of mean ``positive_ratio`` per rating;
          * ``stats_summary``: per-rating ``mean``, ``std``, ``count`` plus
            ``ci_lower`` / ``ci_upper``, a 95% Student-t confidence interval for the
            mean (collapses to the mean for categories with a single game).
        ``(None, None, None)`` if required columns are missing or an error occurs.
    """
    from scipy.stats import t as student_t  # only this function needs the t quantile

    if not all(col in df.columns for col in ['rating', 'positive_ratio']):
        return None, None, None
    try:
        rating_counts = df['rating'].value_counts().reset_index()
        rating_counts.columns = ['rating', 'count']

        grouped = df.groupby('rating')['positive_ratio']
        avg_positive_by_rating = grouped.mean()
        stats_summary = grouped.agg(['mean', 'std', 'count']).reset_index()
        stats_summary = stats_summary[stats_summary['count'] > 0].copy()
        stats_summary['std'] = stats_summary['std'].fillna(0)

        # Two-sided interval: mean +/- t_crit * std / sqrt(n), with n - 1 degrees of freedom.
        confidence_level = 0.95
        n = stats_summary['count'].astype(float)
        t_crit = student_t.ppf(0.5 + confidence_level / 2, (n - 1).clip(lower=1))
        margin = np.where(n > 1, t_crit * stats_summary['std'] / np.sqrt(n), 0.0)
        stats_summary['ci_lower'] = stats_summary['mean'] - margin
        stats_summary['ci_upper'] = stats_summary['mean'] + margin

        has_selection = (
            bool(selected_title)
            and 'title' in df.columns
            and selected_title in df['title'].values
        )
        if has_selection:
            selected_rating = df[df['title'] == selected_title]['rating'].iloc[0]
            rating_counts['selected'] = rating_counts['rating'].apply(
                lambda x: 'Selected' if x == selected_rating else 'Others')
        else:
            rating_counts['selected'] = 'Others'

        return rating_counts, avg_positive_by_rating, stats_summary
    except Exception as e:
        print(f"Error in rating analysis: {str(e)}")
        return None, None, None

# --- Price/Discount Trends ---
def price_trends(df, selected_title=None, input_price=None, input_discount=None):
    """Compute yearly average price/discount and a price trend line.

    The input frame is never modified; all coercion happens on a private copy.

    Args:
        df: DataFrame with ``year``, ``price_final``, ``discount`` and
            ``date_release`` columns (``title`` is needed only for the override).
        selected_title: Optional title. When it is found in ``df['title']`` the
            averages for that game's release year are overwritten (see below).
        input_price: Price to substitute for the selected game's year.
        input_discount: Discount to substitute for the selected game's year.
            Both inputs must be given for them to be used; otherwise the selected
            game's own ``price_final`` / ``discount`` are substituted.

    Returns:
        DataFrame with columns ``year``, ``price_final``, ``discount`` and
        ``trend`` (NaNs replaced by 0). ``trend`` comes from
        ``seasonal_decompose`` on the year-end resampled mean price when
        ``date_release`` is a datetime column with more than one year of data,
        and otherwise equals ``price_final``. On missing columns or any error a
        one-row frame of zeros with columns ``year``, ``price_final``,
        ``discount`` is returned.
    """
    required_cols = ['year', 'price_final', 'discount', 'date_release']
    if not all(col in df.columns for col in required_cols):
        print(f"Missing columns for price trends: {set(required_cols) - set(df.columns)}")
        return pd.DataFrame({'year': [0], 'price_final': [0.0], 'discount': [0.0]})
    try:
        # Work on a copy so repeated callback invocations never alter the shared frame.
        work = df[required_cols].copy()
        work['price_final'] = pd.to_numeric(work['price_final'], errors='coerce').fillna(0)
        work['discount'] = pd.to_numeric(work['discount'], errors='coerce').fillna(0)
        years = pd.to_numeric(work['year'], errors='coerce')
        work['year'] = years.fillna(0 if years.isna().all() else years.min())

        yearly_avg = work.groupby('year')[['price_final', 'discount']].mean().reset_index()
        correlation = work['price_final'].corr(work['discount'])
        print(f"Correlation between price and discount: {correlation:.4f}")

        # Default trend; replaced below when a decomposition is possible.
        yearly_avg['trend'] = yearly_avg['price_final']
        if pd.api.types.is_datetime64_any_dtype(work['date_release']):
            ts = (
                work.dropna(subset=['date_release', 'price_final'])
                    .set_index('date_release')['price_final']
                    .resample('YE').mean().dropna()
            )
            if len(ts) > 1:
                trend = seasonal_decompose(ts, model='additive', period=1).trend
                # Key the trend by calendar year and look it up per row. Assigning the
                # datetime-indexed series directly would align on index labels and
                # yield only NaN.
                trend_by_year = pd.Series(trend.to_numpy(), index=trend.index.year)
                mapped = yearly_avg['year'].astype(int).map(trend_by_year)
                yearly_avg['trend'] = mapped.ffill().bfill()

        if selected_title and 'title' in df.columns:
            match = (df['title'] == selected_title).to_numpy()
            if match.any():
                selected_game = work.loc[match, ['year', 'price_final', 'discount']].iloc[0]
                same_year = yearly_avg['year'] == selected_game['year']
                if input_price is not None and input_discount is not None:
                    yearly_avg.loc[same_year, 'price_final'] = input_price
                    yearly_avg.loc[same_year, 'discount'] = input_discount
                else:
                    yearly_avg.loc[same_year, 'price_final'] = selected_game['price_final']
                    yearly_avg.loc[same_year, 'discount'] = selected_game['discount']

        yearly_avg = yearly_avg.fillna(0)
        print(f"Price trends DataFrame: {yearly_avg}")
        return yearly_avg
    except Exception as e:
        print(f"Error in price trend analysis: {str(e)}")
        return pd.DataFrame({'year': [0], 'price_final': [0.0], 'discount': [0.0]})

# --- Recommendation System (Fixed) ---
def build_recommendation_matrix(df, method='cosine'):
    """Build a pairwise game-similarity matrix from numeric game features.

    Args:
        df: DataFrame containing ``rating_encoded``, ``positive_ratio``,
            ``price_final``, ``win``, ``mac`` and ``linux``.
        method: Accepted for API compatibility; the body does not read it and
            cosine similarity is always used.

    Returns:
        The result of ``cosine_similarity(..., dense_output=False)`` over the
        feature rows, or ``None`` when a column is missing or an error occurs.
    """
    required_cols = ['rating_encoded', 'positive_ratio', 'price_final', 'win', 'mac', 'linux']
    if any(col not in df.columns for col in required_cols):
        print(f"Missing columns for recommendation matrix: {set(required_cols) - set(df.columns)}")
        return None

    try:
        # Coerce every feature to a number, treating unparseable cells as 0.
        features = df[required_cols].copy()
        for name in required_cols:
            features[name] = pd.to_numeric(features[name], errors='coerce').fillna(0)

        as_float = features.values.astype(float)
        similarity_matrix = cosine_similarity(csr_matrix(as_float), dense_output=False)
        print(f"Recommendation matrix built with shape: {similarity_matrix.shape}")
    except Exception as e:
        print(f"Error building recommendation matrix: {str(e)}")
        return None
    return similarity_matrix

def recommend_games(title, df, similarity_matrix, n_recommendations=5):
    """Return the titles most similar to ``title``.

    Args:
        title: Title of the game to find neighbours for.
        df: DataFrame with a ``title`` column whose row order matches the rows
            of ``similarity_matrix``.
        similarity_matrix: Square similarity matrix, either a SciPy sparse
            matrix or a dense array-like.
        n_recommendations: Maximum number of titles to return.

    Returns:
        Up to ``n_recommendations`` titles ordered by decreasing similarity
        (ties keep row order), never including the query game itself.
        ``["Game not found"]`` if ``title`` is not in ``df``;
        ``["Error in recommendation"]`` if the lookup fails.
    """
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return ["Game not found"]
    try:
        # Row *position*, not index label: the matrix and .iloc are both positional,
        # and labels differ from positions once rows have been dropped from df.
        pos = int(matches[0])
        row = similarity_matrix[pos]
        if sparse.issparse(row):
            row = row.toarray()
        scores = np.asarray(row, dtype=float).ravel()

        # Stable sort keeps row order among equal scores. Removing the query row
        # explicitly (rather than skipping the first hit) stays correct when another
        # game ties with it at the top.
        ranked = np.argsort(-scores, kind='stable')
        ranked = ranked[ranked != pos][:max(n_recommendations, 0)]
        return df['title'].iloc[ranked].drop_duplicates().tolist()
    except Exception as e:
        print(f"Error in recommendations: {str(e)}")
        return ["Error in recommendation"]

# --- Success Prediction ---
def train_success_model(df):
    """Fit a random-forest regressor that predicts ``positive_ratio``.

    Features are ``price_final``, ``discount``, ``user_reviews``, ``win``,
    ``mac``, ``linux`` and ``steam_deck`` (missing values filled with 0); 20% of
    the rows are held out for scoring.

    Returns:
        ``(model, score, 0)`` where ``score`` is ``model.score`` on the held-out
        split, or ``(None, None, None)`` when a column is missing or training
        fails.
    """
    feature_cols = ['price_final', 'discount', 'user_reviews', 'win', 'mac', 'linux', 'steam_deck']
    target_col = 'positive_ratio'
    if any(name not in df.columns for name in feature_cols + [target_col]):
        return None, None, None

    try:
        features = df[feature_cols].fillna(0)
        target = df[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )
        regressor = RandomForestRegressor(n_estimators=100, random_state=42)
        regressor.fit(X_train, y_train)
        holdout_score = regressor.score(X_test, y_test)
    except Exception as e:
        print(f"Error in success prediction: {str(e)}")
        return None, None, None
    return regressor, holdout_score, 0

# --- Steam Deck Impact ---
def steam_deck_impact(df):
    """Compare ``positive_ratio`` between Steam Deck and non-Steam Deck games.

    Runs Welch's t-test (``equal_var=False``) and computes Cohen's d via
    pingouin on the two groups selected by ``steam_deck == True`` and
    ``steam_deck == False``.

    Returns:
        ``(t_stat, p_val, effect_size, None)``; four ``None`` values when a
        column is missing, either group has fewer than two values, or an error
        occurs.
    """
    unavailable = (None, None, None, None)
    if not {'steam_deck', 'positive_ratio'}.issubset(df.columns):
        return unavailable

    try:
        on_deck = df['steam_deck'] == True
        off_deck = df['steam_deck'] == False
        deck_games = df.loc[on_deck, 'positive_ratio'].dropna()
        no_deck_games = df.loc[off_deck, 'positive_ratio'].dropna()

        # Both statistics need at least two observations per group.
        if min(len(deck_games), len(no_deck_games)) < 2:
            return unavailable

        t_stat, p_val = ttest_ind(deck_games, no_deck_games, equal_var=False)
        effect_size = pg.compute_effsize(deck_games, no_deck_games, eftype='cohen')
    except Exception as e:
        print(f"Error in Steam Deck impact analysis: {str(e)}")
        return unavailable
    return t_stat, p_val, effect_size, None

# --- Tag Prediction ---
def train_tag_model(df, text_column='title'):
    """Fit a random-forest classifier that predicts ``rating`` from text.

    The text in ``text_column`` is vectorised with TF-IDF (at most 1000 features,
    English stop words removed); 20% of the rows are held out for scoring.

    Args:
        df: DataFrame containing ``text_column`` and ``rating``.
        text_column: Name of the column holding the text to vectorise.

    Returns:
        ``(model, tfidf, accuracy)`` with ``accuracy`` measured on the held-out
        split, or ``(None, None, None)`` when a column is missing or training
        fails.
    """
    if not {text_column, 'rating'}.issubset(df.columns):
        return None, None, None

    try:
        tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
        documents = df[text_column].fillna('').astype(str)
        text_features = tfidf.fit_transform(documents)
        labels = df['rating']
        X_train, X_test, y_train, y_test = train_test_split(
            text_features, labels, test_size=0.2, random_state=42
        )
        classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        classifier.fit(X_train, y_train)
        accuracy = classifier.score(X_test, y_test)
    except Exception as e:
        print(f"Error in tag prediction: {str(e)}")
        return None, None, None
    return classifier, tfidf, accuracy

# --- Dynamic Pricing ---
def train_pricing_model(df):
    """Fit a random-forest regressor that predicts ``price_final``.

    Features are ``positive_ratio``, ``user_reviews``, ``win``, ``mac``,
    ``linux`` and ``steam_deck`` (missing values filled with 0); 20% of the rows
    are held out for scoring.

    Returns:
        ``(model, score, 0)`` where ``score`` is ``model.score`` on the held-out
        split, or ``(None, None, None)`` when a column is missing or training
        fails.
    """
    feature_cols = ['positive_ratio', 'user_reviews', 'win', 'mac', 'linux', 'steam_deck']
    target_col = 'price_final'
    if any(name not in df.columns for name in feature_cols + [target_col]):
        return None, None, None

    try:
        features = df[feature_cols].fillna(0)
        target = df[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )
        regressor = RandomForestRegressor(n_estimators=100, random_state=42)
        regressor.fit(X_train, y_train)
        holdout_score = regressor.score(X_test, y_test)
    except Exception as e:
        print(f"Error in pricing model: {str(e)}")
        return None, None, None
    return regressor, holdout_score, 0

# --- Load Data and Models ---
# Load the dataset once; every artifact below is derived from this frame.
df = load_and_clean_data("games.csv")

if df.empty:
    print("Failed to load data. Using empty DataFrame.")
    # Nothing can be trained without data; the dashboard checks for None.
    cosine_sim = None
    success_model = pricing_model = None
    tag_model = tfidf = None
else:
    cosine_sim = build_recommendation_matrix(df)
    success_model, _, _ = train_success_model(df)
    pricing_model, _, _ = train_pricing_model(df)
    tag_model, tfidf, _ = train_tag_model(df)

# Persist whichever artifacts were built; existing files are overwritten.
for artifact, filename in (
    (cosine_sim, 'recommendation_matrix.joblib'),
    (success_model, 'success_model.joblib'),
    (pricing_model, 'pricing_model.joblib'),
):
    if artifact is not None:
        joblib.dump(artifact, filename)

# The classifier is only usable together with the vectoriser it was fitted with.
if tag_model is not None and tfidf is not None:
    joblib.dump(tag_model, 'tag_model.joblib')
    joblib.dump(tfidf, 'tfidf.joblib')

# Initialize Dash app
app = dash.Dash(__name__)

# Layout: three inputs (title, price, discount) followed by the two charts and
# the five text panels that the dashboard callback fills in.
app.layout = html.Div(
    children=[
        html.H1("Steam Game Analytics and Recommendation Platform"),
        html.P("Analyze, predict, and recommend Steam games!"),

        html.Label("Game Title:"),
        dcc.Input(
            id="title-input",
            type="text",
            placeholder="e.g., Dungeon of the ENDLESS™",
            value="",
        ),

        html.Label("Price ($):"),
        dcc.Slider(
            id="price-input",
            min=0,
            max=100,
            step=1,
            value=10,
            marks={tick: str(tick) for tick in range(0, 101, 20)},
        ),

        html.Label("Discount (%):"),
        dcc.Slider(
            id="discount-input",
            min=0,
            max=100,
            step=1,
            value=0,
            marks={tick: str(tick) for tick in range(0, 101, 20)},
        ),

        # Output components, in the order the callback returns their values.
        *[dcc.Graph(id=graph_id) for graph_id in ("rating-distribution", "price-trends")],
        *[
            html.Div(id=panel_id)
            for panel_id in (
                "recommendations-output",
                "success-prediction-output",
                "steam-deck-impact-output",
                "tag-prediction-output",
                "pricing-suggestion-output",
            )
        ],
    ]
)

# Callback to update dashboard
@app.callback(
    [Output("rating-distribution", "figure"),
     Output("price-trends", "figure"),
     Output("recommendations-output", "children"),
     Output("success-prediction-output", "children"),
     Output("steam-deck-impact-output", "children"),
     Output("tag-prediction-output", "children"),
     Output("pricing-suggestion-output", "children")],
    [Input("title-input", "value"),
     Input("price-input", "value"),
     Input("discount-input", "value")]
)
def update_dashboard(title_input: str, price_input: float, discount_input: float) -> Tuple:
    """Recompute every dashboard panel for the current inputs.

    Args:
        title_input: Game title typed by the user; must match a title in the
            module-level ``df`` (when ``df`` is non-empty).
        price_input: Value of the price slider.
        discount_input: Value of the discount slider.

    Returns:
        A 7-tuple matching the callback outputs: rating figure, price-trend
        figure, recommendations, success prediction, Steam Deck impact, tag
        prediction and pricing suggestion. On invalid input or any exception the
        two figures are placeholder charts and the five text slots carry the
        error message.
    """
    print(f"Callback triggered with: Title={title_input}, Price={price_input}, Discount={discount_input}")

    # Default outputs in case of errors
    default_fig = px.bar(title="Error: Unable to Process")
    default_text = "Error: Unable to Process"

    # Input validation
    if not title_input or (not df.empty and title_input not in df['title'].values):
        error_msg = f"Error: Game '{title_input}' not found or invalid input"
        print(f"Returning error: {error_msg}")
        return (default_fig, default_fig, error_msg, error_msg, error_msg, error_msg, error_msg)

    try:
        # Rating Analysis
        rating_counts, _, _ = analyze_ratings(df, title_input)
        if rating_counts is None:
            print("Rating analysis returned None")
            rating_fig = default_fig
        else:
            rating_fig = px.bar(rating_counts, x="rating", y="count", color="selected",
                                title="Rating Distribution (Selected Game Highlighted)",
                                color_discrete_map={'Selected': 'red', 'Others': 'blue'})

        # Price Trends
        trends = price_trends(df, title_input, price_input, discount_input)
        if trends is None or trends.empty:
            print("Price trends returned None or empty")
            trends_fig = default_fig
        else:
            trends_fig = px.line(trends, x="year", y=["price_final", "discount"],
                                 title=f"Price/Discount Trends (Adjusted for {title_input})")
            trends_fig.update_layout(yaxis_range=[trends[['price_final', 'discount']].min().min() - 1,
                                                  trends[['price_final', 'discount']].max().max() + 1])

        # Recommendations
        if cosine_sim is None or df.empty:
            recs_output = "Recommendation matrix unavailable"
        else:
            recs = recommend_games(title_input, df, cosine_sim)
            # recommend_games reports failure through these sentinel lists. Match them
            # exactly so a title that merely contains "Error" is not misread, and so
            # an empty result cannot raise.
            failed = not recs or recs in (["Game not found"], ["Error in recommendation"])
            if failed:
                print("Recommendations failed")
                recs_output = "No recommendations available"
            else:
                recs_output = html.Ul([html.Li(game) for game in recs])

        # Success Prediction
        # Bound up front: the pricing step below reads it even when no success
        # model is available.
        success_pred = None
        if success_model is None or df.empty:
            success_output = "Success prediction unavailable"
        else:
            game_data = df[df['title'] == title_input][['price_final', 'discount', 'user_reviews', 'win', 'mac', 'linux', 'steam_deck']].iloc[0]
            game_data_df = pd.DataFrame([game_data], columns=['price_final', 'discount', 'user_reviews', 'win', 'mac', 'linux', 'steam_deck'])
            success_pred = success_model.predict(game_data_df)[0]
            # NOTE(review): load_and_clean_data standardises positive_ratio, so this
            # value is a z-score even though it is shown with a '%' suffix.
            success_output = f"Predicted Positive Ratio: {success_pred:.2f}%"

        # Steam Deck Impact
        t_stat, p_val, effect_size, _ = steam_deck_impact(df)
        if p_val is None:
            print("Steam Deck impact returned None")
            deck_impact = "Steam Deck impact analysis failed"
        else:
            deck_impact = f"Steam Deck Impact - p-value: {p_val:.4f}, Effect Size: {effect_size:.4f}"

        # Tag Prediction
        if tfidf is None or tag_model is None or df.empty:
            tag_output = "No tag prediction available"
        else:
            title_vec = tfidf.transform([title_input]).toarray()
            tag_pred = tag_model.predict(title_vec)[0]
            tag_output = f"Inferred Rating/Tag: {tag_pred}"

        # Dynamic Pricing
        if pricing_model is None or df.empty:
            pricing_output = f"Suggested Price: ${price_input:.2f} (default)"
        else:
            # Platform flags are fixed (win=1, mac=0, linux=0, steam_deck=1) rather
            # than taken from the selected game.
            pricing_input = [
                success_pred if success_pred is not None else 0,
                df['user_reviews'].mean(),
                1, 0, 0, 1,
            ]
            pricing_input_df = pd.DataFrame([pricing_input], columns=['positive_ratio', 'user_reviews', 'win', 'mac', 'linux', 'steam_deck'])
            price_suggestion = pricing_model.predict(pricing_input_df)[0]
            # NOTE(review): price_final is standardised in load_and_clean_data, so the
            # model output is in z-score units, not dollars.
            pricing_output = f"Suggested Price: ${price_suggestion:.2f}"

        print("Callback returning successful outputs")
        return (rating_fig, trends_fig, recs_output, success_output, deck_impact, tag_output, pricing_output)

    except Exception as e:
        error_msg = f"Error in callback: {str(e)}"
        print(error_msg)
        return (default_fig, default_fig, error_msg, error_msg, error_msg, error_msg, error_msg)

# Run the app
if __name__ == "__main__":
    app.run(debug=True)

Data cleaned, shape: (50751, 15)
Sample data:    year  price_final  discount  win  mac  linux
0  2008     0.117803 -0.300677    1    0      0
1  2011    -0.489831 -0.300677    1    0      0
2  2013     0.551827 -0.300677    1    1      1
3  2014     0.551827 -0.300677    1    0      0
4  2014     0.291412 -0.300677    1    1      0
Recommendation matrix built with shape: (50751, 50751)


<IPython.core.display.Javascript object>