In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, fname) for fname in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (./) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Summary of Findings

After exploring this dataset and building predictive models, here are the main takeaways:

**The Steam gaming landscape is dominated by a few massive titles.** Counter-Strike 2 and PUBG absolutely dwarf everything else, averaging over 300K players each. Meanwhile, the median game has only about 10 players - the distribution is extremely skewed.

**Game populations are surprisingly stable.** This was the biggest surprise. Month-to-month, player counts barely change. We found that 87% of the variance in next month's player count can be explained just by looking at this month's numbers. Games tend to maintain their playerbase rather than experiencing wild swings.

**Established games are easy to predict, new releases are chaos.** Our models achieved 97%+ accuracy predicting player counts for stable games like CS2, PUBG, and GTA V. But newer titles like HELLDIVERS 2? The model struggled with 57% error because these games don't follow historical patterns yet - they're still in their volatile launch phase.

**A simple rule works surprisingly well.** We compared our Random Forest model against a "dumb" baseline that just predicted "next month will be the same as last month." The naive approach got 98% accuracy. Our fancy model? 97%. Turns out the problem is just inherently easy because gaming populations are so sticky.

**Seasonality exists but is minor.** January sees slightly higher player counts (probably holiday gaming and New Year releases), but the seasonal effect is pretty small compared to the game-to-game differences.

**The platform has grown significantly.** From tracking about 1,000 games in 2012 to over 4,000 in 2025, Steam has massively expanded. But interestingly, the number of tracked games peaked around 2018 and has since declined slightly.

Overall, this analysis revealed that Steam's gaming ecosystem is highly predictable for mature titles, with player counts following strong momentum patterns. The real challenge in forecasting comes from new, viral releases that break the mold.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, IsolationForest
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import stats

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to the figures drawn below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load and Prepare Data

In [None]:
# Read the raw monthly player-count table.
df = pd.read_csv('/kaggle/input/steam-monthly-average-players/steamcharts.csv')

# basic prep: values that fail to parse become NaN / NaT (errors='coerce') instead of raising
df = df.assign(
    gain=lambda d: pd.to_numeric(d['gain'], errors='coerce'),
    date=lambda d: pd.to_datetime(d['month'], format='%b-%y', errors='coerce'),
    year=lambda d: d['date'].dt.year,
    month_num=lambda d: d['date'].dt.month,
)

# sort by game and date
df = df.sort_values(by=['steam_appid', 'date'])

print(f"Loaded {len(df):,} records")

# 1. Game Lifecycle Clustering

Group games based on their player count trajectories over time.

## Feature Engineering for Clustering

In [None]:
# calculate trajectory features for each game
def calculate_trajectory_features(group):
    """Summarise one game's player history as a single row of trajectory features.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one game. Must contain 'avg_players', 'gain' and
        'gain_percent'. If a 'date' column is present the rows are ordered by
        it before the "final" value is read.

    Returns
    -------
    pandas.Series or None
        None when the group has fewer than 3 rows. Otherwise a Series with the
        keys 'mean_players', 'max_players', 'min_players', 'std_players', 'cv',
        'total_months', 'growth_rate', 'peak_to_mean_ratio',
        'months_declining' and 'final_to_peak_ratio'.
    """
    if len(group) < 3:
        return None

    # Order chronologically here (as create_prediction_features does) so that
    # 'final_to_peak_ratio' does not silently depend on the caller's row order.
    # A stable sort keeps the incoming order for rows that share a date.
    if 'date' in group.columns:
        group = group.sort_values('date', kind='stable')

    players = group['avg_players']
    n_rows = len(group)
    mean_players = players.mean()
    max_players = players.max()
    std_players = players.std()

    features = {
        'mean_players': mean_players,
        'max_players': max_players,
        'min_players': players.min(),
        'std_players': std_players,
        # coefficient of variation; the +1 in the denominator avoids division by zero
        'cv': std_players / (mean_players + 1),
        'total_months': n_rows,
        'growth_rate': group['gain_percent'].mean(),
        'peak_to_mean_ratio': max_players / (mean_players + 1),
        # share of rows with a negative gain (NaN gains compare False, so they are not counted)
        'months_declining': (group['gain'] < 0).sum() / n_rows,
        'final_to_peak_ratio': players.iloc[-1] / (max_players + 1)
    }

    return pd.Series(features)

# One feature row per game name. Games for which the helper returns None are
# expected to contribute no row (NOTE(review): confirm for the pandas version in use);
# dropna() then removes any game that still has a missing feature value.
per_game_features = df.groupby('name').apply(calculate_trajectory_features)
game_features = per_game_features.dropna()
print(f"Created features for {len(game_features)} games")
game_features.head()

In [None]:
# filter out games with very low player counts to focus on meaningful patterns
has_enough_players = game_features['mean_players'] >= 5
# .copy() so the 'cluster' column added later does not write into game_features
game_features_filtered = game_features.loc[has_enough_players].copy()
print(f"Filtered to {len(game_features_filtered)} games with mean >= 5 players")

## Apply K-Means Clustering

In [None]:
# standardize features
# A later cell adds a 'cluster' label column to game_features_filtered. Select the
# feature columns explicitly so that re-running this cell afterwards does not feed
# the previous cluster assignments back into the scaler (and KMeans) as a feature.
cluster_feature_cols = [col for col in game_features_filtered.columns if col != 'cluster']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(game_features_filtered[cluster_feature_cols])

# elbow method to find optimal k: record the KMeans inertia for each candidate k
k_range = range(2, 11)
inertias = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(features_scaled)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, marker='o', linewidth=2)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# use k=5 clusters based on elbow
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(features_scaled)
# labels_ holds the cluster index assigned to each row of features_scaled by the fit
game_features_filtered['cluster'] = kmeans.labels_

cluster_sizes = game_features_filtered['cluster'].value_counts().sort_index()
print("Cluster distribution:")
print(cluster_sizes)

## Analyze Clusters

In [None]:
# cluster characteristics: per-cluster mean of each listed feature, rounded to 2 decimals
summary_cols = [
    'mean_players',
    'max_players',
    'cv',
    'growth_rate',
    'months_declining',
    'final_to_peak_ratio',
    'total_months',
]
cluster_summary = (
    game_features_filtered
    .groupby('cluster')
    .agg({col: 'mean' for col in summary_cols})
    .round(2)
)

print("Cluster Characteristics:")
print(cluster_summary)

In [None]:
# visualize clusters in 2D by projecting the scaled features onto their first two principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
features_2d = pca.fit_transform(features_scaled)
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    features_2d[:, 0],
    features_2d[:, 1],
    c=game_features_filtered['cluster'],
    cmap='viridis',
    alpha=0.6,
    s=50,
)
ax.set_xlabel(f'PC1 ({pc1_share:.2%} variance)')
ax.set_ylabel(f'PC2 ({pc2_share:.2%} variance)')
ax.set_title('Game Clusters Visualization (PCA)')
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# show example games from each cluster (the frame's index label is printed as the game identifier)
for cluster_id in range(n_clusters):
    print(f"\nCluster {cluster_id} - Top 5 games by avg players:")
    in_cluster = game_features_filtered['cluster'] == cluster_id
    top_games = game_features_filtered[in_cluster].nlargest(5, 'mean_players')
    for game_label, mean_players in top_games['mean_players'].items():
        print(f"  {game_label}: {mean_players:.0f} avg players")

# 2. Next-Month Player Prediction (Regression)

Predict player count for the next month based on recent trends.

## Feature Engineering

In [None]:
# create lag features and rolling stats for each game
def create_prediction_features(group):
    """Return a date-sorted copy of one game's rows with lag, rolling and trend columns added.

    Reads the 'date', 'avg_players' and 'gain' columns. The input frame is not
    modified. Added columns, in order: 'avg_players_lag_{1,2,3,6}' interleaved
    with 'gain_lag_{1,2,3,6}', 'avg_players_roll_3', 'avg_players_roll_6',
    'gain_roll_3', 'trend' and 'momentum'.
    """
    ordered = group.sort_values('date').copy()
    players = ordered['avg_players']
    gains = ordered['gain']

    # values from 1, 2, 3 and 6 rows earlier
    for lag in (1, 2, 3, 6):
        ordered[f'avg_players_lag_{lag}'] = players.shift(lag)
        ordered[f'gain_lag_{lag}'] = gains.shift(lag)

    # trailing means; each window ends on (and includes) the current row
    for window in (3, 6):
        ordered[f'avg_players_roll_{window}'] = players.rolling(window, min_periods=1).mean()
    gain_trailing_mean = gains.rolling(3, min_periods=1).mean()
    ordered['gain_roll_3'] = gain_trailing_mean

    # row-over-row relative change in players
    ordered['trend'] = players.pct_change()
    # NOTE(review): 'momentum' is computed exactly like 'gain_roll_3', so the two columns are duplicates.
    ordered['momentum'] = gain_trailing_mean

    return ordered

# Build the per-game feature frames by iterating over the groups explicitly.
# groupby(...).apply(...) operating on the grouping column is deprecated in recent
# pandas (the warning is hidden by warnings.filterwarnings('ignore')); once the
# grouping column is no longer passed to the function, 'steam_appid' would be
# missing from the result and the later groupby('steam_appid') would fail.
# Iterating the groups always yields the full rows, including 'steam_appid'.
per_game_frames = [
    create_prediction_features(game_rows)
    for _, game_rows in df.groupby('steam_appid')
]
df_features = pd.concat(per_game_frames, ignore_index=True)
print(f"Created features: {len(df_features)} records")

In [None]:
# add time-based features: encode the month number as a sine/cosine pair with period 12
month_angle = 2 * np.pi * df_features['month_num'] / 12
df_features['month_sin'] = np.sin(month_angle)
df_features['month_cos'] = np.cos(month_angle)

# target: next month's player count (the following row of the same steam_appid)
next_row_players = df_features.groupby('steam_appid')['avg_players'].shift(-1)
df_features['target'] = next_row_players

# drop rows with missing values (each game's last row has no following row, hence no target)
df_ml = df_features[df_features['target'].notna()].copy()
print(f"Final dataset: {len(df_ml)} records")

## Train Regression Models

In [None]:
# select features
feature_cols = [
    'avg_players',
    'avg_players_lag_1',
    'avg_players_lag_2',
    'avg_players_lag_3',
    'gain_lag_1',
    'gain_lag_2',
    'avg_players_roll_3',
    'avg_players_roll_6',
    'gain_roll_3',
    'momentum',
    'month_sin',
    'month_cos',
]

# drop rows with NaN in ANY of our feature columns or target
model_cols = feature_cols + ['target', 'date']
df_ml_clean = df_ml.loc[:, model_cols].dropna().copy()

X, y = df_ml_clean[feature_cols], df_ml_clean['target']

print(f"After removing NaNs: {len(X)} records")

# temporal split - train on older data, test on recent
# (rows dated at or before the 80th-percentile date train; later rows test)
split_date = df_ml_clean['date'].quantile(0.8)
train_mask = df_ml_clean['date'] <= split_date
test_mask = df_ml_clean['date'] > split_date

X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

print(f"Train set: {len(X_train)} records")
print(f"Test set: {len(X_test)} records")
print(f"Split date: {split_date}")

In [None]:
# try multiple models: fit each on the training split and score it on the test split
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    y_pred = model.fit(X_train, y_train).predict(X_test)

    scores = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }
    # keep the fitted estimator next to its metrics for the later cells
    results[name] = {**scores, 'model': model}

    print(f"  RMSE: {scores['RMSE']:.2f}")
    print(f"  MAE: {scores['MAE']:.2f}")
    print(f"  R²: {scores['R2']:.4f}")

In [None]:
# visualize predictions vs actual for the model with the lowest test RMSE
best_model_name = min(results, key=lambda k: results[k]['RMSE'])
best_model = results[best_model_name]['model']
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_scatter, ax_resid = axes

# scatter plot with the y = x reference line, both axes on a log scale
diagonal = [y_test.min(), y_test.max()]
ax_scatter.scatter(y_test, y_pred, alpha=0.5)
ax_scatter.plot(diagonal, diagonal, 'r--', lw=2)
ax_scatter.set_xlabel('Actual Players')
ax_scatter.set_ylabel('Predicted Players')
ax_scatter.set_title(f'{best_model_name} - Predictions vs Actual')
ax_scatter.set_xscale('log')
ax_scatter.set_yscale('log')

# residuals (actual minus predicted) against the prediction
ax_resid.scatter(y_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--', lw=2)
ax_resid.set_xlabel('Predicted Players')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')
ax_resid.set_xscale('log')

plt.tight_layout()
plt.show()

In [None]:
# feature importance (for Random Forest), sorted from most to least important
if 'Random Forest' in results:
    rf_model = results['Random Forest']['model']
    feature_importance = (
        pd.DataFrame({
            'feature': feature_cols,
            'importance': rf_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
    )

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance - Random Forest')
    plt.tight_layout()
    plt.show()

### True Forecasting Challenge - Predict Without Current Month Data

The above model achieved ~99% R² because it used current month's player count to predict next month. While accurate, this is somewhat trivial since player counts are sticky.

Let's make it harder: **predict next month using ONLY historical data (no current month)**. This is true forecasting.

In [None]:
# remove current month from features - only use historical lags
# The rolling columns ('avg_players_roll_3', 'avg_players_roll_6', 'gain_roll_3',
# 'momentum') are built from windows that END ON the current row, so they contain
# the current month's players/gain. Keeping them would leak exactly the value this
# experiment claims to exclude, so only shifted (lag) columns and the calendar
# encoding are used here.
forecast_feature_cols = ['avg_players_lag_1', 'avg_players_lag_2', 'avg_players_lag_3',
                         'gain_lag_1', 'gain_lag_2',
                         'month_sin', 'month_cos']

# use same cleaned dataset (positional index so the boolean masks below align)
X_forecast = df_ml_clean[forecast_feature_cols].reset_index(drop=True)
y_forecast = df_ml_clean['target'].reset_index(drop=True)
dates = df_ml_clean['date'].reset_index(drop=True)

# recreate temporal split with aligned indices
train_mask_f = dates <= split_date
test_mask_f = dates > split_date

X_train_f = X_forecast[train_mask_f]
X_test_f = X_forecast[test_mask_f]
y_train_f = y_forecast[train_mask_f]
y_test_f = y_forecast[test_mask_f]

print(f"True Forecasting - Train: {len(X_train_f)}, Test: {len(X_test_f)}")

In [None]:
# train models for true forecasting: same two estimators, fitted on the forecast feature set
forecast_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

forecast_results = {}
for name, model in forecast_models.items():
    print(f"\nTraining {name} (No Current Month)...")
    y_pred_f = model.fit(X_train_f, y_train_f).predict(X_test_f)

    scores_f = {
        'RMSE': np.sqrt(mean_squared_error(y_test_f, y_pred_f)),
        'MAE': mean_absolute_error(y_test_f, y_pred_f),
        'R2': r2_score(y_test_f, y_pred_f),
    }
    # keep the fitted estimator next to its metrics for the later cells
    forecast_results[name] = {**scores_f, 'model': model}

    print(f"  RMSE: {scores_f['RMSE']:.2f}")
    print(f"  MAE: {scores_f['MAE']:.2f}")
    print(f"  R²: {scores_f['R2']:.4f}")

In [None]:
# compare baseline vs true forecasting
# Row order: for each model, the "with current month" scores then the "historical only" scores.
score_sets = (('with current month', results), ('historical only', forecast_results))
comparison_data = [
    {
        'Model': f"{model_name}\n({variant})",
        'R²': scores[model_name]['R2'],
        'MAE': scores[model_name]['MAE'],
    }
    for model_name in ['Linear Regression', 'Random Forest']
    for variant, scores in score_sets
]

comparison_df = pd.DataFrame(comparison_data)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x_pos = np.arange(len(comparison_df))
colors = ['#1f77b4', '#ff7f0e', '#1f77b4', '#ff7f0e']

# one bar chart per metric: (axis, column, y label, title)
panels = [
    (axes[0], 'R²', 'R² Score', 'Model Comparison - R² Score'),
    (axes[1], 'MAE', 'Mean Absolute Error', 'Model Comparison - MAE (Lower is Better)'),
]
for panel_ax, metric, y_label, panel_title in panels:
    panel_ax.bar(x_pos, comparison_df[metric], color=colors, alpha=0.7, edgecolor='black')
    panel_ax.set_xticks(x_pos)
    panel_ax.set_xticklabels(comparison_df['Model'], fontsize=9)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3, axis='y')

# R² panel only: fix the y-range to [0, 1]
axes[0].set_ylim([0, 1])

plt.tight_layout()
plt.show()

In [None]:
# feature importance for true forecasting, sorted from most to least important
rf_forecast = forecast_results['Random Forest']['model']
feature_importance_forecast = (
    pd.DataFrame({
        'feature': forecast_feature_cols,
        'importance': rf_forecast.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance_forecast['feature'],
    feature_importance_forecast['importance'],
)
plt.xlabel('Importance')
plt.title('Feature Importance - True Forecasting (Historical Data Only)')
plt.tight_layout()
plt.show()

print("\nTop 5 Most Important Features:")
print(feature_importance_forecast.head())

In [None]:
# generate predictions from the best model
# NOTE(review): "best" is hard-coded to the Random Forest entry, not chosen by a metric.
rf_entry = forecast_results['Random Forest']
best_forecast_model = rf_entry['model']
y_pred_forecast = best_forecast_model.predict(X_test_f)

print(f"Generated {len(y_pred_forecast)} predictions")

In [None]:
# better approach - keep game identifiers throughout
# recreate test set with all needed columns from df_ml
test_cols = ['date', 'steam_appid', *forecast_feature_cols, 'target']
test_full_data = df_ml[test_cols].dropna()

# recreate the clean dataset with steam_appid (rows after the train/test split date)
is_test_row = test_full_data['date'] > split_date
test_full_data_clean = test_full_data.loc[is_test_row].copy()

# get predictions for this test set
X_test_viz = test_full_data_clean[forecast_feature_cols]
y_test_viz = test_full_data_clean['target']
y_pred_viz = best_forecast_model.predict(X_test_viz)

# create results dataframe: identifiers plus actual and predicted values
test_results = test_full_data_clean[['date', 'steam_appid']].assign(
    actual=y_test_viz.values,
    predicted=y_pred_viz,
)

# merge with game names
appid_names = df[['steam_appid', 'name']].drop_duplicates()
test_results = test_results.merge(appid_names, on='steam_appid', how='left')

print(f"Test results: {len(test_results)} predictions")
print(f"Unique games: {test_results['name'].nunique()}")
print(f"Sample games: {test_results['name'].value_counts().head()}")

# visualize
games_to_visualize = ['PUBG: BATTLEGROUNDS', 'Counter-Strike 2', 'Dota 2', 'Rust']

# Each prediction is made from the row dated `date` but targets the FOLLOWING record
# of that game (target = avg_players shifted by -1). Plotting it at `date` would draw
# the red line one month too early, so shift it to the month it actually predicts.
# (Assumes consecutive monthly records — TODO confirm there are no gaps in the data.)
one_month = pd.DateOffset(months=1)

fig, axes = plt.subplots(len(games_to_visualize), 1, figsize=(14, 12))

for idx, game in enumerate(games_to_visualize):
    game_full = df[df['name'] == game].sort_values('date')
    game_test = test_results[test_results['name'] == game].sort_values('date')

    if len(game_full) > 0:
        axes[idx].plot(game_full['date'], game_full['avg_players'],
                      label='Actual (Full History)', linewidth=2, color='blue', alpha=0.7)

        if len(game_test) > 0:
            axes[idx].plot(game_test['date'] + one_month, game_test['predicted'],
                          label='Model Prediction', linewidth=2.5,
                          color='red', linestyle='--', marker='o', markersize=4)
            print(f"{game}: {len(game_test)} test predictions")
        else:
            print(f"{game}: NO test predictions found")

        axes[idx].axvline(x=split_date, color='green', linestyle=':', linewidth=2, label='Train/Test Split')
        axes[idx].set_ylabel('Average Players')
        axes[idx].set_title(f'{game} - Time Series: Actual vs Predicted')
        axes[idx].legend(loc='best')
        axes[idx].grid(True, alpha=0.3)
    else:
        # no rows at all for this name: leave a placeholder message in the panel
        axes[idx].text(0.5, 0.5, f'{game} - No data available',
                      ha='center', va='center', transform=axes[idx].transAxes)

axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

In [None]:
# TOP 5 GAMES VISUALIZATION
banner = "="*60
print(banner)
print("TOP 5 GAMES - ACTUAL VS PREDICTED")
print(banner)

# get top 5 games by average player count (mean of avg_players over all of a name's rows)
mean_players_by_name = df.groupby('name')['avg_players'].mean()
top_5_games = mean_players_by_name.nlargest(5).index.tolist()
print("\nTop 5 games by average player count:")
for rank, game in enumerate(top_5_games, start=1):
    avg = df.loc[df['name'] == game, 'avg_players'].mean()
    print(f"  {rank}. {game}: {avg:,.0f} avg players")

# visualize top 5
# Each prediction is made from the row dated `date` but targets the FOLLOWING record
# of that game (target = avg_players shifted by -1). Plotting it at `date` would draw
# the red line one month too early, so shift it to the month it actually predicts.
# (Assumes consecutive monthly records — TODO confirm there are no gaps in the data.)
one_month = pd.DateOffset(months=1)

fig, axes = plt.subplots(len(top_5_games), 1, figsize=(14, 14))

for idx, game in enumerate(top_5_games):
    game_full = df[df['name'] == game].sort_values('date')
    game_test = test_results[test_results['name'] == game].sort_values('date')

    if len(game_full) > 0:
        axes[idx].plot(game_full['date'], game_full['avg_players'],
                      label='Actual (Full History)', linewidth=2, color='blue', alpha=0.7)

        if len(game_test) > 0:
            axes[idx].plot(game_test['date'] + one_month, game_test['predicted'],
                          label='Model Prediction', linewidth=2.5,
                          color='red', linestyle='--', marker='o', markersize=4)

            # calculate accuracy for this game
            abs_error = np.abs(game_test['actual'] - game_test['predicted'])
            game_mae = abs_error.mean()
            # percentage error with +1 in the denominator to avoid division by zero
            game_mape = (np.abs((game_test['actual'] - game_test['predicted']) / (game_test['actual'] + 1)).mean() * 100)
            print(f"\n{game}:")
            print(f"  Test predictions: {len(game_test)}")
            print(f"  MAE: {game_mae:.2f} players")
            print(f"  MAPE: {game_mape:.2f}%")

        axes[idx].axvline(x=split_date, color='green', linestyle=':', linewidth=2,
                         label='Train/Test Split', alpha=0.6)
        axes[idx].set_ylabel('Average Players')
        axes[idx].set_title(f'#{idx+1}: {game} - Actual vs Predicted Player Counts')
        axes[idx].legend(loc='best')
        axes[idx].grid(True, alpha=0.3)

axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

In [None]:
# analyze what's ACTUALLY driving the predictions
section_rule = "="*60
print(section_rule)
print("WHAT'S REALLY PREDICTING THE FUTURE?")
print(section_rule)

# look at correlation between features and target in test set
test_target = test_full_data_clean['target']
feature_target_corr = [
    test_full_data_clean[col].corr(test_target)
    for col in forecast_feature_cols
]
test_correlations = (
    pd.DataFrame({
        'feature': forecast_feature_cols,
        'correlation_with_target': feature_target_corr,
    })
    .sort_values('correlation_with_target', ascending=False)
)

print("\nCorrelation with Next Month's Player Count:")
print(test_correlations)

# check autocorrelation: lag-1 correlation over the last 50 records of one sample game
print("\n" + "="*60)
print("AUTOCORRELATION TEST")
print("="*60)
sample_game = 'Counter-Strike 2'
game_data = (
    df.loc[df['name'] == sample_game]
    .sort_values('date')
    .loc[:, ['date', 'avg_players']]
    .tail(50)
    .assign(next_month=lambda d: d['avg_players'].shift(-1))
)
correlation = game_data['avg_players'].corr(game_data['next_month'])
print(f"\n{sample_game}:")
print(f"Correlation between this month and next month: {correlation:.4f}")
# the squared correlation is reported as the share explained
print(f"This means: {correlation**2:.2%} of next month's value is explained by this month!")

# show the "naive" baseline
print("\n" + "="*60)
print("NAIVE BASELINE COMPARISON")
print("="*60)
print("What if we just predicted: next_month = last_month?")
# NOTE(review): 'avg_players_lag_1' is the row BEFORE the feature row while the target is
# the row AFTER it, so this baseline carries forward a value two records older than the target.
naive_predictions = test_full_data_clean['avg_players_lag_1']
naive_mae = mean_absolute_error(y_test_viz, naive_predictions)
naive_rmse = np.sqrt(mean_squared_error(y_test_viz, naive_predictions))
naive_r2 = r2_score(y_test_viz, naive_predictions)

print("\nNaive Model (just use lag_1 as prediction):")
print(f"  MAE: {naive_mae:.2f}")
print(f"  RMSE: {naive_rmse:.2f}")
print(f"  R²: {naive_r2:.4f}")

rf_scores = forecast_results['Random Forest']
print("\nOur Random Forest Model:")
print(f"  MAE: {rf_scores['MAE']:.2f}")
print(f"  RMSE: {rf_scores['RMSE']:.2f}")
print(f"  R²: {rf_scores['R2']:.4f}")

# relative MAE reduction of the Random Forest versus the naive baseline, in percent
mae_improvement_pct = (naive_mae - rf_scores['MAE']) / naive_mae * 100
print("\nImprovement over naive baseline:")
print(f"  MAE improvement: {mae_improvement_pct:.2f}%")