In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence all warnings so they do not clutter the rendered notebook.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')

# set up plotting
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Asking for a style name that is not installed raises OSError, so fall back
# to the pre-3.6 name when the new one is not available.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette('husl')


## Load and Inspect Data

In [None]:
# Read the Steam monthly player CSV from the attached Kaggle dataset.
DATA_PATH = '/kaggle/input/steam-monthly-average-players/steamcharts.csv'
df = pd.read_csv(DATA_PATH)

# Report the (rows, columns) shape, then preview the first rows as the cell output.
print("Shape: {}".format(df.shape))
df.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Count the missing entries in each column (isna is the canonical name of isnull).
missing_per_column = df.isna().sum()
missing_per_column

## Data Cleaning

Need to fix a few things:
- Convert 'gain' column to numeric (currently object type)
- Parse the month column into proper datetime
- Check for any data quality issues

In [None]:
# Clean the raw columns. Both conversions use errors='coerce', which turns
# anything unparseable into NaN/NaT instead of raising, so the number of
# coerced values is counted and printed below rather than silently swallowed.

# fix gain column: non-numeric entries become NaN
gain_missing_before = df['gain'].isna().sum()
df['gain'] = pd.to_numeric(df['gain'], errors='coerce')
gain_coerced = df['gain'].isna().sum() - gain_missing_before

# convert month string to datetime; the format expects an abbreviated month
# name and a two-digit year (e.g. 'Jan-21'), anything else becomes NaT
df['date'] = pd.to_datetime(df['month'], format='%b-%y', errors='coerce')
date_unparsed = (df['date'].isna() & df['month'].notna()).sum()
df['year'] = df['date'].dt.year
df['month_num'] = df['date'].dt.month

# Rows with a NaT date are dropped by every later groupby('date'), so a wrong
# format guess would otherwise only show up as empty time-series plots.
print(f"gain values coerced to NaN: {gain_coerced:,}")
print(f"month values that failed to parse: {date_unparsed:,} of {len(df):,}")

## Exploratory Analysis

### Player Count Distribution

In [None]:
# Headline statistics of the monthly average player counts, two decimals each.
players = df['avg_players']
print("Average Players Statistics:")
for label, value in [
    ("Mean", players.mean()),
    ("Median", players.median()),
    ("Max", players.max()),
    ("Std", players.std()),
]:
    print(f"{label}: {value:.2f}")

In [None]:
# The distribution is probably very skewed, so print a spread of percentiles.
# All cut points are computed in one vectorised np.percentile call.
percentiles = [10, 25, 50, 75, 90, 95, 99]
percentile_values = np.percentile(df['avg_players'], percentiles)
for p, val in zip(percentiles, percentile_values):
    print(f"{p}th percentile: {val:.2f}")

In [None]:
# Two-panel view of the average-player distribution: raw values on the left,
# log10(x + 1) on the right so the long right tail becomes readable.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 5))
hist_style = dict(bins=50, edgecolor='black', alpha=0.7)

# histogram of the raw values
ax_raw.hist(df['avg_players'], **hist_style)
ax_raw.set(
    xlabel='Average Players',
    ylabel='Frequency',
    title='Distribution of Average Players',
)

# same data on a log scale; the +1 keeps zero-player rows finite
log_players = np.log10(df['avg_players'] + 1)
ax_log.hist(log_players, color='green', **hist_style)
ax_log.set(
    xlabel='Log10(Average Players + 1)',
    ylabel='Frequency',
    title='Distribution (Log Scale)',
)

fig.tight_layout()
plt.show()

In [None]:
# Horizontal box plot of the monthly average player counts.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
box_ax.boxplot(df['avg_players'], vert=False)
box_ax.set_xlabel('Average Players')
box_ax.set_title('Box Plot of Average Players')
plt.show()

The distribution is extremely right-skewed, with most games having very few players and a small number of games with massive player counts.

### Top Games

In [None]:
# Rank games by the mean of their monthly average player counts; keep the top 20.
mean_players_by_game = df.groupby('name')['avg_players'].mean()
ranked_games = mean_players_by_game.sort_values(ascending=False)
top_games = ranked_games.head(20)

print("Top 20 Games by Average Players:")
print(top_games)

In [None]:
# Horizontal bar chart of the 15 highest-ranked games.
fig, ax = plt.subplots(figsize=(10, 8))
top_games.head(15).plot(kind='barh', color='steelblue', ax=ax)
ax.set_xlabel('Average Players')
ax.set_title('Top 15 Games by Average Player Count')
# barh draws the first row at the bottom; flip the axis so rank 1 is on top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
# what about peak players? Rank games by their highest recorded monthly peak.
peak_by_game = df.groupby('name')['peak_players'].max()
top_peak = peak_by_game.sort_values(ascending=False).head(15)

print("\nTop 15 Games by Peak Players:")
print(top_peak)

### Time Series Analysis

Let's look at how player counts have evolved over time.

In [None]:
# Aggregate by month: sum the average and peak player counts across all games
# and count the distinct app ids that have a record in that month.
monthly_stats = df.groupby('date').agg(
    avg_players=('avg_players', 'sum'),
    peak_players=('peak_players', 'sum'),
    num_games=('steam_appid', 'nunique'),
)

monthly_stats.head()

In [None]:
# Stacked time-series panels: summed average players (top) and the number of
# distinct games with a record in each month (bottom).
fig, (ax_players, ax_games) = plt.subplots(2, 1, figsize=(14, 10))

ax_players.plot(monthly_stats.index, monthly_stats['avg_players'], linewidth=2)
ax_players.set(
    ylabel='Total Average Players',
    title='Total Average Players Over Time',
)
ax_players.grid(True, alpha=0.3)

ax_games.plot(monthly_stats.index, monthly_stats['num_games'], color='orange', linewidth=2)
ax_games.set(
    xlabel='Date',
    ylabel='Number of Games Tracked',
    title='Number of Games Tracked Over Time',
)
ax_games.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Player trends of a hand-picked set of popular titles; a title that has no
# rows in the data is skipped rather than drawn as an empty line.
games_to_plot = [
    'Counter-Strike 2',
    'PUBG: BATTLEGROUNDS',
    'Dota 2',
    'Team Fortress 2',
    'Grand Theft Auto V Legacy',
]

fig, ax = plt.subplots(figsize=(14, 8))
for game in games_to_plot:
    game_data = df[df['name'] == game].sort_values('date')
    if game_data.empty:
        continue
    ax.plot(game_data['date'], game_data['avg_players'], label=game, linewidth=2)

ax.set_xlabel('Date')
ax.set_ylabel('Average Players')
ax.set_title('Player Trends for Selected Popular Games')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### Growth and Decline Patterns

In [None]:
# look at gain/loss patterns
# 'gain' was converted with errors='coerce', so it can hold NaN. Every
# comparison against NaN is False, which means such rows fall into none of
# the growth / decline / no-change buckets. They are reported as their own
# category so the four shares account for every record and sum to 100%.
gain = df['gain']
gain_categories = {
    "positive growth": gain > 0,
    "decline": gain < 0,
    "no change": gain == 0,
    "missing gain": gain.isna(),
}
for label, mask in gain_categories.items():
    # mask.mean() is the share of all records, including those with missing gain
    print(f"Records with {label}: {mask.sum()} ({mask.mean()*100:.1f}%)")

In [None]:
# Histogram of month-over-month player change; rows without a gain value are
# dropped, and a dashed red line marks zero change.
valid_gain = df['gain'].dropna()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(valid_gain, bins=100, edgecolor='black', alpha=0.7)
ax.set_xlabel('Monthly Gain/Loss (players)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Monthly Player Gain/Loss')
ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='No change')
ax.legend()
plt.show()

In [None]:
# The ten largest single-month gains and the ten largest single-month losses.
report_columns = ['name', 'month', 'gain', 'avg_players']
biggest_gains = df.nlargest(10, 'gain')[report_columns]
biggest_losses = df.nsmallest(10, 'gain')[report_columns]

print("Biggest Monthly Gains:")
print(biggest_gains)

print("\nBiggest Monthly Losses:")
print(biggest_losses)

### Seasonality

In [None]:
# Mean of avg_players per calendar month, pooled over all years and games.
df['month_name'] = df['date'].dt.month_name()
month_order = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

# groupby sorts the month names alphabetically; reindex restores calendar order
monthly_avg = (
    df.groupby('month_name')['avg_players']
    .mean()
    .reindex(month_order)
)
print("Average Players by Month:")
print(monthly_avg)

In [None]:
# Bar chart of the per-calendar-month means, one bar per month in calendar order.
month_positions = range(12)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(month_positions, monthly_avg.values, color='teal', alpha=0.7, edgecolor='black')
ax.set_xticks(month_positions)
ax.set_xticklabels(month_order, rotation=45)
ax.set_ylabel('Average Players')
ax.set_title('Seasonal Pattern: Average Players by Month')
fig.tight_layout()
plt.show()

### Peak vs Average Analysis

In [None]:
# ratio of peak to average
# Divide by the real average rather than (average + 1): the +1 offset makes
# the result smaller than the true ratio, and the distortion is largest for
# the many records with only a handful of players. Rows whose average is zero
# have no defined ratio; they are masked to NaN, which mean() and median()
# skip, so no division by zero occurs.
nonzero_avg = df['avg_players'].where(df['avg_players'] > 0)
df['peak_to_avg_ratio'] = df['peak_players'] / nonzero_avg

print(f"Mean peak/avg ratio: {df['peak_to_avg_ratio'].mean():.2f}")
print(f"Median peak/avg ratio: {df['peak_to_avg_ratio'].median():.2f}")
print(f"Records without a defined ratio: {df['peak_to_avg_ratio'].isna().sum():,}")

In [None]:
# Histogram of the peak-to-average ratio; values above 20 are clipped to 20 so
# extreme outliers pile up in the last bin instead of stretching the axis.
capped_ratio = df['peak_to_avg_ratio'].clip(upper=20)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(capped_ratio, bins=50, edgecolor='black', alpha=0.7)
ax.set_xlabel('Peak to Average Ratio')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Peak to Average Player Ratio (capped at 20)')
plt.show()

In [None]:
# Log-log scatter of average vs peak players on a reproducible random sample
# of at most 10,000 rows (fixed seed), which keeps the plot light.
sample_size = min(10000, len(df))
sample = df.sample(sample_size, random_state=42)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sample['avg_players'], sample['peak_players'], alpha=0.3)
ax.set_xlabel('Average Players')
ax.set_ylabel('Peak Players')
ax.set_title('Average vs Peak Players')
ax.set_xscale('log')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
plt.show()

## Summary Statistics

In [None]:
# Text summary of the dataset: size, date coverage, headline player statistics
# and the five top-ranked games from the earlier top_games ranking.
banner = "=" * 60
first_date = df['date'].min()
last_date = df['date'].max()
span_years = (last_date - first_date).days / 365
players = df['avg_players']

print(banner)
print("DATASET SUMMARY")
print(banner)
print(f"Total records: {len(df):,}")
print(f"Unique games: {df['name'].nunique():,}")
print(f"Date range: {first_date.strftime('%Y-%m')} to {last_date.strftime('%Y-%m')}")
print(f"Time span: {span_years:.1f} years")
print()
print("Player Statistics:")
print(f"  Mean avg players: {players.mean():.0f}")
print(f"  Median avg players: {players.median():.0f}")
print(f"  Max avg players: {players.max():.0f}")
print(f"  Max peak players: {df['peak_players'].max():.0f}")
print()
print("Top 5 Games (by avg):")
for rank, (game, avg) in enumerate(top_games.head(5).items(), start=1):
    print(f"  {rank}. {game}: {avg:.0f}")
print(banner)

## Key Findings

1. **Highly Skewed Distribution**: The vast majority of games have very low player counts, with a small number of blockbuster titles dominating the platform. The median game-month record (one game in one month) has only ~11 average players.

2. **Top Performers**: Counter-Strike 2, PUBG, and GTA V are the clear leaders with player counts in the hundreds of thousands.

3. **Platform Growth**: The number of games tracked on Steam has grown significantly from 2012 to present, though there's been some decline in recent years.

4. **Seasonality**: There appears to be some seasonal variation, with January showing higher average player counts, possibly due to holiday gaming and new year releases.

5. **Volatility**: Most game-months show declining player counts (53%) vs growth (45%), suggesting high churn and the difficulty of maintaining a playerbase.

6. **Peak Behavior**: Peak player counts typically run 2-3x the average, showing significant variance in concurrent players throughout the day/week.