# In [None]:
# 📦 Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 📥 Load player per-game stats from Basketball Reference
url_players = "https://www.basketball-reference.com/leagues/NBA_2024_per_game.html"
player_tables = pd.read_html(url_players)
# NOTE(review): index 0 is simply the first table pandas parsed from the page;
# confirm it is still the per-game table if the page layout changes.
player_stats = player_tables[0]

# Clean player stats
# Drop rows whose 'Rk' cell holds the literal text 'Rk' (header rows repeated
# inside the table body), then discard the rank column itself.
player_stats = player_stats[player_stats['Rk'] != 'Rk']
player_stats = player_stats.drop(columns=['Rk'])


def _to_numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged if parsing fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Columns that are not fully numeric are kept exactly as they are.
        return column


# pd.to_numeric(..., errors='ignore') is deprecated (pandas 2.2+); catching the
# conversion error explicitly keeps the same "leave it alone on failure" result.
player_stats = player_stats.apply(_to_numeric_if_possible)

# 📥 Load team stats
url_teams = "https://www.basketball-reference.com/leagues/NBA_2024.html"
team_tables = pd.read_html(io=url_teams)
# NOTE(review): this keeps only the first table parsed from the page; confirm
# that table is the one the later team plots are meant to use.
team_stats = team_tables[0]

# Show basic info: (rows, columns) of each frame
for label, frame in (
    ("Player stats shape:", player_stats),
    ("Team stats shape:", team_stats),
):
    print(label, frame.shape)

# 💥 Exploratory Data Analysis (EDA)
# Summary statistics as produced by DataFrame.describe()
summary_table = player_stats.describe()
print(summary_table)

# Top scorers: order by points (highest first) and keep the first ten rows
ranked_by_points = player_stats.sort_values(by='PTS', ascending=False)
top_scorers = ranked_by_points.iloc[:10]
scorer_columns = ['Player', 'PTS', 'AST', 'TRB']
print("Top 10 scorers:\n", top_scorers[scorer_columns])

# Correlation matrix on numeric columns only
numeric_cols = player_stats.select_dtypes(include='number')
correlation_matrix = numeric_cols.corr()

plt.figure(figsize=(10, 8))
# annot=True writes each coefficient into its cell
sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix of Player Stats")
plt.show()

print(team_stats.columns)

# Points vs Wins (using team stats if available)
# The scatter plot reads both 'PS/G' and 'W', so both must be present; checking
# only 'W' would let a missing 'PS/G' column raise inside seaborn.
required_team_cols = {'PS/G', 'W'}
if required_team_cols.issubset(team_stats.columns):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x='PS/G', y='W', data=team_stats)
    plt.title("Team Points vs Wins")
    plt.xlabel("Points Per Game")
    plt.ylabel("Wins")
    plt.show()
else:
    # Report why the plot was skipped instead of silently doing nothing.
    missing_team_cols = sorted(required_team_cols.difference(team_stats.columns))
    print("Skipping Team Points vs Wins plot; missing columns:", missing_team_cols)


# 💡 Classify players by scoring tier
# Make sure PTS is numeric; anything unparsable becomes NaN and gets no tier.
player_stats['PTS'] = pd.to_numeric(player_stats['PTS'], errors='coerce')

# Tiers by points per game, using left-closed bins (right=False):
#   Average [0, 10), Good [10, 18), Star [18, 25), Elite [25, inf)
# The top bin is open-ended so every value >= 25 is 'Elite' rather than
# falling outside the bins and becoming NaN.
tier_bins = [0, 10, 18, 25, float('inf')]
tier_labels = ['Average', 'Good', 'Star', 'Elite']
player_stats['Performance'] = pd.cut(
    player_stats['PTS'],
    bins=tier_bins,
    labels=tier_labels,
    right=False,
)

# Count players by performance level
tier_column = player_stats['Performance']
performance_counts = tier_column.value_counts()
print("Player counts by performance level:\n", performance_counts)

# 📊 Visualize performance levels, bars ordered from highest tier to lowest
tier_display_order = ['Elite', 'Star', 'Good', 'Average']
plt.figure(figsize=(8, 6))
tier_axes = sns.countplot(data=player_stats, x='Performance', order=tier_display_order)
tier_axes.set_title("Player Performance Levels")
plt.show()

# 📦 Save cleaned dataset (optional)
# index=False keeps the DataFrame index out of the written file.
cleaned_csv_path = "nba2024_cleaned.csv"
player_stats.to_csv(path_or_buf=cleaned_csv_path, index=False)
