Basic Summary and Data Cleaning

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the processed stats file inspected by this summary cell.
df = pd.read_csv("../data/processed/indiv_stats_avg.csv")

print(df.columns)

# Boolean mask of null cells, computed once and reused for the counts and the plot.
null_mask = df.isna()

# Per-column null counts; only columns that actually contain nulls are printed.
missing_values = null_mask.sum()
print("\nMissing Values:\n", missing_values[missing_values > 0])

# Summary statistics (pandas' default `describe()` output).
print(df.describe())

# Heatmap of the null mask: rows are records, columns are fields.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

1️⃣ Identify Key Differences Between All-Stars and Non-All-Stars from 2011 - 2024


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the processed stats file used for the All-Star vs. non-All-Star comparison.
df = pd.read_csv("../data/processed/indiv_stats_avg.csv")


# The nine stats I consider most crucial when evaluating an All-Star selection.
crucial_stats = [
    'points', 'fieldGoalsPercentage', 'threePointersMade', 'assists',
    'plusMinusPoints', 'steals', 'blocks', 'reboundsTotal', 'minutes'
]


# One histogram (with a KDE overlay) per stat, coloured by the `allStar` column.
# The loop variable is named `stat` on purpose: reusing `crucial_stats` as the
# loop variable would rebind the list to its last element once the loop ends,
# breaking any later code that expects the full list.
plt.figure(figsize=(15, 12))
for i, stat in enumerate(crucial_stats, 1):
    plt.subplot(3, 3, i)  # 3x3 grid, one panel per stat
    sns.histplot(df, x=stat, hue="allStar", kde=True, bins=30)
    plt.title(f"Distribution of {stat}")

plt.tight_layout()
plt.show()

2️⃣ Visualize Season-by-Season Trends in Points, Plus-Minus, and Other Key Stats (All-Stars vs. Non-All-Stars)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the combined box-score file; this cell needs its `season_year` column.
df = pd.read_csv("../data/processed/combined_nba_boxscores.csv")


# The nine stats I consider most crucial when evaluating an All-Star selection.
crucial_stats = [
    'points', 'fieldGoalsPercentage', 'threePointersMade', 'assists',
    'plusMinusPoints', 'steals', 'blocks', 'reboundsTotal', 'turnovers'
]

# Split rows by the `allStar` flag, then average each stat per season.
all_star_df = df[df["allStar"] == 1]
non_all_star_df = df[df["allStar"] == 0]

all_star_season_stats = all_star_df.groupby("season_year")[crucial_stats].mean().reset_index()
non_all_star_season_stats = non_all_star_df.groupby("season_year")[crucial_stats].mean().reset_index()


# One panel per stat, with both groups on the same axes for direct comparison.
# The loop variable is named `stat` on purpose: reusing `crucial_stats` as the
# loop variable would rebind the list to its last element once the loop ends.
plt.figure(figsize=(18, 18))
for i, stat in enumerate(crucial_stats, 1):
    plt.subplot(3, 3, i)  # 3x3 grid, one panel per stat
    sns.lineplot(x=all_star_season_stats["season_year"], y=all_star_season_stats[stat], label="All Star")
    sns.lineplot(x=non_all_star_season_stats["season_year"], y=non_all_star_season_stats[stat], label="Non-All Star")
    plt.title(f"Player Performance Trend for {stat}")
    plt.xlabel("Season Year")
    plt.ylabel("Average Statistic")

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

3️⃣ Create Correlation Heatmaps

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the processed stats file for the correlation analysis.
df = pd.read_csv("../data/processed/indiv_stats_avg.csv")

# Keep only the numeric columns for the correlation computation.
df_numeric = df.select_dtypes(include="number")

# Pairwise correlations between all numeric columns.
corr_matrix = df_numeric.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

4️⃣ Feature Importance Plots (Using SHAP)

In [None]:
import shap
import joblib
import numpy as np

# Load the trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
model = joblib.load("../models/allstar_model_new.pkl")


# Feature names exposed by the model (presumably a scikit-learn-style estimator
# that was fitted on a DataFrame; confirm if the model type changes).
X_train_columns = model.feature_names_in_


# Build X from the processed dataset instead of relying on an `X` left over in
# the kernel: no cell in this notebook defines `X`, so reading it first raises
# NameError on a fresh "Restart & Run All".
df_original = pd.read_csv("../data/processed/indiv_stats_avg.csv")

# Fail early with a readable message if the dataset lacks any model feature.
missing_features = set(X_train_columns) - set(df_original.columns)
if missing_features:
    raise KeyError(
        f"Dataset is missing features required by the model: {sorted(missing_features)}"
    )

# Select exactly the model's features, in the model's own column order.
X = df_original[X_train_columns]


# Initialize the SHAP tree explainer for the loaded model.
explainer = shap.TreeExplainer(model)


# Compute SHAP values; the additivity check is disabled on this call.
shap_values = explainer.shap_values(X, check_additivity=False)


# Select the SHAP values for class 1 (All-Star). The return layout depends on
# the shap version and model type, so handle the known forms:
#   - list of per-class 2-D arrays          -> take element 1
#   - 3-D array (samples, features, classes) -> slice the last axis at 1
#   - single 2-D array (samples, features)   -> use as-is
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        shap_values_class1 = shap_values[:, :, 1]
    else:
        shap_values_class1 = shap_values

# Summary plot of per-feature contributions for class 1.
shap.summary_plot(shap_values_class1, X, max_display = 30)

