<a href="https://colab.research.google.com/github/brandonweber2022/INST-414-Project/blob/main/Sprint_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)

In [None]:
# Read the 2022-23 player stats + salary table from the working directory.
df = pd.read_csv(filepath_or_buffer="nba_2022-23_all_stats_with_salary.csv")

# Structural overview: dimensions, per-column dtypes / non-null counts, then
# the first rows as the cell's displayed output.
print("Shape:", df.shape)
df.info()
df.head()

In [None]:
# Null count per column, largest first; only columns with at least one null
# are printed.
null_mask = df.isnull()
missing = null_mask.sum().sort_values(ascending=False)
print(missing[missing > 0])

# Render the boolean null mask as an image (rows x columns) to expose any
# pattern in where values are missing.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(null_mask, aspect="auto", interpolation="nearest")
ax.set_title("Missing Data Pattern")
ax.set_xlabel("Columns")
ax.set_ylabel("Rows")
plt.show()

In [None]:
# Remove leftover index columns if they exist; absent columns are skipped
# instead of raising.
df = df.drop(columns=["Unnamed: 0", "index"], errors="ignore")

# Standardize multi-team labels: keep only the first team of "A/B" entries.
# astype(str) alone would turn a missing team into the literal string "nan",
# which would then appear as a bogus team in any later groupby("Team"), so
# originally-missing values are masked back to NaN after the string cleanup.
team_missing = df["Team"].isna()
df["Team"] = (
    df["Team"].astype(str).str.split("/").str[0].str.strip().mask(team_missing)
)

# Add salary in millions
df["Salary_M"] = df["Salary"] / 1_000_000

# `df` keeps the NaNs in the shooting-rate columns; `df_filled` is a separate
# copy in which those NaNs are replaced with 0.
# NOTE(review): presumably the NaNs mark players with no attempts — confirm
# before treating them as 0 in any analysis that uses `df_filled`.
shoot_cols = ["FT%", "3P%", "2P%", "TS%", "3PAr", "FTr", "eFG%", "FG%"]
df_filled = df.copy()
df_filled[shoot_cols] = df_filled[shoot_cols].fillna(0)

# Check duplicates: count rows whose player name already appeared earlier.
dup_players = df["Player Name"].duplicated().sum()
print("Duplicate player names:", dup_players)

In [None]:
# Ratio features. A zero denominator is swapped for NaN first so the result
# is NaN rather than +/-inf.
ws_nonzero = df["WS"].replace(0, np.nan)
salary_m_nonzero = df["Salary_M"].replace(0, np.nan)

df["Salary_per_WS"] = df["Salary"].div(ws_nonzero)
df["PTS_per_Million"] = df["PTS"].div(salary_m_nonzero)

# log(1 + salary): compresses the long right tail while staying defined at 0.
df["log_Salary"] = np.log1p(df["Salary"])

df.head(n=3)

In [None]:
# Variables inspected one at a time: summary statistics first, then one
# 30-bin histogram per variable (nulls dropped before plotting).
univariate_vars = ["Salary_M", "PTS", "PER", "WS", "BPM", "Age", "log_Salary"]
print(df.loc[:, univariate_vars].describe().transpose())

for col in univariate_vars:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# (x, y) column pairs: each performance metric against salary in millions.
pairs = [
    ("PER", "Salary_M"),
    ("WS", "Salary_M"),
    ("PTS", "Salary_M"),
    ("BPM", "Salary_M"),
]

# One scatter plot per pair; alpha < 1 keeps overlapping points readable.
for x_col, y_col in pairs:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df[x_col], df[y_col], alpha=0.6)
    ax.set_title(f"{y_col} vs {x_col}")
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    plt.show()

In [None]:
# Pairwise correlation between salary and the main performance metrics.
corr_vars = ["Salary_M", "PTS", "PER", "WS", "BPM", "VORP", "USG%", "Age"]
corr = df[corr_vars].corr()

plt.figure(figsize=(8,6))
# Pin the colour scale to the full correlation range [-1, 1]. Without
# vmin/vmax the scale is fitted to the observed min/max, so the neutral
# midpoint of the diverging "coolwarm" map would not correspond to 0.
plt.imshow(corr, cmap="coolwarm", interpolation="nearest", vmin=-1, vmax=1)
plt.xticks(range(len(corr_vars)), corr_vars, rotation=45)
plt.yticks(range(len(corr_vars)), corr_vars)
plt.title("Correlation Heatmap: Salary vs Performance")
plt.colorbar()
plt.show()

# Show the numeric correlation table as the cell output.
corr

In [None]:
def find_outliers_iqr(df, column):
    """Flag rows whose `column` value lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : DataFrame containing `column`.
    column : name of the column to screen.

    Returns
    -------
    (outliers, lower, upper) : the rows of `df` strictly below `lower` or
        strictly above `upper`, followed by the two fence values.

    Also prints the number of flagged rows.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower = first_quartile - 1.5 * spread
    upper = third_quartile + 1.5 * spread

    # Strict comparisons: values exactly on a fence are not flagged.
    beyond_fences = (values < lower) | (values > upper)
    outliers = df[beyond_fences]

    print(f"{column}: {len(outliers)} outliers detected")
    return outliers, lower, upper

# Salary: keep the flagged rows together with both fence values.
outliers_salary, low_s, high_s = find_outliers_iqr(df, column="Salary_M")

# PER and WS: only the flagged rows are kept; the fences are discarded.
outliers_per, _, _ = find_outliers_iqr(df, column="PER")
outliers_ws, _, _ = find_outliers_iqr(df, column="WS")

# Preview salary outliers (likely superstars)
preview_cols = ["Player Name", "Team", "Salary_M", "PTS", "PER", "WS"]
outliers_salary[preview_cols].head(10)

In [None]:
# Descriptive statistics of salary (in millions) for each position.
salary_by_position = df.groupby("Position")["Salary_M"]
pos_summary = salary_by_position.describe()
print(pos_summary)

# One salary sample per position, in the same order as the summary index,
# so each box lines up with its label.
data_by_pos = [
    df[df["Position"] == position]["Salary_M"].dropna()
    for position in pos_summary.index
]
plt.boxplot(data_by_pos, labels=pos_summary.index)
plt.title("Salary Distribution by Position")
plt.xlabel("Position")
plt.ylabel("Salary (in millions)")
plt.show()

In [None]:
# Mean salary (in millions) per team, highest-paying team first.
mean_salary_per_team = df.groupby("Team")["Salary_M"].mean()
team_salary = mean_salary_per_team.sort_values(ascending=False)

plt.figure(figsize=(12, 5))
plt.bar(x=team_salary.index, height=team_salary.values)
# Vertical tick labels so the team names do not overlap.
plt.xticks(rotation=90)
plt.title("Average Salary by Team (2022-23 Season)")
plt.xlabel("Team")
plt.ylabel("Average Salary (in millions)")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Work on a copy so the exploratory frame `df` is left untouched.
data = df.copy()

# Rows without a target value cannot be used for supervised learning.
data = data.dropna(subset=["Salary_M"])

# Columns that are the target itself or were computed directly from it in
# earlier cells (Salary_M = Salary / 1e6, log_Salary = log1p(Salary),
# Salary_per_WS = Salary / WS, PTS_per_Million = PTS / Salary_M). Leaving any
# of them in X lets the models read the answer off the features (target
# leakage) and makes every reported metric meaningless.
leakage_cols = {
    "Salary_M",
    "Salary",
    "log_Salary",
    "Salary_per_WS",
    "PTS_per_Million",
}

# Features: every remaining numeric column.
numeric_cols = [
    col
    for col in data.select_dtypes(include=[np.number]).columns
    if col not in leakage_cols
]

X = data[numeric_cols]
y = data["Salary_M"]

# Impute missing feature values with the column median.
# NOTE(review): the medians are computed over all rows, before the
# train/test split performed in a later cell; fitting the imputer on the
# training rows only would be stricter.
X = X.fillna(X.median(numeric_only=True))

print("Number of features:", len(numeric_cols))
print("First 5 feature columns:", numeric_cols[:5])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


In [None]:
from sklearn.dummy import DummyRegressor
import time

# Baseline: always predicts the mean of the training target. Any real model
# has to beat these numbers to be worth using.
baseline = DummyRegressor(strategy="mean")

# Time the fit like the other models do; the holdout comparison table reads
# `baseline_train_time`, which was previously never defined.
start = time.time()
baseline.fit(X_train, y_train)
baseline_train_time = time.time() - start

y_pred_base = baseline.predict(X_test)

mae_base = mean_absolute_error(y_test, y_pred_base)
# mean_squared_error returns the MSE; take the square root so the value
# reported as RMSE really is an RMSE (same units as the target).
rmse_base = np.sqrt(mean_squared_error(y_test, y_pred_base))
r2_base = r2_score(y_test, y_pred_base)

print("\n=== Baseline (Mean) ===")
print(f"MAE:  {mae_base:.3f}")
print(f"RMSE: {rmse_base:.3f}")
print(f"R²:   {r2_base:.3f}")
print(f"Train time (s): {baseline_train_time:.4f}")

In [None]:
# Ordinary least squares on the training split, timed for the comparison table.
lin_reg = LinearRegression()
start = time.time()
lin_reg.fit(X_train, y_train)
lin_train_time = time.time() - start

y_pred_lr = lin_reg.predict(X_test)

mae_lr = mean_absolute_error(y_test, y_pred_lr)
# mean_squared_error returns the MSE; take the square root so the value
# reported as RMSE really is an RMSE (same units as the target).
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

# Section header added for consistency with the other model cells.
print("\n=== Linear Regression ===")
print(f"MAE:  {mae_lr:.3f}")
print(f"RMSE: {rmse_lr:.3f}")
print(f"R²:   {r2_lr:.3f}")
print(f"Train time (s): {lin_train_time:.4f}")

In [None]:
# Random forest with 300 fully grown trees; the fixed seed keeps results
# reproducible and n_jobs=-1 uses all available cores.
rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1
)

start = time.time()
rf_reg.fit(X_train, y_train)
rf_train_time = time.time() - start

y_pred_rf = rf_reg.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
# mean_squared_error returns the MSE; take the square root so the value
# reported as RMSE really is an RMSE (same units as the target).
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("\n=== Random Forest ===")
print(f"MAE:  {mae_rf:.3f}")
print(f"RMSE: {rmse_rf:.3f}")
print(f"R²:   {r2_rf:.3f}")
print(f"Train time (s): {rf_train_time:.4f}")

In [None]:
# Side-by-side holdout results. Each list is ordered baseline, linear
# regression, random forest, so position i in every column describes the
# same model.
holdout_comparison = pd.DataFrame({
    "Model": ["Baseline (Mean)", "Linear Regression", "Random Forest"],
    "MAE": [mae_base, mae_lr, mae_rf],
    "RMSE": [rmse_base, rmse_lr, rmse_rf],
    "R2": [r2_base, r2_lr, r2_rf],
    "Train_Time_s": [baseline_train_time, lin_train_time, rf_train_time],
    "Interpretability": [
        "Very High (trivial)",
        "High (coefficients)",
        "Medium (feature importance)",
    ],
})

print("\n=== Holdout Performance Comparison ===")
# Lowest error first.
display(holdout_comparison.sort_values(by="RMSE"))

In [None]:
# KFold and cross_validate are used in this cell but were never imported by
# any earlier cell, so a fresh "Restart & Run All" failed with NameError.
from sklearn.model_selection import KFold, cross_validate

# 5-fold cross-validation with shuffling; the fixed seed gives every model
# the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn scorers follow "greater is better", so the error metrics are
# requested in their negated form and must be sign-flipped when reported.
scoring = {
    "rmse": "neg_root_mean_squared_error",
    "mae": "neg_mean_absolute_error",
    "r2": "r2"
}

def cv_summary(name, model):
    """Cross-validate `model` on the notebook-level X / y and summarise it.

    Uses the notebook-level `kfold` splitter and `scoring` mapping.

    Parameters
    ----------
    name : label stored under the "Model" key of the result.
    model : estimator passed to `cross_validate`.

    Returns
    -------
    dict with "Model" plus the mean and standard deviation across folds of
    MAE, RMSE and R2 (keys "CV_<metric>_Mean" / "CV_<metric>_Std").
    """
    fold_scores = cross_validate(
        estimator=model,
        X=X,
        y=y,
        cv=kfold,
        scoring=scoring,
        return_train_score=False,
        n_jobs=-1,
    )

    # The error metrics are requested through "neg_*" scorers, so flip their
    # sign to report positive errors; R2 is used as returned.
    per_metric = {
        "MAE": -fold_scores["test_mae"],
        "RMSE": -fold_scores["test_rmse"],
        "R2": fold_scores["test_r2"],
    }

    summary = {"Model": name}
    for metric, scores in per_metric.items():
        summary[f"CV_{metric}_Mean"] = scores.mean()
        summary[f"CV_{metric}_Std"] = scores.std()
    return summary