In [2]:
# --- Libraries ---
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# --- Load & Clean Data ---
df = pd.read_csv("All_Seasons_Clean.csv")

# Columns removed before modelling. Column names are case-sensitive, so the
# mixed-case 'Salary' / 'Age' / 'Min' listed here are different columns from the
# upper-case SALARY / AGE / MIN that the rest of the cell keeps using.
drop_cols = ['UNNAMED:_0', 'Unnamed:_0', 'DD2', 'TD3', '3P','3PA', 'Salary', 'Age', 'Min', '+/-', 'Id', 'REB']

# Drop only the columns that actually exist in this file; 'Player_Comp_Share'
# is removed in the same pass when present.
present_drop_cols = [c for c in (*drop_cols, 'Player_Comp_Share') if c in df.columns]
df = df.drop(columns=present_drop_cols)

# Keep only rows that have a salary value (SALARY is the regression target below)
df = df.dropna(subset=['SALARY']).reset_index(drop=True)

# --- Feature Setup ---
# Every numeric column except the target is used as a feature; missing values become 0.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove('SALARY')
X = df[numeric_cols].fillna(0)
y = df['SALARY']

# --- Split Data ---
# Split BEFORE scaling so the scaler never sees the held-out rows. The row
# partition depends only on the number of rows and random_state, so it is the
# same partition as splitting an already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Scaling ---
# Fit on the training rows only, then apply the same transform everywhere.
# Fitting on the full matrix would leak test-set means/variances into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in the same scaled space; used later to predict for all players.
X_scaled = scaler.transform(X)

# --- XGBoost Model ---
# Hyperparameters are gathered in one mapping so they can be inspected or logged
# separately from the estimator itself.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,  # fixed seed for repeatable runs
    n_jobs=-1,
)

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# --- Evaluate ---
# Score the model on the held-out split only.
y_pred_test = xgb.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"XGBoost RMSE: {test_rmse:.0f}")
print(f"XGBoost R²: {test_r2:.3f}")

# --- Feature Importance ---
# Pair each feature name with the model's importance score, largest first.
feat_imp = (
    pd.DataFrame({"feature": numeric_cols, "importance": xgb.feature_importances_})
    .sort_values(by="importance", ascending=False)
)

top_features = feat_imp.head(15)
print("\nTop XGBoost Feature Importances:")
print(top_features.to_string(index=False))

# --- Predict Salaries for All Players ---
# X_scaled holds every row, so these predictions cover the rows the model was
# trained on as well as the held-out rows.
predicted_salary = xgb.predict(X_scaled)
df['Predicted_Salary'] = predicted_salary
# Positive difference = actual salary is higher than the model's prediction.
df['Salary_Diff'] = df['SALARY'] - df['Predicted_Salary']

# --- Compute Performance Index (0-100 Percentile Based on Predicted Salary) ---
salary_percentile = df['Predicted_Salary'].rank(pct=True)
df['Performance_Index'] = (salary_percentile * 100).round(2)

# --- Create 10 Performance Tiers (Tier 1 = Best, Tier 10 = Lowest) ---
df = df.sort_values(by='Predicted_Salary', ascending=False).reset_index(drop=True)

# qcut with labels=False numbers the deciles 0-9 from the LOWEST to the highest
# predicted salary; subtracting from 10 flips that so Tier 1 holds the highest
# predictions and Tier 10 the lowest.
salary_decile = pd.qcut(df['Predicted_Salary'], q=10, labels=False)
df['Tier'] = 10 - salary_decile

# --- Salary Ranges for Each Tier ---
# Per-tier summary of the predicted salaries (not the actual ones).
tier_salary_ranges = (
    df.groupby('Tier')['Predicted_Salary']
    .agg(['min', 'max', 'mean', 'count'])
    .round(0)
    .rename(columns={'min': 'Min_Salary', 'max': 'Max_Salary', 'mean': 'Avg_Salary', 'count': 'Num_Players'})
)
print("\n--- Salary Ranges by Tier ---")
print(tier_salary_ranges)

# --- Add Under/Overpaid Indicator ---
# Salary_Diff is SALARY - Predicted_Salary, so a positive value means the player's
# actual salary is ABOVE what the model predicts (overpaid); otherwise the actual
# salary is at or below the prediction (underpaid).
df['Payment_Status'] = np.where(df['Salary_Diff'] > 0, 'Overpaid', 'Underpaid')

# --- Save Detailed Player Output (Including Season and Performance Index) ---
# One column list drives both the CSV export and the on-screen sample so the
# two can never drift apart.
report_cols = ['PLAYER', 'Season', 'Tier', 'Performance_Index', 'SALARY', 'Predicted_Salary', 'Salary_Diff', 'Payment_Status']
player_report = df[report_cols]

player_report.to_csv("Player_Tiers_And_Salary_Analysis.csv", index=False)
print("\n✅ Analysis saved to 'Player_Tiers_And_Salary_Analysis.csv'")

# --- Optional: Display sample player results ---
# df is sorted by Predicted_Salary descending at this point, so these are the top rows.
print("\n--- Sample Player Salary Analysis ---")
print(player_report.head(10).to_string(index=False))


XGBoost RMSE: 2930740
XGBoost R²: 0.593

Top XGBoost Feature Importances:
feature  importance
     FP    0.379349
    AGE    0.080611
    FGM    0.071855
    FTA    0.042036
    FGA    0.038424
   DREB    0.031454
    BLK    0.028346
    PTS    0.027907
    TOV    0.027045
      L    0.023641
     PF    0.023190
    STL    0.023018
    FT%    0.022908
  ERROR    0.022655
    MIN    0.022253

--- Salary Ranges by Tier ---
      Min_Salary  Max_Salary  Avg_Salary  Num_Players
Tier                                                 
1     10757597.0  29270268.0  14269432.0          528
2      6881777.0  10715908.0   8577825.0          527
3      4842902.0   6878850.0   5758034.0          527
4      3681079.0   4840014.0   4236556.0          528
5      2855777.0   3674198.0   3244025.0          527
6      2227702.0   2855328.0   2545370.0          527
7      1735319.0   2227686.0   1974551.0          528
8      1258071.0   1733752.0   1490483.0          527
9       872292.0   1256156.0   1068

In [14]:
# --- Libraries ---
import pandas as pd
import numpy as np

# --- Load WNBA Data ---
wnba = pd.read_csv("Full WNBA Dataset.csv")

# --- Fill missing numeric values with 0 ---
# Only the stat columns that are weighted further down are filled; any of them
# absent from the file are simply skipped.
numeric_cols = ['FP','AGE','FGM','FGA','FTA','DREB','BLK','PTS','TOV','L','PF','STL','FT%','ERROR','MIN']
fillable_cols = [name for name in numeric_cols if name in wnba.columns]
for name in fillable_cols:
    wnba[name] = wnba[name].fillna(0)

# --- Feature Importances from XGBoost (already provided) ---
# Hard-coded weights; the values match the top-15 importances printed by the
# salary-model cell earlier in this notebook.
feat_importance = {
    'FP': 0.379349,
    'AGE': 0.080611,
    'FGM': 0.071855,
    'FTA': 0.042036,
    'FGA': 0.038424,
    'DREB': 0.031454,
    'BLK': 0.028346,
    'PTS': 0.027907,
    'TOV': 0.027045,
    'L': 0.023641,
    'PF': 0.023190,
    'STL': 0.023018,
    'FT%': 0.022908,
    'ERROR': 0.022655,
    'MIN': 0.022253
}

# --- Compute Weighted Performance Index ---
# Raw index = sum over features of (column value * weight).
# NOTE(review): columns are combined on their raw scales, so large-magnitude stats
# dominate the sum, and every weight is positive, so every stat raises the index
# (including e.g. TOV / PF / L). Confirm this is the intended methodology.
available_features = [f for f in feat_importance if f in wnba.columns]
missing_features = [f for f in feat_importance if f not in wnba.columns]

if not available_features:
    # Without any weighted column the index would be all zeros and normalise to NaN.
    raise ValueError(
        "None of the weighted features are present in the WNBA dataset; "
        "cannot compute Performance_Index."
    )
if missing_features:
    # Still best-effort, but make the reduced feature set visible to the reader.
    print(f"⚠️ Features missing from WNBA data and skipped: {missing_features}")

wnba['Performance_Index_Raw'] = 0.0
for feature in available_features:
    wnba['Performance_Index_Raw'] += wnba[feature] * feat_importance[feature]

# --- Normalize to 0-100 ---
# Min-max scaling: the lowest raw score maps to 0 and the highest to 100.
raw_min = wnba['Performance_Index_Raw'].min()
raw_range = wnba['Performance_Index_Raw'].max() - raw_min
if raw_range == 0:
    # All players have the same raw score; avoid 0/0 and give everyone 0.
    wnba['Performance_Index'] = 0.0
else:
    wnba['Performance_Index'] = 100 * (wnba['Performance_Index_Raw'] - raw_min) / raw_range
wnba['Performance_Index'] = wnba['Performance_Index'].round(2)

# --- Create 10 Performance Tiers (Tier 1 = Best) ---
wnba = wnba.sort_values(by='Performance_Index', ascending=False).reset_index(drop=True)
# qcut with labels=False numbers the deciles 0 (lowest index) through 9 (highest);
# subtracting from 10 flips the numbering so Tier 1 is the highest decile.
performance_decile = pd.qcut(wnba['Performance_Index'], q=10, labels=False)
wnba['Tier'] = 10 - performance_decile

# --- Save Results ---
# The same four columns are written to disk and previewed below.
index_cols = ['PLAYER','Season','Tier','Performance_Index']
wnba_report = wnba[index_cols]
wnba_report.to_csv("WNBA_Performance_Index.csv", index=False)

print("✅ WNBA Performance Index computed and saved!")
# wnba is sorted by Performance_Index descending, so this shows the top 20 rows.
print(wnba_report.head(20).to_string(index=False))


✅ WNBA Performance Index computed and saved!
          PLAYER  Season  Tier  Performance_Index
     a'ja wilson    2024     1             100.00
     jewell loyd    2023     1              93.74
arike ogunbowale    2024     1              92.65
     a'ja wilson    2025     1              91.55
 breanna stewart    2023     1              90.24
    tina charles    2021     1              90.10
napheesa collier    2025     1              89.69
napheesa collier    2023     1              87.66
arike ogunbowale    2023     1              87.62
     a'ja wilson    2023     1              87.34
 breanna stewart    2021     1              86.14
   caitlin clark    2024     1              85.71
napheesa collier    2024     1              85.39
 brittney griner    2021     1              85.19
 breanna stewart    2024     1              85.16
     jewell loyd    2024     1              84.85
 breanna stewart    2022     1              84.70
     a'ja wilson    2021     1              83.40
  nne