In [None]:
# XGBoost Regressor for WNBA dataset

# --- Libraries ---
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# --- Load & Clean Data ---
df = pd.read_csv("Full WNBA Dataset.csv")

# Discard 'Player_Comp_Share' when the file provides it (no-op otherwise)
df = df.drop(columns=['Player_Comp_Share'], errors='ignore')

# Make 'SALARY' the single salary column: keep its own values and fall back to
# 'Salary' where it is missing, or copy 'Salary' outright when 'SALARY' is absent
has_upper_salary = 'SALARY' in df.columns
has_mixed_salary = 'Salary' in df.columns
if has_upper_salary and has_mixed_salary:
    df['SALARY'] = df['SALARY'].combine_first(df['Salary'])
elif has_mixed_salary:
    df['SALARY'] = df['Salary']

# Keep only rows that have a salary and renumber the index from 0
df = df.dropna(subset=['SALARY']).reset_index(drop=True)

# --- Feature Setup ---
# Every numeric column is a candidate feature except the salary target itself.
# 'Salary' is excluded alongside 'SALARY': when the CSV carries both spellings,
# the salary consolidation earlier in this cell leaves 'Salary' in the frame,
# and keeping it as a feature would leak the target into both the salary model
# and the Performance Index.
non_feature_cols = {'SALARY', 'Salary', 'Player_Comp_Share'}
numeric_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in non_feature_cols
]

# --- Compute Performance Index Based on Metrics ---
performance_cols = list(numeric_cols)
# Standardize every metric column (missing values are filled with 0 first)
perf_scaled = StandardScaler().fit_transform(df[performance_cols].fillna(0))
# Row-wise mean of the standardized metrics, converted to a percentile rank
# scaled to 0-100 and rounded to two decimals
mean_scaled_metric = pd.Series(perf_scaled.mean(axis=1), index=df.index)
df['Performance_Index'] = mean_scaled_metric.rank(pct=True).mul(100).round(2)

# --- Dynamic Tier Assignment Based on Performance Index ---
# KMeans is fit on a single-column matrix holding the Performance Index
perf_values = df['Performance_Index'].to_numpy().reshape(-1, 1)

# Elbow method: record the inertia of a KMeans fit for every k from 2 to 10
K_range = range(2, 11)
inertia = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(perf_values)
    inertia.append(kmeans.inertia_)

# Optional: visualize elbow
fig, ax = plt.subplots()
ax.plot(K_range, inertia, marker='o')
ax.set(
    xlabel="Number of Performance Tiers",
    ylabel="Inertia (Sum of Squared Distances)",
    title="Elbow Method for Optimal Number of Performance Tiers",
)
plt.show()

# For simplicity, pick 5 tiers (or choose from elbow visually)
optimal_tiers = 5
kmeans_perf = KMeans(n_clusters=optimal_tiers, n_init=10, random_state=42)
df['Tier'] = kmeans_perf.fit_predict(perf_values)

# Relabel the clusters so Tier 1 is the one with the highest mean
# Performance Index, Tier 2 the next highest, and so on
tier_means = df.groupby('Tier')['Performance_Index'].mean()
tier_order = tier_means.sort_values(ascending=False).index
tier_map = {cluster: rank for rank, cluster in enumerate(tier_order, start=1)}
df['Tier'] = df['Tier'].map(tier_map)

# --- Predict Salary using XGBoost ---
X = df[numeric_cols].fillna(0)  # missing metrics are treated as 0
y = df['SALARY'].values.astype(float)

# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only. Fitting the scaler on the full matrix lets the held-out rows influence
# the preprocessing that the test metrics are meant to be independent of.
# The row partition is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full scaled matrix, kept under its original name

xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror'
)
xgb.fit(X_train, y_train)

# --- Evaluate Salary Prediction ---
# RMSE and R² are computed on the 20% of rows held out above
y_pred_test = xgb.predict(X_test)
print(f"XGBoost RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.0f}")
print(f"XGBoost R²: {r2_score(y_test, y_pred_test):.3f}")

# --- Feature Importance ---
# Pair each feature name with the model's importance score, largest first
importance_by_feature = {
    "feature": numeric_cols,
    "importance": xgb.feature_importances_,
}
feat_imp = pd.DataFrame(importance_by_feature).sort_values(
    by="importance", ascending=False
)

print("\nTop XGBoost Feature Importances:")
print(feat_imp.head(15).to_string(index=False))

# --- Predict Salaries for All Players ---
# NOTE(review): this scores every row, including the rows the model was
# trained on, so those differences are in-sample — confirm that is intended.
X_all_scaled = scaler.transform(df[numeric_cols].fillna(0))
df['Predicted_Salary'] = xgb.predict(X_all_scaled)
# Actual minus predicted: negative means the player earns less than predicted
df['Salary_Diff'] = df['SALARY'].sub(df['Predicted_Salary'])

# --- Salary Ranges for Each Performance Tier ---
tier_salary_ranges = (
    df.groupby('Tier')['Predicted_Salary']
    .agg(['min', 'max', 'mean', 'count'])
    .round(0)
    .rename(columns={'min':'Min_Salary','max':'Max_Salary','mean':'Avg_Salary','count':'Num_Players'})
)
print("\n--- Salary Ranges by Performance Tier ---")
print(tier_salary_ranges)

# --- Payment Status ---
# Anything not strictly below the prediction (including an exact match) is 'Overpaid'
earns_below_prediction = df['Salary_Diff'] < 0
df['Payment_Status'] = np.where(earns_below_prediction, 'Underpaid', 'Overpaid')

# --- Save Final CSV ---
# Only the identification, tier and salary-analysis columns are exported
output_cols = ['PLAYER', 'Season', 'Tier', 'Performance_Index', 'SALARY', 'Predicted_Salary', 'Salary_Diff', 'Payment_Status']
df[output_cols].to_csv("Player_Performance_Tiers_And_Salary_Analysis.csv", index=False)
print("\n✅ Analysis saved to 'Player_Performance_Tiers_And_Salary_Analysis.csv'")


# --- Top 20 Players by Performance Index ---
# The display order differs from the CSV: Performance_Index comes before Tier
top_20_cols = ['PLAYER', 'Season', 'Performance_Index', 'Tier', 'SALARY', 'Predicted_Salary', 'Salary_Diff', 'Payment_Status']
print("\n--- Top 20 Players by Performance Index ---")
top_20 = df.sort_values(by='Performance_Index', ascending=False).iloc[:20]
print(top_20[top_20_cols].to_string(index=False))



In [None]:
# XGBoost Regressor for NBA dataset

# --- Libraries ---
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# --- Load & Clean Data ---
df = pd.read_csv("All_Seasons_Clean.csv")

# Columns to drop (names that are absent from the file are skipped)
drop_cols = ['UNNAMED:_0', 'Unnamed:_0', 'DD2', 'TD3', '3P','3PA', 'Age', 'Min', '+/-', 'Id', 'REB']
df = df.drop(columns=drop_cols, errors='ignore')

# Drop 'Player_Comp_Share' if it exists
df = df.drop(columns=['Player_Comp_Share'], errors='ignore')

# --- Consolidate Salary Columns ---
# Build 'Salary_Final' from whichever salary spelling(s) the file provides.
# With both present, 'SALARY' wins and 'Salary' fills its gaps. A file with
# only one spelling is accepted instead of failing on the missing column.
has_upper_salary = 'SALARY' in df.columns
has_mixed_salary = 'Salary' in df.columns
if has_upper_salary and has_mixed_salary:
    df['Salary_Final'] = df['SALARY'].combine_first(df['Salary'])
elif has_upper_salary:
    df['Salary_Final'] = df['SALARY']
elif has_mixed_salary:
    df['Salary_Final'] = df['Salary']
else:
    # No salary information at all: nothing downstream can work
    raise KeyError("Expected a 'SALARY' or 'Salary' column in 'All_Seasons_Clean.csv'")

# --- Feature Setup ---
# Features are all numeric columns except the three salary columns
salary_cols = ('SALARY', 'Salary', 'Salary_Final')
numeric_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in salary_cols
]

# Separate players with known salaries from those without one
salary_known = df['Salary_Final'].notna()
df_known_salary = df[salary_known].reset_index(drop=True)
df_missing_salary = df[~salary_known].reset_index(drop=True)

# Training matrix and target come from the known-salary rows only
X_known = df_known_salary[numeric_cols].fillna(0)
y_known = df_known_salary['Salary_Final']

# --- Split Data for evaluation ---
# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only. Fitting the scaler on every known-salary row lets the held-out rows
# influence the preprocessing that the test metrics should be independent of.
# The row partition is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_known, y_known, test_size=0.2, random_state=42
)

# --- Scaling ---
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_known_scaled = scaler.transform(X_known)  # kept under its original name
X_all_scaled = scaler.transform(df[numeric_cols].fillna(0))  # scale all rows

# --- XGBoost Model ---
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train, y_train)

# --- Evaluate on test set ---
# RMSE and R² are computed on the 20% of known-salary rows held out above
y_pred_test = xgb.predict(X_test)
print(f"XGBoost RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.0f}")
print(f"XGBoost R²: {r2_score(y_test, y_pred_test):.3f}")

# --- Feature Importance ---
# Pair each feature name with the model's importance score, largest first
importance_by_feature = {
    "feature": numeric_cols,
    "importance": xgb.feature_importances_,
}
feat_imp = pd.DataFrame(importance_by_feature).sort_values(
    by="importance", ascending=False
)
print("\nTop XGBoost Feature Importances:")
print(feat_imp.head(15).to_string(index=False))

# --- Predict Salaries for ALL players ---
# Must run before the sort below: X_all_scaled is in the current row order of df
df['Predict_Salary'] = xgb.predict(X_all_scaled)

# Salary difference (actual minus predicted; NaN where the salary is unknown)
df['Salary_Diff'] = df['Salary_Final'].sub(df['Predict_Salary'])

# --- Compute Performance Index (0-100 percentile based on predicted salary) ---
df['Performance_Index'] = df['Predict_Salary'].rank(pct=True).mul(100).round(2)

# --- Create 10 Performance Tiers (Tier 1 = Best, Tier 10 = Lowest) ---
df = df.sort_values(by='Predict_Salary', ascending=False).reset_index(drop=True)
# qcut numbers the deciles 0 (lowest predictions) to 9 (highest);
# subtracting from 10 flips that into Tier 1 (highest) ... Tier 10 (lowest)
salary_decile = pd.qcut(df['Predict_Salary'], q=10, labels=False)
df['Tier'] = 10 - salary_decile

# --- Salary Ranges for Each Tier ---
tier_salary_ranges = (
    df.groupby('Tier')['Predict_Salary']
    .agg(['min', 'max', 'mean', 'count'])
    .round(0)
    .rename(columns={'min': 'Min_Salary', 'max': 'Max_Salary', 'mean': 'Avg_Salary', 'count': 'Num_Players'})
)
print("\n--- Salary Ranges by Tier ---")
print(tier_salary_ranges)

# --- Add Under/Overpaid Indicator for players with known salary ---
# Salary_Diff is actual salary minus predicted salary, so a NEGATIVE value
# means the player earns less than the model predicts -> 'Underpaid'.
# (The previous `> 0 -> 'Underpaid'` test had the two labels swapped.)
payment_status = pd.Series(
    np.where(df['Salary_Diff'] < 0, 'Underpaid', 'Overpaid'),
    index=df.index,
    dtype=object,
)
# Rows without a known salary get a real missing value. Mixing np.nan into
# np.where's string branches cannot express that: depending on the NumPy
# version it is stored as the text 'nan' or fails on string/float promotion.
df['Payment_Status'] = payment_status.where(df['Salary_Final'].notna())

# --- Save Full Player Analysis to CSV ---
# Every column of df is written, not just the analysis columns
full_output_path = "Player_Tiers_And_Salary_Analysis_Full.csv"
df.to_csv(full_output_path, index=False)
print("\n✅ Full analysis saved to 'Player_Tiers_And_Salary_Analysis_Full.csv'")

# --- Optional: Display sample player results ---
# First 50 rows in the frame's current order
sample_cols = ['PLAYER', 'Season', 'Tier', 'Performance_Index', 'Salary_Final', 'Predict_Salary', 'Salary_Diff', 'Payment_Status']
print("\n--- Sample Player Salary Analysis ---")
print(df.loc[:, sample_cols].head(50).to_string(index=False))


In [None]:
# -----------------------------
# WNBA Player Salary Optimization with Random Forest & Performance Index
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 1. Load & Clean Data
# -----------------------------
df = pd.read_csv("Full WNBA Dataset.csv")

# Discard 'Player_Comp_Share' when the file provides it (no-op otherwise)
df = df.drop(columns=['Player_Comp_Share'], errors='ignore')

# Make 'SALARY' the single salary column: keep its own values and fall back to
# 'Salary' where it is missing, or copy 'Salary' outright when 'SALARY' is absent
has_upper_salary = 'SALARY' in df.columns
has_mixed_salary = 'Salary' in df.columns
if has_upper_salary and has_mixed_salary:
    df['SALARY'] = df['SALARY'].combine_first(df['Salary'])
elif has_mixed_salary:
    df['SALARY'] = df['Salary']

# Keep only rows that have a salary and renumber the index from 0
df = df.dropna(subset=['SALARY']).reset_index(drop=True)

# Independent copy of the identifying columns
player_info = df.loc[:, ['PLAYER', 'Season']].copy()

# -----------------------------
# 2. Feature Setup
# -----------------------------
# Every numeric column is a candidate feature except the salary target itself.
# 'Salary' is excluded alongside 'SALARY': when the CSV carries both spellings,
# the salary consolidation in step 1 leaves 'Salary' in the frame, and keeping
# it as a feature would leak the target into both the salary model and the
# Performance Index.
non_feature_cols = {'SALARY', 'Salary', 'Player_Comp_Share'}
numeric_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in non_feature_cols
]

# -----------------------------
# 3. Compute Performance Index
# -----------------------------
# Standardize every feature column (missing values are filled with 0 first)
perf_scaled = StandardScaler().fit_transform(df[numeric_cols].fillna(0))
# Row-wise mean of the standardized features, converted to a percentile rank
# scaled to 0-100 and rounded to two decimals
mean_scaled_metric = pd.Series(perf_scaled.mean(axis=1), index=df.index)
df['Performance_Index'] = mean_scaled_metric.rank(pct=True).mul(100).round(2)

# -----------------------------
# 4. Dynamic Tier Assignment via KMeans
# -----------------------------
# KMeans is fit on a single-column matrix holding the Performance Index
perf_values = df['Performance_Index'].to_numpy().reshape(-1, 1)

# Optional: Elbow plot — inertia of a KMeans fit for every k from 2 to 10
k_values = range(2, 11)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(perf_values)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker='o')
ax.set(
    xlabel="Number of Performance Tiers",
    ylabel="Inertia (Sum of Squared Distances)",
    title="Elbow Method for Optimal Performance Tiers",
)
plt.show()

# Choose 5 tiers for simplicity
optimal_tiers = 5
kmeans_perf = KMeans(n_clusters=optimal_tiers, n_init=10, random_state=42)
df['Tier'] = kmeans_perf.fit_predict(perf_values)

# Relabel the clusters so Tier 1 is the one with the highest mean
# Performance Index, Tier 2 the next highest, and so on
tier_means = df.groupby('Tier')['Performance_Index'].mean()
tier_order = tier_means.sort_values(ascending=False).index
tier_map = {cluster: rank for rank, cluster in enumerate(tier_order, start=1)}
df['Tier'] = df['Tier'].map(tier_map)

# -----------------------------
# 5. Random Forest Regression for Salary Prediction
# -----------------------------
X = df[numeric_cols].fillna(0)  # missing metrics are treated as 0
y = df['SALARY'].values.astype(float)

# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only. Fitting the scaler on the full matrix lets the held-out rows influence
# the preprocessing that the test metrics are meant to be independent of.
# The row partition is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_scaled = scaler.transform(X)  # full scaled matrix, kept under its original name

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Evaluate on the 20% of rows held out above
y_pred_test = rf.predict(X_test)
print(f"Random Forest RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):.0f}")
print(f"Random Forest R²: {r2_score(y_test, y_pred_test):.3f}")


# -----------------------------
# 6. Feature Importance
# -----------------------------
# Pair each feature name with the forest's importance score, largest first
importance_by_feature = {
    "feature": numeric_cols,
    "importance": rf.feature_importances_,
}
feat_imp = pd.DataFrame(importance_by_feature).sort_values(
    by="importance", ascending=False
)

print("\nTop Random Forest Feature Importances:")
print(feat_imp.head(15).to_string(index=False))

# -----------------------------
# 7. Predict Salaries for All Players
# -----------------------------
# NOTE(review): this scores every row, including the rows the forest was
# trained on, so those differences are in-sample — confirm that is intended.
X_all_scaled = scaler.transform(df[numeric_cols].fillna(0))
df['Predicted_Salary'] = rf.predict(X_all_scaled)
# Actual minus predicted: negative means the player earns less than predicted
df['Salary_Diff'] = df['SALARY'].sub(df['Predicted_Salary'])

# -----------------------------
# 8. Salary Ranges by Performance Tier
# -----------------------------
tier_salary_ranges = (
    df.groupby('Tier')['Predicted_Salary']
    .agg(['min', 'max', 'mean', 'count'])
    .round(0)
    .rename(columns={'min':'Min_Salary','max':'Max_Salary','mean':'Avg_Salary','count':'Num_Players'})
)
print("\n--- Salary Ranges by Performance Tier ---")
print(tier_salary_ranges)

# -----------------------------
# 9. Payment Status
# -----------------------------
# Anything not strictly below the prediction (including an exact match) is 'Overpaid'
earns_below_prediction = df['Salary_Diff'] < 0
df['Payment_Status'] = np.where(earns_below_prediction, 'Underpaid', 'Overpaid')

# -----------------------------
# 10. Save Final CSV
# -----------------------------
# Only the identification, tier and salary-analysis columns are exported
output_cols = ['PLAYER', 'Season', 'Tier', 'Performance_Index', 'SALARY', 'Predicted_Salary', 'Salary_Diff', 'Payment_Status']
df[output_cols].to_csv("Player_Performance_Tiers_And_Salary_RF.csv", index=False)
print("\n✅ Analysis saved to 'Player_Performance_Tiers_And_Salary_RF.csv'")

# -----------------------------
# 11. Display Top Players
# -----------------------------
# Same columns as the CSV, limited to the 20 highest Performance Index rows
print("\n--- Top 20 Players by Performance Index ---")
top_20 = df.sort_values(by='Performance_Index', ascending=False).iloc[:20]
print(top_20.loc[:, output_cols].to_string(index=False))


In [None]:
# -----------------------------
# NBA Player Salary Optimization with Random Forest & Performance Index
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# -----------------------------
# 1. Load & Clean Data
# -----------------------------
df = pd.read_csv("All_Seasons_Clean.csv")

# Drop unnecessary columns if they exist (absent names are skipped)
drop_cols = ['UNNAMED:_0', 'Unnamed:_0', 'DD2', '3PA', 'TD3', '3P',
             'Age', 'Min', '+/-', 'Id', 'FT%', 'FG%', 'FTA', 'FGA', 'REB']
df = df.drop(columns=drop_cols, errors='ignore')

# Drop 'Player_Comp_Share' if it exists
df = df.drop(columns=['Player_Comp_Share'], errors='ignore')

# --- Consolidate Salary Columns ---
# With both spellings present, 'SALARY' wins and 'Salary' fills its gaps;
# with one present it is copied; with neither the column is filled with None
has_upper_salary = 'SALARY' in df.columns
has_mixed_salary = 'Salary' in df.columns
if has_upper_salary and has_mixed_salary:
    df['Salary_Final'] = df['SALARY'].combine_first(df['Salary'])
elif has_upper_salary:
    df['Salary_Final'] = df['SALARY']
elif has_mixed_salary:
    df['Salary_Final'] = df['Salary']
else:
    df['Salary_Final'] = None

# Separate rows with known salaries from those without one
salary_known = df['Salary_Final'].notna()
df_known_salary = df[salary_known].reset_index(drop=True)
df_missing_salary = df[~salary_known].reset_index(drop=True)

# Independent copy of the identifying columns
player_info = df.loc[:, ['PLAYER', 'Season']].copy()

# -----------------------------
# 2. Feature Setup
# -----------------------------
# Features are all numeric columns except the three salary columns
salary_cols = ('SALARY', 'Salary', 'Salary_Final')
numeric_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in salary_cols
]

# Training matrix and target come from the known-salary rows only
X_known = df_known_salary[numeric_cols].fillna(0)
y_known = df_known_salary['Salary_Final'].astype(float)

# -----------------------------
# 3. Compute Performance Index (Using Scaled Feature Average)
# -----------------------------
# Uses its own scaler, fit on every row of df (known and unknown salary alike)
scaler_perf = StandardScaler()
X_all_perf = df[numeric_cols].fillna(0)
X_all_scaled_perf = scaler_perf.fit_transform(X_all_perf)

# Row-wise mean of the standardized features, converted to a percentile rank
# scaled to 0-100 and rounded to two decimals
mean_scaled_metric = pd.Series(X_all_scaled_perf.mean(axis=1), index=df.index)
df['Performance_Index'] = mean_scaled_metric.rank(pct=True).mul(100).round(2)

# -----------------------------
# 4. Random Forest Regression for Salary Prediction
# -----------------------------
# Split BEFORE scaling so the scaler's mean/std come from the training rows
# only. Fitting the scaler on every known-salary row lets the held-out rows
# influence the preprocessing that the test metrics should be independent of.
# The row partition is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_known, y_known, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_known_scaled = scaler.transform(X_known)  # kept under its original name

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Evaluate on the 20% of known-salary rows held out above
y_pred_test_rf = rf.predict(X_test)
print(f"Random Forest RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test_rf)):.0f}")
print(f"Random Forest R²: {r2_score(y_test, y_pred_test_rf):.3f}")

# -----------------------------
# 5. Feature Importance
# -----------------------------
# Pair each feature name with the forest's importance score, largest first
importance_by_feature = {
    "feature": numeric_cols,
    "importance": rf.feature_importances_,
}
feat_imp = pd.DataFrame(importance_by_feature).sort_values(
    by="importance", ascending=False
)

print("\nTop Random Forest Feature Importances:")
print(feat_imp.head(15).to_string(index=False))

# -----------------------------
# 6. Predict Salaries for ALL Players (Known + Missing)
# -----------------------------
# Must run before the sort in step 7: predictions are assigned by row position
X_all = df[numeric_cols].fillna(0)
X_all_scaled = scaler.transform(X_all)
df['Predict_Salary'] = rf.predict(X_all_scaled)

# Salary difference (actual minus predicted; NaN where the salary is unknown)
df['Salary_Diff'] = df['Salary_Final'].sub(df['Predict_Salary'])

# -----------------------------
# 7. Performance Tiers Based on Predicted Salary
# -----------------------------
df = df.sort_values(by='Predict_Salary', ascending=False).reset_index(drop=True)
# qcut numbers the deciles 0 (lowest predictions) to 9 (highest);
# subtracting from 10 flips that into Tier 1 (highest) ... Tier 10 (lowest)
salary_decile = pd.qcut(df['Predict_Salary'], q=10, labels=False)
df['Tier'] = 10 - salary_decile

# -----------------------------
# 8. Salary Ranges by Tier
# -----------------------------
tier_salary_ranges = (
    df.groupby('Tier')['Predict_Salary']
    .agg(['min', 'max', 'mean', 'count'])
    .round(0)
    .rename(columns={'min':'Min_Salary','max':'Max_Salary','mean':'Avg_Salary','count':'Num_Players'})
)
print("\n--- Salary Ranges by Performance Tier ---")
print(tier_salary_ranges)

# -----------------------------
# 9. Payment Status (Only for players with known salary)
# -----------------------------
# Salary_Diff is actual salary minus predicted salary, so a NEGATIVE value
# means the player earns less than the model predicts -> 'Underpaid'.
# (The previous `> 0 -> 'Underpaid'` test had the two labels swapped.)
payment_status = pd.Series(
    np.where(df['Salary_Diff'] < 0, 'Underpaid', 'Overpaid'),
    index=df.index,
    dtype=object,
)
# Rows without a known salary get a real missing value. Mixing np.nan into
# np.where's string branches cannot express that: depending on the NumPy
# version it is stored as the text 'nan' or fails on string/float promotion.
df['Payment_Status'] = payment_status.where(df['Salary_Final'].notna())

# -----------------------------
# 10. Save Final CSV
# -----------------------------
# Only the identification, tier and salary-analysis columns are exported
output_cols = ['PLAYER', 'Season', 'Tier', 'Performance_Index', 'Salary_Final', 'Predict_Salary', 'Salary_Diff', 'Payment_Status']
df[output_cols].to_csv("Player_Tiers_And_Salary_RF.csv", index=False)
print("\n✅ Full analysis saved to 'Player_Tiers_And_Salary_RF.csv'")

# -----------------------------
# 11. Display Sample Output
# -----------------------------
# First 50 rows in the frame's current order, same columns as the CSV
print("\n--- Sample Player Salary Analysis ---")
print(df.loc[:, output_cols].head(50).to_string(index=False))
