In [None]:
%pip install matplotlib seaborn scikit-learn pandas numpy

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the player dataset, list its column names, and preview the first rows.
df = pd.read_csv("phase2.1_enhanced_fps_no_roles.csv")
print(list(df.columns))
df.head()

In [None]:
# Candidate predictor columns. Any that are absent from the loaded CSV are
# skipped, so the notebook can still run when some columns are missing.
expected_features = [
    "Total_Matches", "Total_Bt_Runs", "Total_Bt_Balls",
    "Total_Bw_Runs", "Total_Bw_Balls", "Total_Wkts",
    "Strike_Rate", "Economy_Rate", "SR_Bonus_Points", "Econ_Bonus_Points"
]

available_features = [col for col in expected_features if col in df.columns]
missing_features = [col for col in expected_features if col not in df.columns]

# Report skipped columns instead of silently training on a smaller feature set.
if missing_features:
    print(f"Expected features not found in the data (skipped): {missing_features}")

# Fail here with a clear message rather than later, inside model fitting.
if not available_features:
    raise ValueError(
        "None of the expected feature columns are present in the data: "
        f"{expected_features}"
    )

# Target column: prefer Enhanced_FPS, fall back to Adjusted_FPS.
if "Enhanced_FPS" in df.columns:
    target = "Enhanced_FPS"
elif "Adjusted_FPS" in df.columns:
    target = "Adjusted_FPS"
else:
    raise KeyError("❌ No valid target column found (Enhanced_FPS or Adjusted_FPS).")

# Infinite feature values are treated as missing, and missing values become 0.
X = df[available_features].replace([np.inf, -np.inf], np.nan).fillna(0)
y = df[target]

# Missing targets would only surface later as an error during model fitting.
# Rows are deliberately not dropped here: a later cell writes rf.predict(X)
# back onto df, so X must keep one row per row of df.
n_missing_target = int(y.isna().sum())
if n_missing_target:
    raise ValueError(
        f"Target column '{target}' has {n_missing_target} missing value(s); "
        "clean or impute them before modelling."
    )

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
%pip install -U scikit-learn

In [None]:

from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline model: linear regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Hold-out metrics: RMSE is the square root of the test-set MSE.
mse = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression | R²: {r2_lr:.3f} | RMSE: {rmse_lr:.3f}")

In [None]:
# Random forest: 300 trees, no depth limit, fixed seed, n_jobs=-1.
rf = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Score the forest on the same hold-out split used for the linear baseline.
y_pred_rf = rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest | R²: {r2_rf:.3f} | RMSE: {rmse_rf:.3f}")

In [None]:
# Five-fold cross-validation of the random forest on the full feature matrix, scored with R².
cv_scores = cross_val_score(rf, X, y, scoring='r2', cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Random Forest Cross-Validation R² Mean: {cv_mean:.3f} ± {cv_std:.3f}")


In [None]:
# Hold-out metrics of both models side by side: one row per model.
perf = pd.DataFrame([
    {"Model": "Linear Regression", "R²": r2_lr, "RMSE": rmse_lr},
    {"Model": "Random Forest", "R²": r2_rf, "RMSE": rmse_rf},
])
display(perf)

# Bar chart of R² per model; hue mirrors the x variable so the palette applies per bar.
ax = sns.barplot(data=perf, x="Model", y="R²", hue="Model", palette="Blues_d", legend=False)
ax.set_title("Model Comparison (R²)")
plt.show()

In [None]:
# Pair each feature with the fitted forest's importance score, largest first.
importances = (
    pd.DataFrame({"Feature": available_features, "Importance": rf.feature_importances_})
    .sort_values("Importance", ascending=False)
)
top_features = importances.head(10)

print("Top 10 Influential Features:")
display(top_features)

# Horizontal bar chart of the same top-10 table.
plt.figure(figsize=(8, 5))
sns.barplot(data=top_features, x="Importance", y="Feature", hue="Feature", palette="Greens_r")
plt.title("Feature Importance (Random Forest)")
plt.show()

In [None]:
# Score every row with the fitted forest. X is the full feature matrix, so this
# includes the rows the model was trained on, not just the hold-out split.
df["Predicted_Fantasy_Score"] = rf.predict(X)
df_sorted = df.sort_values(by="Predicted_Fantasy_Score", ascending=False)

# Write the ranked table to CSV without the index column.
output_file = "phase3_predicted_fantasy_scores.csv"
df_sorted.to_csv(output_file, index=False)

# Show the top 15 rows: player name, actual target, and predicted score.
leaderboard_cols = ["player_name", target, "Predicted_Fantasy_Score"]
display(df_sorted[leaderboard_cols].head(15))