In [0]:
import pyspark.sql.functions as F
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import mlflow.sklearn
from mlflow.models import infer_signature
import mlflow
from mlflow.tracking import MlflowClient

In [0]:
# Environment this notebook runs against.
ENV = "dev"

# Schema name, suffixed with the environment, that feature tables are read from.
feature_schema = "fpl_feature_" + ENV

In [0]:
# Load the player_features table from the feature schema, keeping only rows
# whose season_key is at least 202122
# (presumably the 2021/22 season onwards — TODO confirm the key encoding).
features_table = f"{feature_schema}.player_features"
from_min_season = F.col("season_key") >= 202122
player_features_df = spark.read.table(features_table).filter(from_min_season)

# Collect the filtered Spark DataFrame into a pandas DataFrame.
player_features_pd = player_features_df.toPandas()

In [0]:
import numpy as np

# Ordered list of model input columns. The order is significant: it fixes the
# column order of every frame selected with this list.
feature_cols = [
    "was_home",
    # rolling_* columns
    "rolling_expected_goals",
    "rolling_expected_assists",
    "rolling_expected_goal_involvements",
    "rolling_goals_scored",
    "rolling_assists",
    "rolling_total_points",
    "rolling_minutes",
    "rolling_clean_sheets",
    "rolling_bps",
    "rolling_ict_index",
    "rolling_influence",
    "rolling_creativity",
    "rolling_threat",
    "rolling_defensive_contribution",
    "rolling_clearances_blocks_interceptions",
    "rolling_bonus",
    "rolling_saves",
    "rolling_games_played",
    "rolling_minutes_points",
    "rolling_assist_points",
    "rolling_goal_points",
    "rolling_clean_sheet_points",
    "rolling_defensive_contribution_points",
    "rolling_penalty_miss_points",
    "rolling_goals_conceded_points",
    "rolling_yellow_card_points",
    "rolling_red_card_points",
    "rolling_own_goal_points",
    # avg_* columns
    "avg_expected_goals",
    "avg_expected_assists",
    "avg_expected_goal_involvements",
    "avg_goals_scored",
    "avg_assists",
    "avg_total_points",
    "avg_minutes",
    "avg_clean_sheets",
    "avg_bps",
    "avg_ict_index",
    "avg_influence",
    "avg_creativity",
    "avg_threat",
    "avg_defensive_contribution",
    "avg_clearances_blocks_interceptions",
    "avg_bonus",
    "avg_saves",
    "avg_minutes_points",
    "avg_assist_points",
    "avg_goal_points",
    "avg_clean_sheet_points",
    "avg_defensive_contribution_points",
    "avg_penalty_miss_points",
    "avg_goals_conceded_points",
    "avg_yellow_card_points",
    "avg_red_card_points",
    "avg_own_goal_points",
    # The names below suggest team-level aggregates — TODO confirm against
    # the feature table definition.
    "rolling_points",
    "rolling_team_expected_goals",
    "rolling_expected_goals_against",
    "rolling_goal_difference",
    "avg_team_expected_goals",
    "avg_team_expected_assists",
    "avg_team_expected_goal_involvements",
    "avg_expected_goals_against",
    "avg_expected_assists_against",
    "avg_expected_goal_involvements_against",
    "avg_goal_difference",
    "match_points",
    "team_rolling_goals_conceded",
    "team_rolling_goal_difference",
    "player_share_of_team_xG",
    "player_share_of_team_points",
    # opponent_* columns
    "opponent_rolling_points",
    "opponent_rolling_team_expected_goals",
    "opponent_rolling_expected_goals_against",
    "opponent_rolling_goal_difference",
    "opponent_avg_team_expected_goals",
    "opponent_avg_team_expected_assists",
    "opponent_avg_team_expected_goal_involvements",
    "opponent_avg_expected_goals_against",
    "opponent_avg_expected_assists_against",
    "opponent_avg_expected_goal_involvements_against",
    "opponent_avg_goal_difference",
]

# Name of the target column.
target_col = "total_points"

# Replace missing feature values with the sentinel -1, then cast every feature
# column to float; the result is written back onto the same DataFrame.
features_filled = player_features_pd[feature_cols].fillna(-1)
player_features_pd[feature_cols] = features_filled.astype(float)

# Show a sample of the feature matrix together with the target column.
player_features_pd[[*feature_cols, target_col]].head()

In [0]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector taken from the prepared pandas DataFrame.
X = player_features_pd[feature_cols]
y = player_features_pd[target_col]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): rows are split at random rather than by season_key — confirm
# this is intended given the rolling_* features.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each split.
print(f"Training set count: {len(X_train)}")
print(f"Test set count: {len(X_test)}")

In [0]:
# Candidate regressors to compare, as (display name, unfitted estimator) pairs.
models = [
    ("RandomForest", RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, max_depth=10, random_state=42)),
    ("Ridge", Ridge(alpha=1.0)),
    ("Lasso", Lasso(alpha=0.1)),
    ("SVR", SVR(C=1.0, epsilon=0.2)),
]

# Not used in this cell; left defined in case other cells rely on it.
client = MlflowClient()

# One (name, rmse, r2, registered version) tuple per trained model.
results = []

# Small sample of the training features. It is stored with every logged model
# and the model signature is inferred from it. It does not depend on the model,
# so it is taken once instead of on every iteration.
input_example = X_train.head(5)

for name, model in models:
    # Each model gets its own MLflow run so params, metrics and the artifact
    # are grouped together.
    with mlflow.start_run(run_name=f"{name}_regression") as run:
        model.fit(X_train, y_train)

        # Evaluate on the held-out split.
        y_pred = model.predict(X_test)
        # mean_squared_error is called with its defaults (MSE), so the square
        # root is taken here to obtain RMSE.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        r2 = r2_score(y_test, y_pred)

        # Infer the model signature from the sample inputs and the model's
        # predictions for those same inputs.
        output_example = model.predict(input_example)
        signature = infer_signature(input_example, output_example)

        # Log every hyperparameter and both metrics in batched calls.
        mlflow.log_params(model.get_params())
        mlflow.log_metrics({"rmse": rmse, "r2": r2})
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example,
        )

        # Register the artifact logged by this run under a per-model name;
        # the returned object carries the newly created version.
        run_id = run.info.run_id
        model_name = f"FPL_TotalPoints_{name}_SKLearn_21"
        model_uri = f"runs:/{run_id}/model"
        result = mlflow.register_model(model_uri, model_name)
        results.append((name, rmse, r2, result.version))
        print(f"{name} model registered as '{model_name}' with version: {result.version}. RMSE: {rmse:.4f}, R2: {r2:.4f}")

# Summary of all models once the loop has finished.
print("All models trained and registered.")
for name, rmse, r2, version in results:
    print(f"{name}: RMSE={rmse:.4f}, R2={r2:.4f}, Version={version}")