In [0]:
%pip install xgboost

Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
import pyspark.sql.functions as F 
from sklearn.metrics import mean_squared_error, r2_score
import mlflow.sklearn
from mlflow.models import infer_signature
import mlflow
from mlflow.tracking import MlflowClient
from xgboost import XGBRegressor

In [0]:
# Resolve the target environment from the notebook's "ENV" widget.
# If the widget cannot be read for any reason, fall back to "prod".
try:
    ENV = dbutils.widgets.get("ENV")
except Exception:
    ENV = "prod"

# The only environments this notebook accepts.
valid_envs = {"dev", "test", "prod"}

# Report an unsupported environment and ask the notebook runtime to exit.
env_is_valid = ENV in valid_envs
if not env_is_valid:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Feature schema name is derived from the environment, e.g. "fpl_feature_prod".
feature_schema = f"fpl_feature_{ENV}"

In [0]:
# Load the player_features table from the environment's feature schema,
# keeping only rows whose season_key is 202122 or later.
player_features_df = (
    spark.read
    .table(f"{feature_schema}.player_features")
    .where(F.col("season_key") >= 202122)
)

# Collect the filtered Spark DataFrame into a pandas DataFrame.
player_features_pd = player_features_df.toPandas()

In [0]:
import numpy as np

# Model input columns. The order is significant: it fixes the column order of
# the feature matrix built from this list.
feature_cols = [
    "was_home",
    # rolling_* player columns
    "rolling_expected_goals",
    "rolling_expected_assists",
    "rolling_expected_goal_involvements",
    "rolling_goals_scored",
    "rolling_assists",
    "rolling_total_points",
    "rolling_minutes",
    "rolling_clean_sheets",
    "rolling_bps",
    "rolling_ict_index",
    "rolling_influence",
    "rolling_creativity",
    "rolling_threat",
    "rolling_defensive_contribution",
    "rolling_clearances_blocks_interceptions",
    "rolling_bonus",
    "rolling_saves",
    "rolling_games_played",
    "rolling_minutes_points",
    "rolling_assist_points",
    "rolling_goal_points",
    "rolling_clean_sheet_points",
    "rolling_defensive_contribution_points",
    "rolling_penalty_miss_points",
    "rolling_goals_conceded_points",
    "rolling_yellow_card_points",
    "rolling_red_card_points",
    "rolling_own_goal_points",
    # avg_* player columns
    "avg_expected_goals",
    "avg_expected_assists",
    "avg_expected_goal_involvements",
    "avg_goals_scored",
    "avg_assists",
    "avg_total_points",
    "avg_minutes",
    "avg_clean_sheets",
    "avg_bps",
    "avg_ict_index",
    "avg_influence",
    "avg_creativity",
    "avg_threat",
    "avg_defensive_contribution",
    "avg_clearances_blocks_interceptions",
    "avg_bonus",
    "avg_saves",
    "avg_minutes_points",
    "avg_assist_points",
    "avg_goal_points",
    "avg_clean_sheet_points",
    "avg_defensive_contribution_points",
    "avg_penalty_miss_points",
    "avg_goals_conceded_points",
    "avg_yellow_card_points",
    "avg_red_card_points",
    "avg_own_goal_points",
    # team-level rolling_* / avg_* columns
    "rolling_points",
    "rolling_team_expected_goals",
    "rolling_expected_goals_against",
    "rolling_goal_difference",
    "avg_team_expected_goals",
    "avg_team_expected_assists",
    "avg_team_expected_goal_involvements",
    "avg_expected_goals_against",
    "avg_expected_assists_against",
    "avg_expected_goal_involvements_against",
    "avg_goal_difference",
    "team_rolling_goals_conceded",
    "team_rolling_goal_difference",
    "player_share_of_team_xG",
    "player_share_of_team_points",
    # opponent_* columns
    "opponent_rolling_points",
    "opponent_rolling_team_expected_goals",
    "opponent_rolling_expected_goals_against",
    "opponent_rolling_goal_difference",
    "opponent_avg_team_expected_goals",
    "opponent_avg_team_expected_assists",
    "opponent_avg_team_expected_goal_involvements",
    "opponent_avg_expected_goals_against",
    "opponent_avg_expected_assists_against",
    "opponent_avg_expected_goal_involvements_against",
    "opponent_avg_goal_difference",
]

# Replace missing feature values with the sentinel -1, then cast every
# feature column to float. This overwrites the columns in player_features_pd.
player_features_pd[feature_cols] = (
    player_features_pd[feature_cols].fillna(-1).astype(float)
)

# Column the model is trained to predict.
target_col = "total_points"

# Preview the first rows of the feature matrix together with the target.
player_features_pd[[*feature_cols, target_col]].head()

Unnamed: 0,was_home,rolling_expected_goals,rolling_expected_assists,rolling_expected_goal_involvements,rolling_goals_scored,rolling_assists,rolling_total_points,rolling_minutes,rolling_clean_sheets,rolling_bps,rolling_ict_index,rolling_influence,rolling_creativity,rolling_threat,rolling_defensive_contribution,rolling_clearances_blocks_interceptions,rolling_bonus,rolling_saves,rolling_games_played,rolling_minutes_points,rolling_assist_points,rolling_goal_points,rolling_clean_sheet_points,rolling_defensive_contribution_points,rolling_penalty_miss_points,rolling_goals_conceded_points,rolling_yellow_card_points,rolling_red_card_points,rolling_own_goal_points,avg_expected_goals,avg_expected_assists,avg_expected_goal_involvements,avg_goals_scored,avg_assists,avg_total_points,avg_minutes,avg_clean_sheets,avg_bps,avg_ict_index,avg_influence,...,avg_clearances_blocks_interceptions,avg_bonus,avg_saves,avg_minutes_points,avg_assist_points,avg_goal_points,avg_clean_sheet_points,avg_defensive_contribution_points,avg_penalty_miss_points,avg_goals_conceded_points,avg_yellow_card_points,avg_red_card_points,avg_own_goal_points,rolling_points,rolling_team_expected_goals,rolling_expected_goals_against,rolling_goal_difference,avg_team_expected_goals,avg_team_expected_assists,avg_team_expected_goal_involvements,avg_expected_goals_against,avg_expected_assists_against,avg_expected_goal_involvements_against,avg_goal_difference,team_rolling_goals_conceded,team_rolling_goal_difference,player_share_of_team_xG,player_share_of_team_points,opponent_rolling_points,opponent_rolling_team_expected_goals,opponent_rolling_expected_goals_against,opponent_rolling_goal_difference,opponent_avg_team_expected_goals,opponent_avg_team_expected_assists,opponent_avg_team_expected_goal_involvements,opponent_avg_expected_goals_against,opponent_avg_expected_assists_against,opponent_avg_expected_goal_involvements_against,opponent_avg_goal_difference,total_points
0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9.0,6.5,7.33,1.0,1.3,1.05,2.35,1.466,0.962,2.428,0.2,7.33,1.0,-1.0,-1.0,9.0,5.94,5.84,2.0,1.188,0.758,1.946,1.168,0.858,2.026,0.4,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,5.85,7.75,1.0,1.17,0.916,2.086,1.55,0.956,2.506,0.2,7.75,1.0,0.0,0.0,2.0,4.28,7.47,-6.0,0.856,0.41,1.266,1.494,1.206,2.7,-1.2,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.03,7.79,1.0,1.406,1.04,2.446,1.558,0.92,2.478,0.2,7.79,1.0,0.0,0.0,6.0,7.02,6.48,-5.0,1.404,1.146,2.55,1.296,0.77,2.066,-1.0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.03,7.23,2.0,1.606,1.114,2.72,1.446,0.738,2.184,0.4,7.23,2.0,0.0,0.0,11.0,7.99,6.34,4.0,1.598,1.138,2.736,1.268,0.492,1.76,0.8,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,8.02,6.94,3.0,1.604,1.012,2.616,1.388,0.818,2.206,0.6,6.94,3.0,0.0,0.0,6.0,9.59,8.85,1.0,1.918,1.012,2.93,1.77,0.868,2.638,0.2,0


In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

# Feature matrix, target vector, and the season label used for stratification.
X = player_features_pd[feature_cols]
y = player_features_pd[target_col]
season_key = player_features_pd["season_key"]

# One shuffled 80/20 split, stratified on season_key so each season keeps the
# same share of rows in both partitions. With n_splits=1 the generator yields
# exactly one (train, test) index pair, so it is taken directly.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, season_key))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

print(f"Training set count: {len(X_train)}")
print(f"Test set count: {len(X_test)}")

Training set count: 120975
Test set count: 30244


In [0]:
# Train an XGBRegressor on the training split, score it on the test split, and
# promote it to the registry's "champion" alias only when it beats the metrics
# logged for the current champion (or when no champion exists yet).

# Fixed hyperparameters for the XGBRegressor.
best_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8
}

model = XGBRegressor(
    random_state=42,
    tree_method="hist",
    verbosity=0,
    **best_params
)
model.fit(X_train, y_train)

# Score the freshly trained model on the held-out test split.
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

# A few training rows and their predictions, used to infer the model signature
# and stored alongside the model as an input example.
input_example = X_train.head(5)
output_example = model.predict(input_example)
signature = infer_signature(input_example, output_example)

model_name = "FPL_TotalPoints_XGBoost_v2"
alias_name = "champion"
client = MlflowClient()

# MLflow error codes treated as "there is no champion to compare against"
# (missing registered model, alias, or run). Any other failure, such as a
# permission or server error, must not be mistaken for a missing champion,
# because that would promote the new model without any comparison.
# NOTE(review): the exact code for a missing alias differs between registry
# backends -- confirm this set against the backend in use.
missing_champion_codes = {
    "RESOURCE_DOES_NOT_EXIST",
    "NOT_FOUND",
    "INVALID_PARAMETER_VALUE",
}

# Try to get current champion model metrics
try:
    current_version = client.get_model_version_by_alias(model_name, alias_name)
    current_run_id = current_version.run_id
    current_metrics = client.get_run(current_run_id).data.metrics
    # A champion run without logged metrics is treated as the worst possible
    # score, so any new model beats it.
    current_rmse = current_metrics.get("rmse", float("inf"))
    current_r2 = current_metrics.get("r2", float("-inf"))
    has_champion = True
except mlflow.exceptions.MlflowException as exc:
    if exc.error_code not in missing_champion_codes:
        # Unexpected registry/tracking failure: stop rather than overwrite the
        # champion alias blindly.
        raise
    print(f"No existing champion found for '{model_name}' ({exc.error_code}).")
    has_champion = False
    current_rmse = float("inf")
    current_r2 = float("-inf")

# Register new model if metrics are better or if no champion exists.
# "Better" means strictly higher R2 AND strictly lower RMSE.
if (not has_champion) or ((r2 > current_r2) and (rmse < current_rmse)):
    with mlflow.start_run(run_name="XGBoost_Retrained") as run:
        for param, value in model.get_params().items():
            mlflow.log_param(param, value)
        # These metric names are the ones read back above on the next retrain.
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example
        )
        run_id = run.info.run_id
        model_uri = f"runs:/{run_id}/model"
        # Create a new registered version and point the champion alias at it.
        result = mlflow.register_model(model_uri, model_name)
        client.set_registered_model_alias(model_name, alias_name, result.version)
        print(f"New champion registered: '{model_name}' version {result.version}. RMSE: {rmse:.4f}, R2: {r2:.4f}")
else:
    print(f"Model not registered. Current champion RMSE: {current_rmse:.4f}, R2: {current_r2:.4f}. New RMSE: {rmse:.4f}, R2: {r2:.4f}")

Model not registered. Current champion RMSE: 1.7752, R2: 0.4194. New RMSE: 1.8086, R2: 0.4179
