In [0]:
%pip install xgboost    


Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from functools import reduce
import mlflow
from xgboost import XGBRegressor

In [0]:
def write_to_table(
    df: DataFrame,
    table_name: str,
    mode: str = "overwrite",
    merge_schema: bool = True,
    partition_by: list[str] = None,
    path: str = None,
    save_as_table: bool = True
) -> None:
    """
    Write a DataFrame in Delta format, stamping every row with `last_updated`.

    Parameters:
    - df (DataFrame): Data to persist.
    - table_name (str): Target table name; only used when save_as_table is True.
    - mode (str): Spark save mode handed to the writer.
    - merge_schema (bool): When True the `mergeSchema` option is set. When False and
      mode is 'overwrite', the `overwriteSchema` option is set instead.
    - partition_by (list[str], optional): Partition columns; skipped when None or empty.
    - path (str, optional): Storage location; only used when save_as_table is False.
    - save_as_table (bool): True writes with saveAsTable(table_name); False writes to `path`.

    Raises:
    - ValueError: If save_as_table is False and no path is supplied.
    """
    stamped_df = df.withColumn("last_updated", F.current_timestamp())
    delta_writer = stamped_df.write.format("delta").mode(mode)

    # At most one schema-evolution option applies; merging takes precedence.
    if merge_schema:
        schema_option = "mergeSchema"
    elif mode == "overwrite":
        schema_option = "overwriteSchema"
    else:
        schema_option = None
    if schema_option is not None:
        delta_writer = delta_writer.option(schema_option, "true")

    if partition_by:
        delta_writer = delta_writer.partitionBy(*partition_by)

    # No destination at all: neither a table write nor a path write was requested.
    if not save_as_table and not path:
        raise ValueError("Either save_as_table must be True or a path must be provided.")

    if save_as_table:
        delta_writer.saveAsTable(table_name)
    else:
        delta_writer.save(path)

In [0]:
def merge_to_table(
    df: DataFrame,
    table_name: str,
    merge_condition: str,
    spark: SparkSession,
    partition_by: list[str] = None
) -> None:
    """
    Upsert a DataFrame into a Delta table, creating the table on first use.

    Parameters:
    - df (DataFrame): Incoming rows; a `last_updated` timestamp column is added before writing.
    - table_name (str): Target Delta table name.
    - merge_condition (str): SQL match condition. The incoming rows are aliased
      `source` and the existing table `target`.
    - spark (SparkSession): Session used for the catalog lookup and the Delta handle.
    - partition_by (list[str], optional): Partition columns, applied only when the
      table is created by this call.

    Matched rows have all columns updated; unmatched source rows are inserted.
    """
    incoming = df.withColumn("last_updated", F.current_timestamp())

    # First load: there is nothing to merge into, so fall back to a plain write.
    if not spark.catalog.tableExists(table_name):
        write_to_table(
            df=incoming,
            table_name=table_name,
            partition_by=partition_by
        )
        return

    target = DeltaTable.forName(spark, table_name).alias("target")
    upsert = target.merge(
        source=incoming.alias("source"),
        condition=merge_condition
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
def build_player_features(
    gameweek_stats_df,
    rolling_window_size,
    silver_schema,
    training = True
):
    """
    Build per-player, per-fixture rolling features with team and opponent context.

    Parameters:
    - gameweek_stats_df: Spark DataFrame of player-fixture stats. It must contain the
      identifier columns and raw stat columns referenced in this function.
    - rolling_window_size: Number of rows covered by each rolling window.
    - silver_schema: Schema name forwarded to build_team_features.
    - training: True -> the window covers the `rolling_window_size` rows before the
      current row (current row excluded). False -> the window ends on the current row.

    Returns:
    - DataFrame with identifier columns, rolling_* sums, avg_* per-game averages
      (rounded to 3 decimals), the player's team features, opponent_* team features,
      position-gated defensive columns and player-share ratios. The three boolean-like
      flags (was_home, exp_stats_available, def_con_available) are cast to int.
    """
    ordered_fixtures = Window.partitionBy(
        "player_surrogate_key", "season_key", "team_season_key"
    ).orderBy("gameweek_key", "fixture_key")

    # Training rows may only see earlier fixtures; inference rows include the current one.
    if training:
        frame_start, frame_end = -rolling_window_size, -1
    else:
        frame_start, frame_end = -rolling_window_size + 1, 0
    rolling_frame = ordered_fixtures.rowsBetween(frame_start, frame_end)

    is_first_row = F.row_number().over(ordered_fixtures) == 1

    # NOTE(review): the first-row null mask is applied in inference mode as well, even
    # though that window includes the current row — confirm this is intended.
    def _null_on_first(expr):
        return F.when(is_first_row, None).otherwise(expr)

    def _rolling_sum(source):
        return _null_on_first(F.sum(source).over(rolling_frame))

    # Order matters: it drives the order of the rolling_* and avg_* output columns.
    leading_stats = [
        "expected_goals", "expected_assists", "expected_goal_involvements",
        "goals_scored", "assists", "total_points", "minutes", "clean_sheets",
        "bps", "ict_index", "influence", "creativity", "threat",
        "defensive_contribution", "clearances_blocks_interceptions", "bonus",
    ]
    fpl_point_stats = [
        "minutes_points", "assist_points", "goal_points", "clean_sheet_points",
        "defensive_contribution_points", "penalty_miss_points", "goals_conceded_points",
        "yellow_card_points", "red_card_points", "own_goal_points",
    ]

    player_rolling_exprs = {f"rolling_{stat}": _rolling_sum(stat) for stat in leading_stats}
    # Position-specific: saves are only summed for position_key == 1
    player_rolling_exprs["rolling_saves"] = _rolling_sum(
        F.when(F.col("position_key") == 1, F.col("saves")).otherwise(None)
    )
    player_rolling_exprs["rolling_games_played"] = _null_on_first(
        F.count("fixture_key").over(rolling_frame)
    )
    player_rolling_exprs.update(
        {f"rolling_{stat}": _rolling_sum(stat) for stat in fpl_point_stats}
    )

    with_rolling_df = gameweek_stats_df.withColumns(player_rolling_exprs)

    # Per-game averages to 3 decimal places; null on the first row or when no games counted
    player_avg_exprs = {}
    for rolling_name in player_rolling_exprs:
        if rolling_name == "rolling_games_played":
            continue
        per_game = F.when(
            F.col("rolling_games_played") != 0,
            F.col(rolling_name) / F.col("rolling_games_played")
        ).otherwise(None)
        # rolling_name[8:] strips the "rolling_" prefix
        player_avg_exprs[f"avg_{rolling_name[8:]}"] = _null_on_first(F.round(per_game, 3))

    with_averages_df = with_rolling_df.withColumns(player_avg_exprs)

    team_features_df = build_team_features(
        gameweek_stats_df = gameweek_stats_df,
        rolling_window_size = rolling_window_size,
        silver_schema = silver_schema,
        training = training
    )

    # Team metrics attached twice: once for the player's team, once (prefixed) for the opponent
    team_metric_names = [
        "rolling_points", "rolling_team_expected_goals",
        "rolling_expected_goals_against", "rolling_goal_difference",
        "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
        "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
        "avg_goal_difference",
    ]

    # Join the player's own team features; join keys are renamed so they can be dropped afterwards
    players = with_averages_df.alias("pf")
    own_team = team_features_df.alias("tf").select(
        F.col("team_season_key").alias("_team_season_key"),
        F.col("season_key").alias("t_season_key"),
        F.col("gameweek_key").alias("team_gamweek_key"),
        F.col("fixture_key").alias("team_fixture_key"),
        *team_metric_names,
        "match_points"
    )
    own_team_match = (
        (players["team_season_key"] == own_team["_team_season_key"]) &
        (players["season_key"] == own_team["t_season_key"]) &
        (players["gameweek_key"] == own_team["team_gamweek_key"]) &
        (players["fixture_key"] == own_team["team_fixture_key"])
    )
    with_team_df = players.join(own_team, own_team_match, how="left").drop(
        "_team_season_key", "t_season_key", "team_gamweek_key", "team_fixture_key"
    )

    # Join opponent team strength features
    opponent = team_features_df.select(
        F.col("team_season_key").alias("_opponent_team_season_key"),
        F.col("season_key").alias("opponent_season_key"),
        F.col("gameweek_key").alias("opponent_gameweek_key"),
        F.col("fixture_key").alias("opponent_fixture_key"),
        *[F.col(metric).alias(f"opponent_{metric}") for metric in team_metric_names]
    )
    opponent_match = (
        (with_team_df["opponent_team_season_key"] == opponent["_opponent_team_season_key"]) &
        (with_team_df["season_key"] == opponent["opponent_season_key"]) &
        (with_team_df["gameweek_key"] == opponent["opponent_gameweek_key"]) &
        (with_team_df["fixture_key"] == opponent["opponent_fixture_key"])
    )
    with_opponent_df = with_team_df.join(opponent, opponent_match, how="left").drop(
        "_opponent_team_season_key", "opponent_season_key", "opponent_gameweek_key", "opponent_fixture_key"
    )

    # Team defensive stats apply only to position_key 1, 2 and 3; others get null
    gets_defensive_stats = F.col("position_key").isin([1,2,3])
    derived_exprs = {
        "team_rolling_goals_conceded": F.when(gets_defensive_stats, F.col("rolling_expected_goals_against")).otherwise(None),
        "team_rolling_goal_difference": F.when(gets_defensive_stats, F.col("rolling_goal_difference")).otherwise(None),
        # Player share of the team totals; null when the team total is zero
        "player_share_of_team_xG": F.round(
            F.when(
                F.col("rolling_team_expected_goals") != 0,
                F.col("rolling_expected_goals") / F.col("rolling_team_expected_goals")
            ).otherwise(None),
            3
        ),
        "player_share_of_team_points": F.round(
            F.when(
                F.col("rolling_points") != 0,
                F.col("rolling_total_points") / F.col("rolling_points")
            ).otherwise(None),
            3
        )
    }
    enriched_df = with_opponent_df.withColumns(derived_exprs)

    identifier_cols = [
        "fixture_key", "player_id", "player_key", "player_surrogate_key", "player_season_key", "player_fixture_key", "team_season_key", "season_key", "gameweek_key", "position_key", "opponent_team_season_key", "was_home", "exp_stats_available", "def_con_available", "total_points"
    ]

    # Final projection: identifiers, rolling sums, averages, team context, opponent context
    output_cols = (
        identifier_cols
        + list(player_rolling_exprs)
        + list(player_avg_exprs)
        + team_metric_names
        + [
            "match_points",
            "team_rolling_goals_conceded", "team_rolling_goal_difference",
            "player_share_of_team_xG", "player_share_of_team_points",
        ]
        + [f"opponent_{metric}" for metric in team_metric_names]
    )

    flag_cols = ("was_home", "exp_stats_available", "def_con_available")
    return enriched_df.select(*output_cols).withColumns(
        {flag: F.col(flag).cast("int") for flag in flag_cols}
    )

In [0]:
def build_team_features(
    gameweek_stats_df,
    rolling_window_size,
    silver_schema,
    training = True
):
    """
    Build one row of match and rolling-form features per team per fixture.

    Player-level expected stats in `gameweek_stats_df` are aggregated to team level,
    left-joined to completed fixtures from `{silver_schema}.fixtures` (rows whose
    home_team_score is not null), enriched with the opponent's aggregates for the
    same fixture, and rolled up over a window of the team's fixtures in a season.

    Parameters:
    - gameweek_stats_df: Spark DataFrame containing fixture_key, team_season_key,
      expected_goals, expected_assists and exp_stats_available.
    - rolling_window_size: Number of fixtures covered by each rolling window.
    - silver_schema: Schema holding the `fixtures` and `teams` tables.
    - training: True -> the window covers the `rolling_window_size` fixtures before
      the current one (current excluded). False -> the window ends on the current fixture.

    Returns:
    - DataFrame with team identifiers, per-match metrics, rolling_* sums, avg_*
      per-game averages (rounded to 3 decimals) and promotion/relegation flags.
      rolling_* and avg_* are null on the first row of each (team, season) partition.

    Notes:
    - Reads tables through the notebook-global `spark` session; it is not a parameter.
    - Because the fixture join is a left join from the aggregated stats, teams/fixtures
      absent from the completed-fixtures set keep null fixture-derived columns.
    """
    fixtures_df = spark.read.table(f"{silver_schema}.fixtures").filter(F.col("home_team_score").isNotNull())

    teams_df = spark.read.table(f"{silver_schema}.teams")

    # Aggregate xG, xA, and exp_stats_available per team per fixture
    team_xg_xa_df = gameweek_stats_df.groupBy(
        "fixture_key", "team_season_key"
        ).agg(
            F.sum("expected_goals").alias("team_expected_goals"),
            F.sum("expected_assists").alias("team_expected_assists"),
            F.max("exp_stats_available").alias("team_exp_stats_available")
        ).withColumn(
            "team_expected_goal_involvements",
            F.col("team_expected_goals") + F.col("team_expected_assists")
        )

    # The same aggregates, re-keyed so they can be joined as "against" metrics
    opponent_xg_xa_df = team_xg_xa_df.select(
            "fixture_key",
            F.col("team_season_key").alias("opponent_team_season_key"),
            F.col("team_expected_goals").alias("expected_goals_against"),
            F.col("team_expected_assists").alias("expected_assists_against"),
            F.col("team_expected_goal_involvements").alias("expected_goal_involvements_against")
        )

    # Transform fixtures into team-level records: one row for the home side, one for the away side
    home_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("home_team_season_key").alias("team_season_key"),
            F.col("away_team_season_key").alias("opponent_team_season_key"),
            F.lit(True).alias("is_home"),
            F.col("home_team_score").alias("goals_for"),
            F.col("away_team_score").alias("goals_against")
        )

    away_df = fixtures_df.select(
            "fixture_key",
            "season_key",
            "gameweek_key",
            F.col("away_team_season_key").alias("team_season_key"),
            F.col("home_team_season_key").alias("opponent_team_season_key"),
            F.lit(False).alias("is_home"),
            F.col("away_team_score").alias("goals_for"),
            F.col("home_team_score").alias("goals_against")
        )

    team_fixtures_df = home_df.unionByName(away_df)

    # fixture_key is a tie-breaker so that two fixtures in the same gameweek get a
    # deterministic order (same ordering as the player window in build_player_features).
    base_window = Window.partitionBy("team_season_key", "season_key").orderBy("gameweek_key", "fixture_key")

    if training:
        rolling_window = base_window.rowsBetween(-rolling_window_size, -1)
    else:
        rolling_window = base_window.rowsBetween(-rolling_window_size + 1, 0)

    first_gw = F.row_number().over(base_window) == 1

    def _null_on_first(expr):
        # Blank the value on the first row of each (team, season) partition.
        return F.when(first_gw, None).otherwise(expr)

    def _rolling_sum(source):
        return _null_on_first(F.sum(source).over(rolling_window))

    def _per_game(rolling_col):
        # Average per counted game, 3 decimals; null (not an error) if no games were counted.
        games = F.col("rolling_games_played")
        return _null_on_first(F.round(F.when(games != 0, F.col(rolling_col) / games), 3))

    # Add match-level metrics
    team_fixtures_df = team_fixtures_df.withColumns({
        "goal_diff": F.col("goals_for") - F.col("goals_against"),
        "match_points": F.when(F.col("goals_for") > F.col("goals_against"), F.lit(3))
                        .when(F.col("goals_for") == F.col("goals_against"), F.lit(1))
                        .otherwise(F.lit(0))
    })

    # Join team xG/xA and opponent xG/xA
    team_fixtures_df = team_xg_xa_df.join(
        team_fixtures_df,
        on=["fixture_key", "team_season_key"],
        how="left"
        ).join(
            opponent_xg_xa_df,
            on=["fixture_key", "opponent_team_season_key"],
            how="left"
        )

    # Pass 1: rolling sums and the rolling game count
    rolling_sum_exprs = {
        "rolling_points": _rolling_sum("match_points"),
        "home_rolling_points": _rolling_sum(F.when(F.col("is_home"), F.col("match_points")).otherwise(0)),
        "away_rolling_points": _rolling_sum(F.when(~F.col("is_home"), F.col("match_points")).otherwise(0)),
        "rolling_team_expected_goals": _rolling_sum("team_expected_goals"),
        "rolling_team_expected_assists": _rolling_sum("team_expected_assists"),
        "rolling_team_expected_goal_involvements": _rolling_sum("team_expected_goal_involvements"),
        "rolling_expected_goals_against": _rolling_sum("expected_goals_against"),
        "rolling_expected_assists_against": _rolling_sum("expected_assists_against"),
        "rolling_expected_goal_involvements_against": _rolling_sum("expected_goal_involvements_against"),
        "rolling_goal_difference": _rolling_sum("goal_diff"),
        "rolling_games_played": _null_on_first(F.count("fixture_key").over(rolling_window)),
    }

    # Pass 2: averages. These read the rolling_* columns, so they are added in a separate
    # withColumns call instead of relying on same-projection alias resolution.
    averaged_metrics = [
        "team_expected_goals", "team_expected_assists", "team_expected_goal_involvements",
        "expected_goals_against", "expected_assists_against", "expected_goal_involvements_against",
        "goal_difference",
    ]
    avg_exprs = {f"avg_{metric}": _per_game(f"rolling_{metric}") for metric in averaged_metrics}

    team_fixtures_df = team_fixtures_df.withColumns(rolling_sum_exprs).withColumns(avg_exprs)

    # Join team metadata
    team_features_df = team_fixtures_df.join(
        teams_df.select("team_season_key", "team_name", "team_name_short", "is_promoted", "is_relegated", "season_key"),
        on=["team_season_key", "season_key"],
        how="left"
    )

    # Select final columns
    team_features_df = team_features_df.select(
        "team_season_key", "team_name", "team_name_short", "season_key", "gameweek_key", "fixture_key",
        "is_home", "goals_for", "goals_against", "goal_diff", "match_points",
        "team_expected_goals", "team_expected_assists", "team_expected_goal_involvements",
        "expected_goals_against", "expected_assists_against", "expected_goal_involvements_against",
        "team_exp_stats_available",
        "rolling_points", "home_rolling_points", "away_rolling_points",
        "rolling_team_expected_goals", "rolling_team_expected_assists", "rolling_team_expected_goal_involvements",
        "rolling_expected_goals_against", "rolling_expected_assists_against", "rolling_expected_goal_involvements_against",
        "rolling_goal_difference", "rolling_games_played",
        "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
        "avg_expected_goals_against", "avg_expected_assists_against", "avg_expected_goal_involvements_against",
        "avg_goal_difference",
        "is_promoted", "is_relegated"
    )

    return team_features_df

In [0]:
def _widget_or_default(widget_name, default_value):
    """Return the named Databricks widget's value, or default_value if it cannot be read."""
    try:
        return dbutils.widgets.get(widget_name)
    except Exception:
        return default_value


# Run parameters: taken from notebook widgets when available, otherwise the defaults below
ENV = _widget_or_default("ENV", "prod")
PROTOCOL = _widget_or_default("PROTOCOL", "INCR")

# Accepted values for each parameter
valid_protocol = {"HIST", "INCR"}
valid_envs = {"dev", "test", "prod"}

# Stop the notebook early on an unknown PROTOCOL
if PROTOCOL not in valid_protocol:
    print(f"Invalid PROTOCOL: {PROTOCOL}. Must be one of {valid_protocol}. Exiting notebook.")
    dbutils.notebook.exit("Invalid PROTOCOL")

# Stop the notebook early on an unknown ENV
if ENV not in valid_envs:
    print(f"Invalid ENV: {ENV}. Must be one of {valid_envs}. Exiting notebook.")
    dbutils.notebook.exit("Invalid ENV")

# Environment-specific schema names
silver_schema = f"fpl_silver_{ENV}"
feature_schema = f"fpl_feature_{ENV}"

# Number of rows used by the rolling feature windows
rolling_window_size = 5

In [0]:
def _read_table(schema_name, table_name):
    """Load `schema_name.table_name` through the notebook's Spark session."""
    return spark.read.table(f"{schema_name}.{table_name}")


# Silver-layer source tables
fixtures_df = _read_table(silver_schema, "fixtures")
gameweek_stats_df = _read_table(silver_schema, "gameweek_stats")
players_df = _read_table(silver_schema, "players")

# Previously materialised feature tables
team_features_df = _read_table(feature_schema, "team_features")
player_features_df = _read_table(feature_schema, "player_features")

In [0]:
# Model input columns. The order is significant: columns are selected and passed to the
# model in exactly this sequence.
feature_cols = [
    "was_home",
    # Player rolling sums
    "rolling_expected_goals", "rolling_expected_assists", "rolling_expected_goal_involvements",
    "rolling_goals_scored", "rolling_assists", "rolling_total_points", "rolling_minutes",
    "rolling_clean_sheets", "rolling_bps", "rolling_ict_index", "rolling_influence",
    "rolling_creativity", "rolling_threat", "rolling_defensive_contribution",
    "rolling_clearances_blocks_interceptions", "rolling_bonus", "rolling_saves",
    "rolling_games_played",
    "rolling_minutes_points", "rolling_assist_points", "rolling_goal_points",
    "rolling_clean_sheet_points", "rolling_defensive_contribution_points",
    "rolling_penalty_miss_points", "rolling_goals_conceded_points",
    "rolling_yellow_card_points", "rolling_red_card_points", "rolling_own_goal_points",
    # Player per-game averages
    "avg_expected_goals", "avg_expected_assists", "avg_expected_goal_involvements",
    "avg_goals_scored", "avg_assists", "avg_total_points", "avg_minutes",
    "avg_clean_sheets", "avg_bps", "avg_ict_index", "avg_influence",
    "avg_creativity", "avg_threat", "avg_defensive_contribution",
    "avg_clearances_blocks_interceptions", "avg_bonus", "avg_saves",
    "avg_minutes_points", "avg_assist_points", "avg_goal_points",
    "avg_clean_sheet_points", "avg_defensive_contribution_points",
    "avg_penalty_miss_points", "avg_goals_conceded_points",
    "avg_yellow_card_points", "avg_red_card_points", "avg_own_goal_points",
    # Player's team
    "rolling_points", "rolling_team_expected_goals", "rolling_expected_goals_against",
    "rolling_goal_difference",
    "avg_team_expected_goals", "avg_team_expected_assists", "avg_team_expected_goal_involvements",
    "avg_expected_goals_against", "avg_expected_assists_against",
    "avg_expected_goal_involvements_against", "avg_goal_difference",
    "team_rolling_goals_conceded", "team_rolling_goal_difference",
    "player_share_of_team_xG", "player_share_of_team_points",
    # Opponent team
    "opponent_rolling_points", "opponent_rolling_team_expected_goals",
    "opponent_rolling_expected_goals_against", "opponent_rolling_goal_difference",
    "opponent_avg_team_expected_goals", "opponent_avg_team_expected_assists",
    "opponent_avg_team_expected_goal_involvements",
    "opponent_avg_expected_goals_against", "opponent_avg_expected_assists_against",
    "opponent_avg_expected_goal_involvements_against", "opponent_avg_goal_difference"
]

In [0]:
# Latest gameweek key present in the team feature table; upcoming fixtures are taken after it.
max_gameweek_row = team_features_df.agg(F.max("gameweek_key").alias("max_gameweek_key")).collect()[0]
max_gameweek = max_gameweek_row["max_gameweek_key"]

# max() over an empty table (or an all-null column) yields None; fail with a clear message
# instead of an opaque TypeError on the arithmetic below.
if max_gameweek is None:
    raise ValueError(
        f"{feature_schema}.team_features has no gameweek_key values; "
        "cannot derive the next gameweek range."
    )

# Bounds of the five gameweek keys that follow the latest one
next_gameweek_range_start_key = max_gameweek + 1
next_gameweek_range_end_key = max_gameweek + 5

if PROTOCOL == 'INCR':

    next_fixtures_df = fixtures_df.filter(
        F.col("gameweek_key").between(next_gameweek_range_start_key, next_gameweek_range_end_key)
    ).select(
        "fixture_key", 
        "home_team_season_key", 
        "away_team_season_key", 
        "gameweek_key", 
        "season_key"
    )

    teams_next_fixtures_df = next_fixtures_df.select(
        "fixture_key", 
        "gameweek_key", 
        "season_key",
        F.col("home_team_season_key").alias("team_season_key"),
        F.col("away_team_season_key").alias("opponent_team_season_key"),
        F.lit(1).alias("was_home")
    ).unionByName(
        next_fixtures_df.select(
            "fixture_key", 
            "gameweek_key", 
            "season_key",
            F.col("away_team_season_key").alias("team_season_key"),
            F.col("home_team_season_key").alias("opponent_team_season_key"),
            F.lit(0).alias("was_home")
        )
    )

    players_next_fixtures_df = teams_next_fixtures_df.alias("tf").join(
        players_df.alias("pf"),
        (F.col("tf.team_season_key") == F.col("pf.team_season_key")) &
        (F.col("pf.effective_to") == max_gameweek),
        "inner"
    ).select(
        F.col("tf.fixture_key"),
        F.col("tf.team_season_key"),
        F.col("tf.opponent_team_season_key"),
        F.col("tf.season_key"),
        F.col("tf.gameweek_key"),
        F.col("pf.player_surrogate_key"),
        F.col("tf.was_home").cast("int").alias("was_home")
    )

    current_season_key = gameweek_stats_df.agg(F.max("season_key").alias("max_season_key")).collect()[0]["max_season_key"]
    stats_df = gameweek_stats_df.filter(F.col("season_key") == current_season_key)

    player_features_df = build_player_features(
        gameweek_stats_df = stats_df,
        rolling_window_size = rolling_window_size,
        silver_schema = silver_schema,
        training = False
    )

    window_spec = Window.partitionBy("player_surrogate_key").orderBy(F.col("gameweek_key").desc(), F.col("fixture_key").desc())
    player_features_max_df = player_features_df.withColumn(
        "rn", 
        F.row_number().over(window_spec)
        ).filter(
            F.col("rn") == 1
        ).drop("rn")

    feature_prefixes = ("rolling", "avg")
    player_feature_cols = [
            c for c in player_features_max_df.columns
            if c.startswith(feature_prefixes)
        ]

    player_current_features_df = player_features_max_df.select(
        "player_surrogate_key",
        "position_key",
        "exp_stats_available",
        "def_con_available",
        *player_feature_cols
        )

    team_features_df = build_team_features(
        gameweek_stats_df = stats_df,
        rolling_window_size = rolling_window_size,
        silver_schema = silver_schema,
        training = False
    )

    window_spec = Window.partitionBy("team_season_key").orderBy(F.col("gameweek_key").desc())
    team_features_max_df = team_features_df.withColumn(
        "rn", 
        F.row_number().over(window_spec)
        ).filter(
            F.col("rn") == 1
        ).drop("rn")

    opponent_team_features_df = team_features_max_df.select(
        F.col("team_season_key").alias("opponent_team_season_key"),
        F.col("rolling_points").alias("opponent_rolling_points"),
        F.col("rolling_team_expected_goals").alias("opponent_rolling_team_expected_goals"),
        F.col("rolling_expected_goals_against").alias("opponent_rolling_expected_goals_against"),
        F.col("rolling_goal_difference").alias("opponent_rolling_goal_difference"),
        F.col("avg_team_expected_goals").alias("opponent_avg_team_expected_goals"),
        F.col("avg_team_expected_assists").alias("opponent_avg_team_expected_assists"),
        F.col("avg_team_expected_goal_involvements").alias("opponent_avg_team_expected_goal_involvements"),
        F.col("avg_expected_goals_against").alias("opponent_avg_expected_goals_against"),
        F.col("avg_expected_assists_against").alias("opponent_avg_expected_assists_against"),
        F.col("avg_expected_goal_involvements_against").alias("opponent_avg_expected_goal_involvements_against"),
        F.col("avg_goal_difference").alias("opponent_avg_goal_difference")
    )

    player_inference_features_df = players_next_fixtures_df.join(
        player_current_features_df,
        on="player_surrogate_key",
        how="inner"
    ).join(
        opponent_team_features_df,
        on="opponent_team_season_key",
        how="left"
    )

    model_name = "FPL_TotalPoints_XGBoost_v2"
    model_alias = "champion"
    model_uri = f"models:/{model_name}@{model_alias}"

    for col in feature_cols:
        if col not in player_inference_features_df.columns:
            player_inference_features_df = player_inference_features_df.withColumn(col, F.lit(-1))
    player_inference_features_df = player_inference_features_df.fillna(-1, subset=feature_cols).withColumn(
        "player_fixture_key",F.concat(F.col("player_surrogate_key"), F.col("fixture_key")).cast("long")
    ).select("player_fixture_key", "player_surrogate_key", "fixture_key", *feature_cols)

    player_inference_pdf = player_inference_features_df.toPandas()

    sklearn_model = mlflow.sklearn.load_model(model_uri)
    player_inference_pdf["predicted_total_points"] = sklearn_model.predict(player_inference_pdf[feature_cols])

    player_inference_spark_df = spark.createDataFrame(player_inference_pdf).select(
        "player_fixture_key",
        "player_surrogate_key",
        "fixture_key",
        "predicted_total_points"
    )

    merge_to_table(
            df = player_inference_spark_df,
            table_name = f"{feature_schema}.player_points_inference",
            merge_condition = "source.player_fixture_key = target.player_fixture_key",
            spark = spark
    )

elif PROTOCOL == 'HIST':
    max_season_key = spark.read.table(f"{silver_schema}.fixtures").agg(F.max("season_key").alias("max_season_key")).collect()[0]["max_season_key"]

    fixtures_hist_df = spark.read.table(f"{silver_schema}.fixtures").filter(F.col("season_key") == max_season_key)
    
    gameweek_keys = [
        row["gameweek_key"]
        for row in (
            fixtures_hist_df.select("gameweek_key")
            .distinct()
            .filter(
                (~F.col("gameweek_key").endswith("01")) &
                (F.col("gameweek_key") <= max_gameweek)
            )
            .orderBy("gameweek_key")
            .collect()
        )
    ]

    model_name = "FPL_TotalPoints_XGBoost_v2"
    model_alias = "champion"
    model_uri = f"models:/{model_name}@{model_alias}"
    sklearn_model = mlflow.sklearn.load_model(model_uri)

    # Replay each selected gameweek: build features from stats available before it,
    # score every player for the fixtures in the following five gameweek keys, and
    # merge the predictions into the inference table.
    for gw in gameweek_keys:

        # Target window: fixtures whose gameweek_key lies in [gw + 1, gw + 5].
        next_gameweek_range_start_key = gw + 1
        next_gameweek_range_end_key = gw + 5

        gw_fixtures_df = fixtures_hist_df.filter(
            F.col("gameweek_key").between(next_gameweek_range_start_key, next_gameweek_range_end_key)
        ).select(
            "fixture_key", "home_team_season_key", "away_team_season_key", "gameweek_key", "season_key"
        )

        # One row per (fixture, team): the home side with was_home = 1 and the away
        # side with was_home = 0, each carrying its opponent's team key.
        teams_gw_fixtures_df = gw_fixtures_df.select(
            "fixture_key", "gameweek_key", "season_key",
            F.col("home_team_season_key").alias("team_season_key"),
            F.col("away_team_season_key").alias("opponent_team_season_key"),
            F.lit(1).alias("was_home")
        ).unionByName(
            gw_fixtures_df.select(
                "fixture_key", "gameweek_key", "season_key",
                F.col("away_team_season_key").alias("team_season_key"),
                F.col("home_team_season_key").alias("opponent_team_season_key"),
                F.lit(0).alias("was_home")
            )
        )

        # Get players for each fixture (as of that gameweek): keep player rows whose
        # [effective_from, effective_to] interval contains gw.
        players_gw_df = players_df.filter(
            (F.col("effective_from") <= gw) &
            (F.col("effective_to") >= gw)
            )

        players_gw_fixtures_df = teams_gw_fixtures_df.alias("tf").join(
            players_gw_df.alias("pf"),
            F.col("tf.team_season_key") == F.col("pf.team_season_key"),
            "inner"
        ).select(
            F.col("tf.fixture_key"),
            F.col("tf.team_season_key"),
            F.col("tf.opponent_team_season_key"),
            F.col("tf.season_key"),
            F.col("tf.gameweek_key"),
            F.col("pf.player_surrogate_key"),
            F.col("tf.was_home").cast("int").alias("was_home")
        )

        # Only stats from the current season and strictly before gw feed the features.
        # NOTE(review): stats stop before gw while the scored fixtures start at gw + 1,
        # so fixtures in the first iterated gameweek are never scored by this loop and
        # gw's own stats are not used - confirm this offset is intended.
        stats_df = gameweek_stats_df.filter(
            (F.col("season_key") == max_season_key) & (F.col("gameweek_key") < gw)
        )

        # build_player_features is defined elsewhere in the notebook.
        player_features_df = build_player_features(
            gameweek_stats_df = stats_df,
            rolling_window_size = rolling_window_size,
            silver_schema = silver_schema,
            training = False
        )

        # Keep each player's row with the highest gameweek_key.
        window_spec = Window.partitionBy("player_surrogate_key").orderBy(F.col("gameweek_key").desc())
        player_features_max_df = player_features_df.withColumn(
            "rn",
            F.row_number().over(window_spec)
        ).filter(
            F.col("rn") == 1
        ).drop("rn")

        # Player feature columns are selected by name prefix.
        feature_prefixes = ("rolling", "avg")
        player_feature_cols = [
            c for c in player_features_max_df.columns
            if c.startswith(feature_prefixes)
        ]

        player_current_features_df = player_features_max_df.select(
            "player_surrogate_key",
            "position_key",
            "exp_stats_available",
            "def_con_available",
            *player_feature_cols
            )

        # build_team_features is defined elsewhere in the notebook.
        team_features_df = build_team_features(
            gameweek_stats_df = stats_df,
            rolling_window_size = rolling_window_size,
            silver_schema = silver_schema
        )

        # Keep each team's row with the highest gameweek_key.
        window_spec = Window.partitionBy("team_season_key").orderBy(F.col("gameweek_key").desc())
        team_features_max_df = team_features_df.withColumn(
            "rn",
            F.row_number().over(window_spec)
        ).filter(
            F.col("rn") == 1
        ).drop("rn")

        # Team features are re-keyed and prefixed so they attach to the opponent side.
        opponent_team_features_df = team_features_max_df.select(
            F.col("team_season_key").alias("opponent_team_season_key"),
            F.col("rolling_points").alias("opponent_rolling_points"),
            F.col("rolling_team_expected_goals").alias("opponent_rolling_team_expected_goals"),
            F.col("rolling_expected_goals_against").alias("opponent_rolling_expected_goals_against"),
            F.col("rolling_goal_difference").alias("opponent_rolling_goal_difference"),
            F.col("avg_team_expected_goals").alias("opponent_avg_team_expected_goals"),
            F.col("avg_team_expected_assists").alias("opponent_avg_team_expected_assists"),
            F.col("avg_team_expected_goal_involvements").alias("opponent_avg_team_expected_goal_involvements"),
            F.col("avg_expected_goals_against").alias("opponent_avg_expected_goals_against"),
            F.col("avg_expected_assists_against").alias("opponent_avg_expected_assists_against"),
            F.col("avg_expected_goal_involvements_against").alias("opponent_avg_expected_goal_involvements_against"),
            F.col("avg_goal_difference").alias("opponent_avg_goal_difference")
        )

        # Players without a feature row are dropped (inner join); a missing opponent
        # row leaves nulls (left join) that are filled with -1 below.
        player_inference_features_df = players_gw_fixtures_df.join(
            player_current_features_df,
            on="player_surrogate_key",
            how="inner"
        ).join(
            opponent_team_features_df,
            on="opponent_team_season_key",
            how="left"
        )

        # Every expected feature must exist as a column; absent ones become a constant -1.
        for feature_col in feature_cols:
            if feature_col not in player_inference_features_df.columns:
                player_inference_features_df = player_inference_features_df.withColumn(feature_col, F.lit(-1))

        # NOTE(review): the key is the two keys concatenated as text and cast to long;
        # the cast yields NULL if that text is not a valid long - confirm key ranges.
        player_inference_features_df = player_inference_features_df.fillna(
            -1,
            subset=feature_cols
        ).withColumn(
            "player_fixture_key",F.concat(F.col("player_surrogate_key"), F.col("fixture_key")).cast("long")
        ).select(
            "player_fixture_key",
            "player_surrogate_key",
            "fixture_key",
            *feature_cols
        )

        player_inference_pdf = player_inference_features_df.toPandas()

        # Nothing to score for this gameweek (e.g. no fixtures exist in gw + 1 .. gw + 5).
        # Skip it: predicting on a zero-row frame and asking Spark to infer a schema
        # from an empty pandas DataFrame can fail and would abort the whole backfill.
        if player_inference_pdf.empty:
            print("skipping (no player fixtures to score): ", gw)
            continue

        player_inference_pdf["predicted_total_points"] = sklearn_model.predict(player_inference_pdf[feature_cols])

        player_inference_spark_df = spark.createDataFrame(player_inference_pdf).select(
            "player_fixture_key",
            "player_surrogate_key",
            "fixture_key",
            "predicted_total_points"
        )

        print("merging: ", gw)

        # Hand the predictions to merge_to_table, matching rows on player_fixture_key.
        merge_to_table(
            df = player_inference_spark_df,
            table_name = f"{feature_schema}.player_points_inference",
            merge_condition = "source.player_fixture_key = target.player_fixture_key",
            spark = spark
        )

merging:  20252602
merging:  20252603
merging:  20252604
merging:  20252605
merging:  20252606
merging:  20252607
merging:  20252608
merging:  20252609
merging:  20252610
merging:  20252611
merging:  20252612
merging:  20252613
merging:  20252614
merging:  20252615
merging:  20252616
merging:  20252617
merging:  20252618
merging:  20252619
merging:  20252620
merging:  20252621
