In [0]:

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler,
    StandardScaler
)

storage_account = "playersanalytics60300294"
container       = "lakehouse"

base_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

spark.conf.set(
"fs.azure.account.key.playersanalytics60300294.dfs.core.windows.net",
"bKSw/Bor6nERyhVn7ZidKwHfMf1jASd77r5MRsWT4t+b++uoSOEfT9tEh33BUFbaCk2rFCWR7Hi6+ASt05hgSA=="
)

print("Config loaded.")


Config loaded.


In [0]:
df = spark.table("football_lakehouse.player_season_value_features")

print("Loaded Gold Features:")
df.printSchema()

df.show(5, truncate=False)


Loaded Gold Features:
root
 |-- player_id: integer (nullable = true)
 |-- season: integer (nullable = true)
 |-- player_name: string (nullable = true)
 |-- club_id: integer (nullable = true)
 |-- matches_played: long (nullable = true)
 |-- total_minutes: long (nullable = true)
 |-- total_goals: long (nullable = true)
 |-- total_assists: long (nullable = true)
 |-- yellow_cards: long (nullable = true)
 |-- red_cards: long (nullable = true)
 |-- goals_per90: double (nullable = true)
 |-- assists_per90: double (nullable = true)
 |-- cards_per90: double (nullable = true)
 |-- name: string (nullable = true)
 |-- position: string (nullable = true)
 |-- sub_position: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- current_club_id: string (nullable = true)
 |-- age_at_season_start: integer (nullable = true)
 |-- club_name: string (nullable = true)
 |-- season_market_value_eur: long (nullable = true)

+---------+------+------------------+-------+--------------+---------

In [0]:
df = (
    df.fillna({"position": "Unknown"})
      .fillna({"sub_position": "Unknown"})
      .fillna(0)
)

print("Missing values handled.")


Missing values handled.


In [0]:
df = (
    df
    .withColumn("goal_contribution_per90",
                F.col("goals_per90") + F.col("assists_per90"))
    .withColumn("goals_per_match",
                F.col("total_goals") / F.col("matches_played"))
    .withColumn("assists_per_match",
                F.col("total_assists") / F.col("matches_played"))
    .withColumn("cards_per_match",
                (F.col("yellow_cards") + F.col("red_cards")) / F.col("matches_played"))
    .withColumn("minutes_per_match",
                F.col("total_minutes") / F.col("matches_played"))
    .withColumn("discipline_index",
                (F.col("yellow_cards") + 2 * F.col("red_cards")) / F.col("matches_played"))
)

print("Basic performance features added.")


Basic performance features added.


In [0]:
df = (
    df
    .withColumn("age_squared", F.col("age_at_season_start")**2)
    .withColumn("is_under_21", F.col("age_at_season_start") < 21)
    .withColumn("is_over_30", F.col("age_at_season_start") > 30)
)

print("Age features added.")


Age features added.


In [0]:
df = (
    df
    .withColumn("market_value_millions",
                F.col("season_market_value_eur") / 1e6)
    .withColumn("log_market_value",
                F.log(F.col("season_market_value_eur") + 1))
)

print("Market value transformations complete.")


Market value transformations complete.


In [0]:
# Index categorical columns
pos_indexer = StringIndexer(
    inputCol="position",
    outputCol="position_index",
    handleInvalid="keep"
).fit(df)

df = pos_indexer.transform(df)

sub_indexer = StringIndexer(
    inputCol="sub_position",
    outputCol="sub_position_index",
    handleInvalid="keep"
).fit(df)

df = sub_indexer.transform(df)

# One-hot encode
encoder = OneHotEncoder(
    inputCols=["position_index", "sub_position_index"],
    outputCols=["position_ohe", "sub_position_ohe"]
)

df = encoder.fit(df).transform(df)

print("Categorical encoding complete.")


Categorical encoding complete.


In [0]:
numeric_cols = [
    "goals_per90", "assists_per90", "cards_per90",
    "goal_contribution_per90",
    "goals_per_match", "assists_per_match", "cards_per_match",
    "minutes_per_match", "discipline_index",
    "age_at_season_start", "age_squared",
    "market_value_millions"
]

assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="numeric_vector"
)

df = assembler.transform(df)

scaler = StandardScaler(
    inputCol="numeric_vector",
    outputCol="numeric_scaled"
).fit(df)

df = scaler.transform(df)

print("Numeric scaling complete.")


Numeric scaling complete.


In [0]:
df.select(
    "player_id", "player_name", "season",
    "goal_contribution_per90",
    "goals_per_match", "assists_per_match",
    "market_value_millions", "log_market_value",
    "position_ohe"
).show(10, truncate=False)


+---------+-------------------+------+-----------------------+--------------------+--------------------+---------------------+------------------+-------------+
|player_id|player_name        |season|goal_contribution_per90|goals_per_match     |assists_per_match   |market_value_millions|log_market_value  |position_ohe |
+---------+-------------------+------+-----------------------+--------------------+--------------------+---------------------+------------------+-------------+
|92567    |Andriy Bogdanov    |2012  |0.33682634730538924    |0.14285714285714285 |0.09523809523809523 |2.0                  |14.508658238524095|(5,[1],[1.0])|
|89222    |Yaroslav Rakitskyi |2012  |0.14691478942213515    |0.02857142857142857 |0.11428571428571428 |12.0                 |16.300417291085605|(5,[0],[1.0])|
|3182     |Ashley Cole        |2012  |0.1279620853080569     |0.020833333333333332|0.10416666666666667 |8.0                  |15.894952224644102|(5,[0],[1.0])|
|45596    |Márcio Mossoró     |2012  |0.

In [0]:
output_path = f"{base_path}/gold/player_season_features/"

# Allow schema overwrite
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

(
    df.write
      .format("delta")
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .save(output_path)
)

df.write \
  .format("delta") \
  .mode("overwrite") \
  .option("overwriteSchema", "true") \
  .saveAsTable("football_lakehouse.player_season_features")

print("FINAL FEATURE TABLE SAVED SUCCESSFULLY.")


FINAL FEATURE TABLE SAVED SUCCESSFULLY.


In [0]:
for pos in df.select("position").distinct().collect():
    p = pos["position"]
    print("Position:", p)
    df.filter(F.col("position") == p).select(
        corr("goal_contribution_per90", "season_market_value_eur")
    ).show()


Position: Defender
+------------------------------------------------------+
|corr(goal_contribution_per90, season_market_value_eur)|
+------------------------------------------------------+
|                                   0.02200527322560461|
+------------------------------------------------------+

Position: Midfield
+------------------------------------------------------+
|corr(goal_contribution_per90, season_market_value_eur)|
+------------------------------------------------------+
|                                   0.09278157504134708|
+------------------------------------------------------+

Position: Missing
+------------------------------------------------------+
|corr(goal_contribution_per90, season_market_value_eur)|
+------------------------------------------------------+
|                                   0.30192051986941015|
+------------------------------------------------------+

Position: Goalkeeper
+------------------------------------------------------+
|corr(go