In [21]:
# =============================================================================
# SETUP
# =============================================================================
import sys
from pathlib import Path
import polars as pl
import logging
import numpy as np

# Add project root to Python path
sys.path.insert(0, '/Users/architmanek/Desktop/DataEngineering/football_pipeline')

# Now imports work without changing working directory
from utils.constants import ABS_PATH, BRONZE_DIR_EVENTS

file_path = ABS_PATH / BRONZE_DIR_EVENTS / "events_15946.parquet"
df = pl.read_parquet(file_path)
df = df.with_columns(
                pl.col("timestamp").str.strptime(pl.Datetime, "%H:%M:%S.%f", strict=False)
            )

  pl.col("timestamp").str.strptime(pl.Datetime, "%H:%M:%S.%f", strict=False)


In [22]:
# =============================================================================
# TEST 1: COLUMN FLATTENING
# =============================================================================
def flatten_columns(df):
    while True:
        # Find struct columns
        struct_cols = [col for col in df.columns if df.schema[col] == pl.Struct]
        # Find list-of-struct columns
        list_struct_cols = [
            col for col in df.columns
            if isinstance(df.schema[col], pl.List) and getattr(df.schema[col], 'inner', None) == pl.Struct
        ]
        if not struct_cols and not list_struct_cols:
            break

        # Flatten struct columns
        for col in struct_cols:
            fields = df.schema[col].fields
            df = df.with_columns([
                pl.col(col).struct.field(field.name).alias(f"{col}_{field.name}") for field in fields
            ]).drop(col)

        # Explode and flatten list-of-struct columns
        for col in list_struct_cols:
            df = df.explode(col)
            # After exploding, the column is now a struct, so flatten it
            fields = df.schema[col].fields
            df = df.with_columns([
                pl.col(col).struct.field(field.name).alias(f"{col}_{field.name}") for field in fields
            ]).drop(col)
        df = df.rename({col: col.replace('.', '_') for col in df.columns})
    return df

In [23]:
# Find columns with periods
columns_with_periods = [col for col in df.columns if '.' in col]
print(f"Columns with periods: {columns_with_periods}")
print(f"Number of columns with periods: {len(columns_with_periods)}")

# Test flatten_columns
df = flatten_columns(df)

columns_with_periods = [col for col in df.columns if '.' in col]
print(f"Columns with periods: {columns_with_periods}")
print(f"Number of columns with periods: {len(columns_with_periods)}")


Columns with periods: ['type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'pass.switch', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'duel.type.id', 'duel.type.name', 'pass.aerial_won', 'interception.outcome.id', 'interception.outcome.name', 'ball_recovery.recovery_failure', 'pass.assisted_shot_id', 'pass.shot_assist', 'shot.statsbomb_xg', 'shot.end_location', 'shot.key_pass_id', 'shot.outcome.id', 'shot.outcome.name', 'shot.first_time', 'shot.technique.id', 'shot.technique.name', 'shot.body_part.id', 'shot.body_part.name', 'sho

In [24]:
# =============================================================================
# TEST 2: ENRICH LOCATIONS
# =============================================================================
def enrich_locations(df):
    df = df.with_columns([
        pl.col("location").cast(pl.Array(pl.Float64, 2)).alias("location"),
        pl.col("pass_end_location").cast(pl.Array(pl.Float64, 2)).alias("pass_end_location"),
    ])
    df = df.with_columns([
        (pl.col("location").arr.get(0) / 120).alias("x"),
        (pl.col("location").arr.get(1) / 80).alias("y"),
        pl.when(pl.col("type_name") == "Pass")
          .then(pl.col("pass_end_location").arr.get(0) / 120)
          .otherwise(None)
          .alias("end_x"),
        pl.when(pl.col("type_name") == "Pass")
          .then(pl.col("pass_end_location").arr.get(1) / 80)
          .otherwise(None)
          .alias("end_y"),
    ])
    return df

In [25]:
# Test enrich_locations (which tests normalize functions internally)
# print("Before enriching:")
# print(f"Has x column: {'x' in df.columns}")
# print(f"Has y column: {'y' in df.columns}")

df_enriched = enrich_locations(df)
# df['location'].dtype

# print("\nAfter enriching:")
# print(f"Events with x,y coordinates: {df_enriched[['x', 'y']].notnull().all(axis=1).sum()}")
# print(f"Pass events with end_x,end_y: {df_enriched[['end_x', 'end_y']].notnull().all(axis=1).sum()}")

# # Sample some results
# sample_coords = df_enriched[df_enriched.x.notnull()][['location', 'x', 'y', 'end_x', 'end_y']].head(3)
# print(f"\nSample normalized coordinates:")
# print(sample_coords)



df_enriched.head(5)


id,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,related_events,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,carry_end_location,pass_switch,pass_outcome_id,…,shot_aerial_won,miscontrol_aerial_won,dribble_overrun,pass_miscommunication,block_offensive,bad_behaviour_card_id,bad_behaviour_card_name,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,pass_cut_back,shot_one_on_one,foul_committed_advantage,foul_won_advantage,clearance_aerial_won,pass_deflected,pass_no_touch,foul_committed_type_id,foul_committed_type_name,pass_straight,pass_goal_assist,tactics_lineup_jersey_number,shot_freeze_frame_location,shot_freeze_frame_teammate,tactics_lineup_player_id,tactics_lineup_player_name,tactics_lineup_position_id,tactics_lineup_position_name,shot_freeze_frame_player_id,shot_freeze_frame_player_name,shot_freeze_frame_position_id,shot_freeze_frame_position_name,x,y,end_x,end_y
str,i64,i64,datetime[μs],i64,i64,i64,f64,i64,str,i64,str,i64,str,i64,str,f64,list[str],"array[f64, 2]",f64,str,f64,str,f64,str,f64,f64,f64,str,"array[f64, 2]",f64,str,f64,str,list[f64],bool,f64,…,bool,bool,bool,bool,bool,f64,str,f64,str,f64,str,bool,bool,bool,bool,bool,bool,bool,f64,str,bool,bool,i64,list[f64],bool,i64,str,i64,str,i64,str,i64,str,f64,f64,f64,f64
"""9f6e2ecf-6685-45df-a62e-c2db30…",1,1,,0,0,1,0.0,35,"""Starting XI""",217,"""Barcelona""",1,"""Regular Play""",217,"""Barcelona""",442.0,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,1,,,20055,"""Marc-André ter Stegen""",1,"""Goalkeeper""",,,,,,,,
"""9f6e2ecf-6685-45df-a62e-c2db30…",1,1,,0,0,1,0.0,35,"""Starting XI""",217,"""Barcelona""",1,"""Regular Play""",217,"""Barcelona""",442.0,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,2,,,6374,"""Nélson Cabral Semedo""",2,"""Right Back""",,,,,,,,
"""9f6e2ecf-6685-45df-a62e-c2db30…",1,1,,0,0,1,0.0,35,"""Starting XI""",217,"""Barcelona""",1,"""Regular Play""",217,"""Barcelona""",442.0,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,3,,,5213,"""Gerard Piqué Bernabéu""",3,"""Right Center Back""",,,,,,,,
"""9f6e2ecf-6685-45df-a62e-c2db30…",1,1,,0,0,1,0.0,35,"""Starting XI""",217,"""Barcelona""",1,"""Regular Play""",217,"""Barcelona""",442.0,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,23,,,5492,"""Samuel Yves Umtiti""",5,"""Left Center Back""",,,,,,,,
"""9f6e2ecf-6685-45df-a62e-c2db30…",1,1,,0,0,1,0.0,35,"""Starting XI""",217,"""Barcelona""",1,"""Regular Play""",217,"""Barcelona""",442.0,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,18,,,5211,"""Jordi Alba Ramos""",6,"""Left Back""",,,,,,,,
