In [1]:
# =============================================================================
# SETUP
# =============================================================================
import sys
from pathlib import Path
import polars as pl
import logging
import numpy as np

# Add project root to Python path
sys.path.insert(0, '/Users/architmanek/Desktop/DataEngineering/football_pipeline')

# Now imports work without changing working directory
from utils.config import ABS_PATH, BRONZE_DIR_EVENTS

file_path = ABS_PATH / BRONZE_DIR_EVENTS / "events_15946.parquet"
df = pl.read_parquet(file_path)

In [2]:
# =============================================================================
# TEST 1: COLUMN FLATTENING
# =============================================================================
def test_flatten_columns(df):
    df.columns = [col.replace('.', '_') for col in df.columns]
    return df

In [3]:
# Find columns with periods
columns_with_periods = [col for col in df.columns if '.' in col]
print(f"Columns with periods: {columns_with_periods}")
print(f"Number of columns with periods: {len(columns_with_periods)}")

# Test flatten_columns
df = test_flatten_columns(df)

columns_with_periods = [col for col in df.columns if '.' in col]
print(f"Columns with periods: {columns_with_periods}")
print(f"Number of columns with periods: {len(columns_with_periods)}")


Columns with periods: ['type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'pass.switch', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'duel.type.id', 'duel.type.name', 'pass.aerial_won', 'interception.outcome.id', 'interception.outcome.name', 'ball_recovery.recovery_failure', 'pass.assisted_shot_id', 'pass.shot_assist', 'shot.statsbomb_xg', 'shot.end_location', 'shot.key_pass_id', 'shot.outcome.id', 'shot.outcome.name', 'shot.first_time', 'shot.technique.id', 'shot.technique.name', 'shot.body_part.id', 'shot.body_part.name', 'sho

In [8]:
# =============================================================================
# TEST 2: ENRICH LOCATIONS
# =============================================================================
def enrich_locations(df):
    df = df.with_columns([
        pl.col("location").cast(pl.List(pl.Float64)).alias("location"),
        pl.col("pass_end_location").cast(pl.List(pl.Float64)).alias("pass_end_location"),
    ])
    df = df.with_columns([
        (pl.col("location").arr.get(0) / 120).alias("x"),
        (pl.col("location").arr.get(1) / 80).alias("y"),
        pl.when(pl.col("type_name") == "Pass")
          .then(pl.col("pass_end_location").arr.get(0) / 120)
          .otherwise(None)
          .alias("end_x"),
        pl.when(pl.col("type_name") == "Pass")
          .then(pl.col("pass_end_location").arr.get(1) / 80)
          .otherwise(None)
          .alias("end_y"),
    ])
    return df

In [9]:
# Test enrich_locations (which tests normalize functions internally)
# print("Before enriching:")
# print(f"Has x column: {'x' in df.columns}")
# print(f"Has y column: {'y' in df.columns}")

df_enriched = enrich_locations(df)
# df['location'].dtype

# print("\nAfter enriching:")
# print(f"Events with x,y coordinates: {df_enriched[['x', 'y']].notnull().all(axis=1).sum()}")
# print(f"Pass events with end_x,end_y: {df_enriched[['end_x', 'end_y']].notnull().all(axis=1).sum()}")

# # Sample some results
# sample_coords = df_enriched[df_enriched.x.notnull()][['location', 'x', 'y', 'end_x', 'end_y']].head(3)
# print(f"\nSample normalized coordinates:")
# print(sample_coords)

print(df_enriched[["location", "x", "y"]].head())


SchemaError: invalid series dtype: expected `Array`, got `list[f64]` for series with name `location`

In [None]:
# =============================================================================
# TEST 4: POSSSESSION STATS
# =============================================================================

def add_possession_stats(df):
    # Calculate event count per possession
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%H:%M:%S.%f')
    event_count = df.groupby('possession').size().rename('possession_event_count')

    # Number of passes in each possession
    possession_pass_count = df[df['type_name'] == 'Pass'].groupby('possession').size().rename('possession_pass_count')

    # Number of unique players in each possession
    possession_player_count = df.groupby('possession')['player_id'].nunique().rename('possession_player_count')

    # Calculate possession duration
    possession_duration = df.groupby('possession')['timestamp'].max() - df.groupby('possession')['timestamp'].min()
    possession_duration = possession_duration.dt.total_seconds().rename('possession_duration')

    # Total xG in the possession
    total_xg = df[df['type_name'] == "Shot"].groupby('possession')['shot_statsbomb_xg'].sum().rename('total_xG')

    # Merge it back into the main DataFrame as a new column
    df = df.merge(event_count, left_on='possession', right_index=True, how='left')
    df = df.merge(possession_pass_count, left_on='possession', right_index=True, how='left')
    df = df.merge(possession_player_count, left_on='possession', right_index=True, how='left')
    df = df.merge(possession_duration, left_on='possession', right_index=True, how='left')
    df = df.merge(total_xg, left_on='possession', right_index=True, how='left')

    return df

In [None]:
# df = add_possession_stats(df_enriched)
print(df_enriched.select(["location", "x_raw", "y_raw"]).head())# print([col for col in df.columns if "duration" in col])
# print(f"\nDataFrame after adding possession stats:\n{df.head()}")

AttributeError: 'DataFrame' object has no attribute 'select'