In [35]:
# =============================================================================
# SETUP
# =============================================================================
import logging
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Directory that contains the project's `utils` package. It can be overridden
# with the FOOTBALL_PIPELINE_ROOT environment variable so the notebook is not
# tied to one machine; the previously hard-coded path stays as the default.
PROJECT_ROOT = Path(
    os.environ.get(
        "FOOTBALL_PIPELINE_ROOT",
        '/Users/architmanek/Desktop/DataEngineering/football_pipeline',
    )
)

# Keep the project root at the front of the import path, but do not stack a
# duplicate entry every time this cell is re-run.
_project_root_entry = str(PROJECT_ROOT)
if sys.path[:1] != [_project_root_entry]:
    sys.path.insert(0, _project_root_entry)

# Project-local import: only resolvable after the sys.path adjustment above
from utils.config import ABS_PATH, BRONZE_DIR_EVENTS

# Read the events parquet file into the working DataFrame
file_path = ABS_PATH / BRONZE_DIR_EVENTS / "events_15946.parquet"
df = pd.read_parquet(file_path)

In [36]:
# =============================================================================
# TEST 1: COLUMN FLATTENING
# =============================================================================
def test_flatten_columns(df):
    df.columns = [col.replace('.', '_') for col in df.columns]
    return df

In [37]:
def _report_columns_with_periods(frame):
    """Print the column names of ``frame`` that contain '.', plus their count.

    Returns the list of those column names.
    """
    dotted = [name for name in frame.columns if '.' in name]
    print(f"Columns with periods: {dotted}")
    print(f"Number of columns with periods: {len(dotted)}")
    return dotted


# State of the column names before flattening
columns_with_periods = _report_columns_with_periods(df)

# Test flatten_columns
df = test_flatten_columns(df)

# Same report again, now on the flattened column names
columns_with_periods = _report_columns_with_periods(df)


Columns with periods: ['type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'pass.switch', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'duel.type.id', 'duel.type.name', 'pass.aerial_won', 'interception.outcome.id', 'interception.outcome.name', 'ball_recovery.recovery_failure', 'pass.assisted_shot_id', 'pass.shot_assist', 'shot.statsbomb_xg', 'shot.end_location', 'shot.key_pass_id', 'shot.outcome.id', 'shot.outcome.name', 'shot.first_time', 'shot.technique.id', 'shot.technique.name', 'shot.body_part.id', 'shot.body_part.name', 'sho

In [38]:
# =============================================================================
# TEST 2: ENRICH LOCATIONS
# =============================================================================
def _normalize_xy(location, x_max, y_max, label):
    """Shared implementation behind ``normalize_location`` and ``normalize_end_location``.

    Args:
        location: Candidate ``[x, y]`` pair (list, tuple or numpy array), or None.
        x_max: Divisor applied to the x value.
        y_max: Divisor applied to the y value.
        label (str): Wording used in the debug log messages
            ("location" or "end location").

    Returns:
        list: ``[x / x_max, y / y_max]``, or ``[None, None]`` when the input is
        None, empty, contains a missing value, is not a two-element
        list/tuple/array, or raises while being processed.
    """
    logging.debug(f"Normalizing {label}: {location}")
    try:
        if location is None:
            logging.debug(f"{label.capitalize()} is None: {location}")
            return [None, None]

        is_sequence = isinstance(location, (list, tuple, np.ndarray))
        if is_sequence:
            # Convert lists and tuples to an object array so they receive the
            # same element-wise missing-value check as numpy arrays; without
            # it a pair such as [nan, 40.0] would be "normalized" to
            # [nan, 0.5] instead of being rejected.
            if isinstance(location, np.ndarray):
                values = location
            else:
                values = np.asarray(location, dtype=object)
            if values.size == 0 or np.any(pd.isna(values)):
                logging.debug(f"{label.capitalize()} array is empty or contains NaN: {location}")
                return [None, None]

        if is_sequence and len(location) == 2:
            x, y = location
            result = [x / x_max, y / y_max]
            logging.debug(f"Normalized {label}: {result}")
            return result

        logging.debug(f"Invalid {label} format: {location}, type: {type(location)}")
        return [None, None]
    except Exception as e:
        # Deliberate best-effort: one malformed cell must not abort the
        # enrichment of a whole frame, so the failure is logged and the
        # "no coordinates" sentinel is returned.
        logging.debug(f"Error normalizing {label} {location}: {e}")
        return [None, None]


def normalize_location(location, x_max=120, y_max=80):
    """Scale an ``[x, y]`` start location by ``x_max`` and ``y_max``.

    Args:
        location: Two-element list, tuple or numpy array, or None.
        x_max: Divisor for x (default 120 -- presumably the pitch length in
            the source coordinate system; confirm against the data spec).
        y_max: Divisor for y (default 80 -- presumably the pitch width; confirm).

    Returns:
        list: ``[x / x_max, y / y_max]``, or ``[None, None]`` for a missing,
        empty, NaN-containing or otherwise invalid input.
    """
    return _normalize_xy(location, x_max, y_max, "location")


def normalize_end_location(location, x_max=120, y_max=80):
    """Scale an ``[x, y]`` end location by ``x_max`` and ``y_max``.

    Behaves exactly like ``normalize_location``; only the wording of the debug
    log messages differs ("end location").

    Args:
        location: Two-element list, tuple or numpy array, or None.
        x_max: Divisor for x (default 120).
        y_max: Divisor for y (default 80).

    Returns:
        list: ``[x / x_max, y / y_max]``, or ``[None, None]`` for a missing,
        empty, NaN-containing or otherwise invalid input.
    """
    return _normalize_xy(location, x_max, y_max, "end location")

def enrich_locations(df):
    """Add normalized start and pass-end coordinates to an events frame.

    Four columns are written onto ``df`` itself, all initialised to None:

    - ``x`` / ``y``: from the ``location`` column, attempted for every row.
    - ``end_x`` / ``end_y``: from ``pass_end_location``, attempted only for rows
      whose ``type_name`` is "Pass" and whose ``pass_end_location`` is not null.

    A cell keeps its None when the source value is None, empty, or the
    normalizer returns ``[None, None]``.

    Args:
        df (pd.DataFrame): Events with ``location``, ``type_name`` and
            ``pass_end_location`` columns.

    Returns:
        pd.DataFrame: The same ``df`` object with the four coordinate columns.
    """
    def _has_content(value):
        # None and empty list/tuple/array cells carry no coordinates
        if value is None:
            return False
        return not (isinstance(value, (list, tuple, np.ndarray)) and len(value) == 0)

    def _fill(row_labels, source_column, target_columns, normalizer):
        # Normalize one source column row by row into its two target columns
        first_target, second_target = target_columns
        for row_label in row_labels:
            raw_value = df.at[row_label, source_column]
            if not _has_content(raw_value):
                continue
            scaled = normalizer(raw_value)
            if scaled != [None, None]:
                df.at[row_label, first_target] = scaled[0]
                df.at[row_label, second_target] = scaled[1]

    # Start every coordinate column out as "unknown"
    for column in ("x", "y", "end_x", "end_y"):
        df[column] = None

    # Start locations: every row is a candidate
    _fill(df.index, "location", ("x", "y"), normalize_location)

    # End locations: only pass rows that actually carry an end location
    pass_rows = df[(df["type_name"] == "Pass") & df["pass_end_location"].notnull()].index
    _fill(pass_rows, "pass_end_location", ("end_x", "end_y"), normalize_end_location)

    return df

In [39]:
# Exercise enrich_locations (and, through it, both normalize helpers)
print("Before enriching:")
for axis_name in ("x", "y"):
    print(f"Has {axis_name} column: {axis_name in df.columns}")

# Work on a copy so the flattened frame itself is left without coordinate columns
df_enriched = enrich_locations(df.copy())

# Rows where both members of a coordinate pair are present
has_start_xy = df_enriched[['x', 'y']].notnull().all(axis=1)
has_end_xy = df_enriched[['end_x', 'end_y']].notnull().all(axis=1)

print("\nAfter enriching:")
print(f"Events with x,y coordinates: {has_start_xy.sum()}")
print(f"Pass events with end_x,end_y: {has_end_xy.sum()}")

# First few rows that received a normalized x value
sample_coords = df_enriched.loc[
    df_enriched['x'].notnull(),
    ['location', 'x', 'y', 'end_x', 'end_y'],
].head(3)
print("\nSample normalized coordinates:")
print(sample_coords)


Before enriching:
Has x column: False
Has y column: False

After enriching:
Events with x,y coordinates: 3741
Pass events with end_x,end_y: 1163

Sample normalized coordinates:
       location         x        y     end_x end_y
4  [61.0, 40.1]  0.508333  0.50125  0.281667  0.35
5  [33.8, 28.0]  0.281667     0.35      None  None
6  [33.8, 28.0]  0.281667     0.35      None  None


In [None]:
# =============================================================================
# TEST 4: POSSESSION STATS
# =============================================================================

def add_possession_stats(df):
    """Attach per-possession summary statistics to every event row.

    The input frame is left untouched; a new frame is returned in which
    ``timestamp`` has been parsed with the ``%H:%M:%S.%f`` format and four
    columns have been added, each constant within a ``possession`` value:

    - ``possession_event_count``: number of rows in the possession.
    - ``possession_pass_count``: number of rows whose ``type_name`` is "Pass"
      (0 when the possession has none).
    - ``possession_player_count``: number of distinct ``player_id`` values.
    - ``possession_duration``: seconds between the earliest and the latest
      ``timestamp`` in the possession.

    As a side effect, the value counts of ``possession_duration`` are printed.

    Args:
        df (pd.DataFrame): Events with ``possession``, ``type_name``,
            ``player_id`` and ``timestamp`` columns.

    Returns:
        pd.DataFrame: Copy of ``df`` (same index) with the parsed timestamp
        and the four possession columns.
    """
    # Parse on a copy: assigning into the argument would silently change the
    # caller's frame as well.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp'], format='%H:%M:%S.%f'))

    # Group once and derive every statistic from the same grouping
    by_possession = df.groupby('possession')
    possession_timestamps = by_possession['timestamp']

    possession_stats = pd.DataFrame({
        'possession_event_count': by_possession.size(),
        # Summing a boolean mask gives 0 for possessions without any pass,
        # where counting only the pass rows would leave NaN after the merge.
        'possession_pass_count': df['type_name'].eq('Pass').groupby(df['possession']).sum(),
        'possession_player_count': by_possession['player_id'].nunique(),
        'possession_duration': (
            possession_timestamps.max() - possession_timestamps.min()
        ).dt.total_seconds(),
    })

    # Broadcast the per-possession values back onto the individual events
    df = df.merge(possession_stats, left_on='possession', right_index=True, how='left')

    # Inspection output relied on by the calling cell
    print(df['possession_duration'].value_counts())

    return df

In [47]:
# Add the possession-level columns to the location-enriched frame. The call also
# prints the value counts of possession_duration. Note that `df` is rebound here:
# it previously held the flattened frame and now holds the enriched result.
df = add_possession_stats(df_enriched)
# print(f"\nDataFrame after adding possession stats:\n{df.head()}")

possession_duration
0 days 00:01:53.864000    137
0 days 00:01:51.634000    123
0 days 00:01:29.255000    107
0 days 00:01:18.274000    105
0 days 00:01:31.113000    104
                         ... 
0 days 00:00:03.359000      2
0 days 00:00:22.994000      2
0 days 00:00:02.025000      2
0 days 00:00:03.287000      2
0 days 00:00:06.604000      2
Name: count, Length: 135, dtype: int64
