In [28]:
import pandas as pd
import numpy as np
# Load the raw substitutions table and work on an independent copy of it.
substitutions = pd.read_csv("../../data-csv/substitutions.csv").copy()

In [29]:
# ------------------------------------------------------
# 2. Filter tournaments
# ------------------------------------------------------
# Tournament ids kept for the analysis; rows belonging to any other
# tournament are discarded and the index is renumbered from 0.
valid_tournaments = [
    "WC-2002",
    "WC-2006",
    "WC-2010",
    "WC-2014",
    "WC-2018",
    "WC-2022",
]
substitutions = (
    substitutions
    .loc[substitutions["tournament_id"].isin(valid_tournaments)]
    .reset_index(drop=True)
)

In [30]:
# ------------------------------------------------------
# 3. Column cleanup
# ------------------------------------------------------
# Parse match_date as datetime; values that cannot be parsed become NaT
substitutions["match_date"] = substitutions["match_date"].pipe(
    pd.to_datetime, errors="coerce"
)

# Coerce both minute columns to numbers; unparseable entries become NaN
substitutions["minute_regulation"] = substitutions["minute_regulation"].pipe(
    pd.to_numeric, errors="coerce"
)
substitutions["minute_stoppage"] = substitutions["minute_stoppage"].pipe(
    pd.to_numeric, errors="coerce"
)

# Total minute = regulation + stoppage, where a missing component counts as 0
substitutions["minute_total"] = substitutions["minute_regulation"].fillna(0).add(
    substitutions["minute_stoppage"].fillna(0)
)

In [31]:
# ------------------------------------------------------
# 4. Team name consistency
# ------------------------------------------------------
# Lower-cased spellings that resolve to one canonical team name. The canonical
# names are listed too, so that an already-cleaned value maps back to itself.
_TEAM_NAME_ALIASES = {
    "iran": "IR Iran",
    "ir iran": "IR Iran",
    "south korea": "Korea Republic",
    "korea republic": "Korea Republic",
}


def clean_team_name(x):
    """Return a normalised team name.

    Non-string values (e.g. NaN or None) are returned unchanged. Strings are
    stripped of surrounding whitespace; known aliases are mapped to their
    canonical spelling regardless of letter case, and every other name is
    title-cased.

    The function is idempotent: feeding its own output back in returns the
    same value. This matters because the cell that applies it overwrites the
    column in place, so re-running that cell must not alter cleaned names
    (title-casing alone would turn "IR Iran" into "Ir Iran").

    Parameters
    ----------
    x : object
        Raw cell value from a team-name column.

    Returns
    -------
    object
        The cleaned name for string input, otherwise ``x`` itself.
    """
    if not isinstance(x, str):
        return x

    name = x.strip()

    # Resolve aliases before title-casing so canonical spellings that are not
    # title-case themselves ("IR Iran") survive untouched.
    canonical = _TEAM_NAME_ALIASES.get(name.lower())
    if canonical is not None:
        return canonical

    return name.title()

# Normalise each listed team-name column, skipping any that is absent from
# the frame or not stored with object dtype.
for col in ("team_name",):
    if col not in substitutions.columns:
        continue
    if substitutions[col].dtype != "object":
        continue
    substitutions[col] = substitutions[col].map(clean_team_name)

In [34]:
# ------------------------------------------------------
# 5. Handle double-row structure
# ------------------------------------------------------
# The raw data has 2 rows per substitution: one flagged "going_off" and one
# flagged "coming_on". Split the two kinds of row apart and merge them back
# together so that each substitution event becomes a single row.

# Split into two frames: rows flagged going_off and rows flagged coming_on
subs_out = substitutions[substitutions["going_off"] == 1].copy()
subs_in = substitutions[substitutions["coming_on"] == 1].copy()

# Sort by substitution_id to preserve order of events
subs_out = subs_out.sort_values("substitution_id")
subs_in = subs_in.sort_values("substitution_id")

# Create a rank to handle multiple substitutions occurring at the exact same minute/team
# (To ensure 1st 'Out' pairs with the 1st 'In', the 2nd with the 2nd, etc.)
#
# dropna=False keeps rows whose key contains NaN inside the grouping. The
# minute columns were coerced with errors="coerce" and may therefore hold NaN;
# with the default dropna=True such rows are excluded from every group and do
# not receive a usable rank, while the merge below still matches NaN keys to
# each other, so simultaneous substitutions would be cross-joined instead of
# paired one-to-one.
group_cols = ["match_id", "team_id", "minute_regulation", "minute_stoppage", "match_period"]
subs_out["sub_rank"] = subs_out.groupby(group_cols, dropna=False).cumcount()
subs_in["sub_rank"] = subs_in.groupby(group_cols, dropna=False).cumcount()

# Select columns to keep for the merge (avoiding duplicate columns)
# Shared columns (metadata) are taken from the "coming_on" side only
shared_cols = [
    "tournament_id", "match_id", "match_date", "stage_name",
    "team_id", "team_name", "team_code", "home_team", "away_team",
    "minute_label", "minute_regulation", "minute_stoppage",
    "minute_total", "match_period"
]

# Columns specific to the player; they exist on both sides and therefore
# receive the "_in" / "_out" suffixes in the merged frame
player_cols = ["player_id", "family_name", "given_name", "shirt_number"]

# Merge: the inner join keeps only events that have both an 'In' and an 'Out' row
substitutions_clean = pd.merge(
    subs_in[shared_cols + player_cols + ["sub_rank"]],
    subs_out[player_cols + group_cols + ["sub_rank"]],
    on=group_cols + ["sub_rank"],
    suffixes=("_in", "_out"),
    how="inner"
)

# Safety net: drop any repeated column labels. With the join keys and
# suffixes used above none are expected, so this normally changes nothing.
substitutions_clean = substitutions_clean.loc[:, ~substitutions_clean.columns.duplicated()]

In [35]:
# Remove the pairing helper column (ignored when it is already absent),
# then write the one-row-per-substitution table to disk without the index.
substitutions_clean = substitutions_clean.drop(columns="sub_rank", errors="ignore")

substitutions_clean.to_csv(
    "../../data-processed/substitutions_clean.csv",
    index=False,
)
print("Saved → ../../data-processed/substitutions_clean.csv")

Saved → ../../data-processed/substitutions_clean.csv
