In [12]:
import pandas as pd
# Read the raw goals table from the CSV export. The trailing .copy() hands
# later cells a separate DataFrame object from the one read_csv returned.
goals = pd.read_csv("../../data-csv/goals.csv").copy()

In [13]:
# ------------------------------------------------------
# 2. Filter tournaments
# ------------------------------------------------------
# Tournament ids whose rows are kept; every other tournament is discarded.
valid_tournaments = [
    "WC-2002",
    "WC-2006",
    "WC-2010",
    "WC-2014",
    "WC-2018",
    "WC-2022",
]

# One chain: keep the matching rows, renumber the index from 0, then remove
# the two columns that are not carried forward. errors="ignore" makes the
# drop a no-op for any of those columns that is not present.
goals = (
    goals.loc[goals["tournament_id"].isin(valid_tournaments)]
    .reset_index(drop=True)
    .drop(columns=["tournament_name", "group_name"], errors="ignore")
)

In [14]:
# ------------------------------------------------------
# 3. Basic column cleanup
# ------------------------------------------------------
# Parse match_date as datetimes; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
goals["match_date"] = pd.to_datetime(goals["match_date"], errors="coerce")

# Cast the flag columns to pandas' nullable "boolean" dtype, skipping any
# flag column that is absent from the frame.
for col in ("own_goal", "penalty"):
    if col not in goals.columns:
        continue
    goals[col] = goals[col].astype("boolean")

In [15]:
# ------------------------------------------------------
# 4. Minute handling
# ------------------------------------------------------
# Coerce both minute columns to numeric; entries that cannot be parsed
# become NaN (errors="coerce") instead of raising.
goals["minute_regulation"] = pd.to_numeric(goals["minute_regulation"], errors="coerce")
goals["minute_stoppage"] = pd.to_numeric(goals["minute_stoppage"], errors="coerce")

# Total minute = regulation minute + stoppage minutes.
# A missing stoppage value is treated as 0 added minutes. A missing (or
# unparseable) regulation minute is deliberately NOT filled with 0: doing so
# would fabricate a goal at minute 0 (or at just the stoppage value), so the
# NaN is allowed to propagate and minute_total stays missing for that row.
goals["minute_total"] = (
    goals["minute_regulation"]
    + goals["minute_stoppage"].fillna(0)
)

In [16]:
# ------------------------------------------------------
# 5. Team name harmonisation
# ------------------------------------------------------
def clean_team_name(x):
    """Return a harmonised team name.

    Non-string values (e.g. NaN / None) are returned unchanged. Strings are
    stripped of surrounding whitespace and title-cased, then a small alias
    table maps known variants onto a single canonical spelling.

    The function is idempotent: feeding it an already-cleaned name returns
    the same name. This matters because title-casing the canonical
    "IR Iran" yields "Ir Iran"; without the extra alias, re-running the
    cleaning cell on already-cleaned data would corrupt that value.

    Parameters
    ----------
    x : object
        Raw team name, or any non-string placeholder for a missing value.

    Returns
    -------
    object
        The canonical team name for strings; ``x`` itself otherwise.
    """
    if not isinstance(x, str):
        return x

    # NOTE: str.title() also capitalises connecting words (e.g. "and" ->
    # "And"); that pre-existing behaviour is kept so output stays unchanged.
    titled = x.strip().title()

    # Keys are in title-cased form because the lookup happens after .title().
    aliases = {
        "Iran": "IR Iran",
        "Ir Iran": "IR Iran",  # canonical form after title-casing
        "South Korea": "Korea Republic",
    }
    return aliases.get(titled, titled)
    
# Harmonise every team-name column that exists in the frame; columns that
# are not present are skipped.
for col in ("team_name", "home_team", "away_team", "player_team_name"):
    if col not in goals.columns:
        continue
    goals[col] = goals[col].map(clean_team_name)

In [17]:
# Quick sanity check: row count and column names of the cleaned frame.
print(f"Rows after filtering: {len(goals)}")
print(f"Columns: {goals.columns.tolist()}")

Rows after filtering: 965
Columns: ['key_id', 'goal_id', 'tournament_id', 'match_id', 'match_name', 'match_date', 'stage_name', 'team_id', 'team_name', 'team_code', 'home_team', 'away_team', 'player_id', 'family_name', 'given_name', 'shirt_number', 'player_team_id', 'player_team_name', 'player_team_code', 'minute_label', 'minute_regulation', 'minute_stoppage', 'match_period', 'own_goal', 'penalty', 'minute_total']


In [18]:
# Write the cleaned goals table to disk; index=False leaves the row index
# out of the file. Then confirm the destination on stdout.
goals.to_csv(
    path_or_buf="../../data-processed/goals_clean.csv",
    index=False,
)
print("Saved → ../../data-processed/goals_clean.csv")

Saved → ../../data-processed/goals_clean.csv
