In [25]:
import pandas as pd
# Load the raw bookings table. read_csv already returns a brand-new DataFrame
# that nothing else references, so no defensive .copy() is needed before the
# later cells start modifying it.
bookings = pd.read_csv("../../data-csv/bookings.csv")

In [26]:
# ------------------------------------------------------
# 2. Filter tournaments
# ------------------------------------------------------
# Tournament ids to keep; rows belonging to any other tournament are dropped.
valid_tournaments = [
    "WC-2002",
    "WC-2006",
    "WC-2010",
    "WC-2014",
    "WC-2018",
    "WC-2022",
]

# Select the matching rows, then renumber the index from 0 so it has no gaps
# left behind by the removed rows.
bookings = (
    bookings
    .loc[lambda frame: frame["tournament_id"].isin(valid_tournaments)]
    .reset_index(drop=True)
)

In [27]:
# ------------------------------------------------------
# 3. Parse match_date
# ------------------------------------------------------
# Convert match_date to datetimes. errors="coerce" turns values that cannot
# be parsed into NaT instead of raising.
bookings = bookings.assign(
    match_date=lambda frame: pd.to_datetime(frame["match_date"], errors="coerce")
)

In [28]:
# ------------------------------------------------------
# 4. Clean team names
# ------------------------------------------------------
def clean_team_name(x):
    """Normalise a team name; non-string values are returned unchanged.

    Strings are stripped of surrounding whitespace and title-cased, after
    which two names are replaced by fixed alternatives:
    "Iran" -> "IR Iran" and "South Korea" -> "Korea Republic".

    Note that title-casing capitalises every word, so e.g.
    "republic of ireland" becomes "Republic Of Ireland".
    """
    if isinstance(x, str):
        normalized = x.strip().title()
        # Lookup is done on the already normalised form, so input casing and
        # surrounding whitespace do not matter.
        replacements = {
            "Iran": "IR Iran",
            "South Korea": "Korea Republic",
        }
        return replacements.get(normalized, normalized)

    # Anything that is not a string (numbers, missing values, ...) passes through.
    return x
    
# Run the name cleaner over each team-related column that is present.
# Non-string cells are returned untouched by clean_team_name, so columns that
# hold numeric flags rather than names are left as they are.
for col in ("team_name", "home_team", "away_team"):
    if col not in bookings.columns:
        continue
    bookings[col] = bookings[col].apply(clean_team_name)

In [29]:
# ------------------------------------------------------
# 5. Create single card-minute variable
# ------------------------------------------------------
# Missing stoppage minutes are treated as 0, then added to the regulation
# minute. assign() evaluates its keyword arguments in order, so card_minute
# sees the already-filled minute_stoppage column.
bookings = bookings.assign(
    minute_stoppage=lambda frame: frame["minute_stoppage"].fillna(0),
    card_minute=lambda frame: frame["minute_regulation"] + frame["minute_stoppage"],
)

In [30]:
# ------------------------------------------------------
# 6. Handle second yellow → red logic
# ------------------------------------------------------
# A second yellow card counts both as a yellow card and as a sending off,
# which is why second_yellow_card feeds both flags below.
bookings = bookings.assign(
    # True when either yellow-card indicator equals 1.
    is_yellow=lambda frame: (
        frame[["yellow_card", "second_yellow_card"]]
        .eq(1)
        .any(axis=1)
        .astype(bool)
    ),
    # True when any of the three dismissal indicators equals 1.
    is_red=lambda frame: (
        frame[["red_card", "second_yellow_card", "sending_off"]]
        .eq(1)
        .any(axis=1)
        .astype(bool)
    ),
)

In [31]:
# ------------------------------------------------------
# 7. Keep only relevant columns
# ------------------------------------------------------
# Every other column is dropped; the list order becomes the column order of
# the resulting frame.
bookings = bookings.loc[
    :,
    [
        # tournament / match fields
        "tournament_id", "match_id", "match_date", "stage_name", "match_period",
        # team fields
        "team_id", "team_name", "home_team", "away_team",
        # derived card fields
        "card_minute", "is_yellow", "is_red",
    ],
]

In [32]:
# Sanity check: preview the first rows, then summarise columns and dtypes.
print(bookings.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
bookings.info()

  tournament_id   match_id match_date   stage_name               match_period  \
0       WC-2002  M-2002-01 2002-05-31  group stage  first half, stoppage time   
1       WC-2002  M-2002-01 2002-05-31  group stage                second half   
2       WC-2002  M-2002-02 2002-06-01  group stage                 first half   
3       WC-2002  M-2002-02 2002-06-01  group stage                second half   
4       WC-2002  M-2002-02 2002-06-01  group stage                second half   

  team_id            team_name  home_team  away_team  card_minute  is_yellow  \
0    T-30               France          1          0           47       True   
1    T-65              Senegal          0          1           51       True   
2    T-60  Republic Of Ireland          1          0           30       True   
3    T-60  Republic Of Ireland          1          0           51       True   
4    T-60  Republic Of Ireland          1          0           82       True   

   is_red  
0   False  
1   Fals

In [33]:
# Write the cleaned bookings table without the index column. The confirmation
# message is built from the same path that is written, so the two cannot
# drift apart (the message previously named goals_clean.csv).
output_path = "../../data-processed/bookings_clean.csv"
bookings.to_csv(output_path, index=False)
print(f"Saved → {output_path}")

Saved → ../../data-processed/goals_clean.csv
