In [23]:
from pathlib import Path

import pandas as pd

# ------------------------------------------------------
# 1. Load datasets
# ------------------------------------------------------
# Read the three raw CSV inputs in one pass: the historical box-score file,
# the 2022 match statistics, and the 2022 possession file.
box, matches_2022, possession_2022 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data-csv/FIFAallMatchBoxData.csv",
        "../../data-csv/Fifa_world_cup_matches.csv",
        "../../data-csv/data.csv",
    )
)

In [24]:
# ------------------------------------------------------
# 2. Clean 2002–2018 box data
# ------------------------------------------------------
# Tournament ids that are kept from the historical box-score file.
valid_years = ["WC-2002", "WC-2006", "WC-2010", "WC-2014", "WC-2018"]

box = (
    box
    # The raw file misspells both possession headers.
    .rename(columns={"hPossesion": "hPossession", "aPossesion": "aPossession"})
    # Build the tournament key by prefixing the string form of the year with "WC-".
    .assign(tournament_id=lambda frame: "WC-" + frame["year"].astype(str))
    # Remove columns that are not carried into the final dataset; absent ones are skipped.
    .drop(columns=["hsaves", "asaves", "year", "hgoals", "agoals"], errors="ignore")
    # Restrict to the listed tournaments and renumber the rows from zero.
    .loc[lambda frame: frame["tournament_id"].isin(valid_years)]
    .reset_index(drop=True)
)

In [25]:
# ------------------------------------------------------
# 3. Prepare 2022 datasets
# ------------------------------------------------------

# --- Harmonise team names ---
def clean_team_name(x):
    """Return a harmonised team name suitable for use as a merge key.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Otherwise the name is whitespace-normalised, title-cased,
    and mapped through a small alias table so that both 2022 sources agree
    on the spelling of Iran ("IR Iran") and South Korea ("Korea Republic").

    Parameters
    ----------
    x : str or missing value
        Raw team name as read from a CSV column.

    Returns
    -------
    str or missing value
        The cleaned name, or ``x`` itself when it is missing.
    """
    if pd.isna(x):
        return x

    # Trim the ends and collapse internal runs of whitespace first: a stray
    # space would otherwise survive title-casing, miss the alias lookup and
    # silently fail to match in the later merge.
    x = " ".join(x.split()).title()

    # Keys are in title-cased form because the lookup happens after .title().
    aliases = {
        "Iran": "IR Iran",
        "Ir Iran": "IR Iran",
        "South Korea": "Korea Republic",
    }
    return aliases.get(x, x)

# Run every team-name column of both 2022 frames through clean_team_name so
# the two sources use the same spellings before they are joined.
for frame, name_columns in (
    (matches_2022, ("team1", "team2")),
    (possession_2022, ("home_team", "away_team")),
):
    for column in name_columns:
        frame[column] = frame[column].apply(clean_team_name)

In [26]:
# ------------------------------------------------------
# 4. Merge the two 2022 datasets
# ------------------------------------------------------
# Only the join keys and the two possession figures are taken from the
# possession file.
possession_lookup = possession_2022[
    ["home_team", "away_team", "home_possession", "away_possession"]
]

# Left join: every row of matches_2022 is kept, whether or not a possession
# record exists for its (team1, team2) pair.
merged2022 = pd.merge(
    matches_2022,
    possession_lookup,
    how="left",
    left_on=["team1", "team2"],
    right_on=["home_team", "away_team"],
)

# 2022 column name -> column name used by the final dataset.
column_map_2022 = {
    "team1": "hname",
    "team2": "aname",
    "on target attempts team1": "hshotsOnTarget",
    "on target attempts team2": "ashotsOnTarget",
    "total attempts team1": "hshots",
    "total attempts team2": "ashots",
    "yellow cards team1": "hyellowCards",
    "yellow cards team2": "ayellowCards",
    "red cards team1": "hredCards",
    "red cards team2": "aredCards",
    "fouls against team1": "hfouls",
    "fouls against team2": "afouls",
    "home_possession": "hPossession",
    "away_possession": "aPossession",
}

# Columns (and their order) retained for the final structure.
final_columns_2022 = [
    "hname", "aname", "hshotsOnTarget", "ashotsOnTarget", "hshots", "ashots",
    "hyellowCards", "ayellowCards", "hredCards", "aredCards", "hfouls", "afouls",
    "hPossession", "aPossession", "tournament_id",
]

merged2022 = (
    merged2022
    .rename(columns=column_map_2022)
    .assign(tournament_id="WC-2022")
    .loc[:, final_columns_2022]
)

In [27]:
# ------------------------------------------------------
# 5. Append 2022 rows to box dataset
# ------------------------------------------------------
# Stack the 2022 rows underneath the historical rows and renumber the index
# from zero across the combined frame.
final = pd.concat(objs=[box, merged2022], axis=0, ignore_index=True)

In [28]:
# ------------------------------------------------------
# 6. Check that the 2022 rows carry team names + possession
# ------------------------------------------------------
mask2022 = final["tournament_id"] == "WC-2022"

# This cell previously assigned hname / aname / hPossession / aPossession
# onto themselves for the 2022 rows, which changed nothing. Those rows
# already hold the harmonised names and merged possession values, so the
# useful step here is to surface rows that the left join left without
# possession data (for example when team names did not match).
possession_missing_2022 = (
    final.loc[mask2022, ["hPossession", "aPossession"]].isna().any(axis=1)
)
if possession_missing_2022.any():
    print(
        f"Warning: {int(possession_missing_2022.sum())} of {int(mask2022.sum())} "
        "WC-2022 rows have no possession data; check team-name matching."
    )

In [29]:
# ------------------------------------------------------
# 7. Save file
# ------------------------------------------------------
# Write the combined dataset to disk without the index column.
output_csv = "../../data-processed/MatchBoxData_clean.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout without data-processed/ fails here.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

final.to_csv(output_csv, index=False)
print(f"Saved → {output_csv}")

Saved → ../../data-processed/MatchBoxData_clean.csv
