In [None]:
# 03_fifa_wc_prediction_data_prep.ipynb
import pandas as pd
from pathlib import Path
import re

# Paths: raw tournament files are read from one folder, outputs go to another
raw_data_path = Path("data/raw")
processed_path = Path("data/processed")
# Create the output folder (and missing parents); a pre-existing folder is fine
processed_path.mkdir(parents=True, exist_ok=True)

# Step 1: Parse and merge all .txt files

# A team name: Unicode letters plus whitespace, dots, apostrophes and hyphens
# (so "Côte d'Ivoire" and "Bosnia-Herzegovina" are both accepted).
_TEAM_PATTERN = r"((?:[^\W\d_]|[\s.'\-])+?)"
_SCORE_PATTERN = r"\s+(\d+)-(\d+)"
# Preferred form: the hyphen separating the two teams is surrounded by
# whitespace, which keeps hyphens inside a team name from being treated as the
# separator.
_MATCH_STRICT_RE = re.compile(_TEAM_PATTERN + r"\s+-\s+" + _TEAM_PATTERN + _SCORE_PATTERN)
# Fallback: separator with optional whitespace, for lines such as "A-B 1-0".
_MATCH_LOOSE_RE = re.compile(_TEAM_PATTERN + r"\s*-\s*" + _TEAM_PATTERN + _SCORE_PATTERN)
_MATCH_COLUMNS = ["world_cup_year", "home_team", "away_team", "home_goals", "away_goals"]


def parse_wc_file(file_path):
    """Parse one tournament text file into a DataFrame of matches.

    The tournament year is taken from the file name, which must start with the
    year followed by an underscore (e.g. ``1930_fifa_world_cup.txt``).
    Blank lines and lines starting with ``#`` or ``[`` are skipped. Every other
    line is matched from its start against ``<home> - <away> <hg>-<ag>``; lines
    that do not match are ignored.

    Args:
        file_path: Path (or path string) of the file to parse.

    Returns:
        A DataFrame with the columns ``world_cup_year``, ``home_team``,
        ``away_team``, ``home_goals`` and ``away_goals``. The columns are
        present even when no line matched.

    Raises:
        ValueError: If the file name does not start with an integer year.
    """
    file_path = Path(file_path)
    year = int(file_path.name.split("_")[0])
    matches = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(("#", "[")):
                continue
            # Try the whitespace-delimited separator first so that
            # "Bosnia-Herzegovina - Iran 3-1" is not split at the first hyphen.
            m = _MATCH_STRICT_RE.match(line) or _MATCH_LOOSE_RE.match(line)
            if m is None:
                continue
            home_team, away_team, home_goals, away_goals = m.groups()
            matches.append({
                "world_cup_year": year,
                "home_team": home_team.strip(),
                "away_team": away_team.strip(),
                "home_goals": int(home_goals),
                "away_goals": int(away_goals),
            })
    # Passing columns explicitly keeps the schema stable for empty files.
    return pd.DataFrame(matches, columns=_MATCH_COLUMNS)

# Parse every tournament file. Sorting the paths processes the files in
# filename order, which is chronological for names that start with a
# four-digit year.
wc_files = sorted(raw_data_path.glob("*_fifa_world_cup.txt"))
if not wc_files:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" ValueError; name the real problem instead.
    raise FileNotFoundError(
        f"No '*_fifa_world_cup.txt' files found in {raw_data_path.resolve()}"
    )

all_dfs = []
for file in wc_files:
    df_year = parse_wc_file(file)
    all_dfs.append(df_year)
    print(f"Parsed {file.name}: {df_year.shape[0]} matches")

# One row per match across all tournaments, with a fresh 0..n-1 index.
master_df = pd.concat(all_dfs, ignore_index=True)
print("Total matches:", master_df.shape[0])

# Step 2: Clean team names
# Alternate / historical names -> the single name used in the dataset.
team_map = dict(
    [
        ("West Germany", "Germany"),
        ("USSR", "Russia"),
        ("Czechoslovakia", "Czech Republic"),
        ("Yugoslavia", "Serbia"),
        ("Brasil", "Brazil"),
    ]
)
# Apply the same exact-value substitution to both team columns.
for side in ("home_team", "away_team"):
    master_df[side] = master_df[side].replace(team_map)

# Step 3: Match outcome
def outcome(row):
    """Classify a match result from the home side's point of view.

    Args:
        row: Mapping-like object with ``home_goals`` and ``away_goals`` entries.

    Returns:
        ``"H"`` if the home side scored more, ``"A"`` if the away side scored
        more, otherwise ``"D"``.
    """
    scored, conceded = row["home_goals"], row["away_goals"]
    if scored > conceded:
        return "H"
    if scored < conceded:
        return "A"
    return "D"
# Label each match row-by-row, then store the signed margin (home minus away).
master_df["match_outcome"] = master_df.apply(outcome, axis="columns")
master_df["goal_diff"] = master_df["home_goals"].sub(master_df["away_goals"])

# Step 4: Host advantage
# Host nation(s) per tournament year. Values are tuples because a tournament
# can be co-hosted: a dict literal keeps only the last value for a repeated
# key, so listing 2002 twice would silently drop South Korea.
hosts = {
    1930: ("Uruguay",),
    1934: ("Italy",),
    1938: ("France",),
    1950: ("Brazil",),
    1954: ("Switzerland",),
    1958: ("Sweden",),
    1962: ("Chile",),
    1966: ("England",),
    1970: ("Mexico",),
    # Step 2 rewrites "West Germany" to "Germany" before this point, so the
    # normalised name must be listed; the historical name is kept as well in
    # case the data has not been normalised.
    1974: ("Germany", "West Germany"),
    1978: ("Argentina",),
    1982: ("Spain",),
    1986: ("Mexico",),
    1990: ("Italy",),
    1994: ("USA",),
    1998: ("France",),
    2002: ("South Korea", "Japan"),
    2006: ("Germany",),
    2010: ("South Africa",),
    2014: ("Brazil",),
    2018: ("Russia",),
    2022: ("Qatar",),
}
# 1 when the home side is a host of that year's tournament, else 0.
# Years missing from `hosts` yield 0 for every match.
master_df["home_host_advantage"] = [
    int(team in hosts.get(year, ()))
    for year, team in zip(master_df["world_cup_year"], master_df["home_team"])
]

# Step 5: Simple team strength
# Running record per team: {"wins": int, "matches": int}. A team's strength for
# a match is its win share over the rows processed BEFORE that match, so the
# feature never includes the result of the match it describes.
team_stats = {}
home_strength = []
away_strength = []


def _prior_win_rate(record):
    """Return wins / matches for a record, or 0.5 when no match was played yet."""
    played = record["matches"]
    if played > 0:
        return record["wins"] / played
    return 0.5


for home, away, result in zip(
    master_df["home_team"], master_df["away_team"], master_df["match_outcome"]
):
    home_rec = team_stats.get(home, {"wins": 0, "matches": 0})
    away_rec = team_stats.get(away, {"wins": 0, "matches": 0})

    # Record the strengths first: they must reflect earlier matches only.
    home_strength.append(_prior_win_rate(home_rec))
    away_strength.append(_prior_win_rate(away_rec))

    # A draw counts as a played match for both sides but as a win for neither.
    if result == "H":
        home_rec["wins"] += 1
    elif result == "A":
        away_rec["wins"] += 1

    home_rec["matches"] = home_rec.get("matches", 0) + 1
    away_rec["matches"] = away_rec.get("matches", 0) + 1

    team_stats[home] = home_rec
    team_stats[away] = away_rec

master_df["home_strength"] = home_strength
master_df["away_strength"] = away_strength

# Step 6: Save final ML-ready dataset
# Written without the index column so the CSV holds only the feature columns.
output_file = processed_path.joinpath("fifa_wc_features.csv")
master_df.to_csv(path_or_buf=output_file, index=False)
print("Final ML-ready dataset saved:", output_file)
# Last expression of the cell: shows the first rows in the notebook.
master_df.head()


Parsed 1930_fifa_world_cup.txt: 18 matches
Parsed 1934_fifa_world_cup.txt: 17 matches
Parsed 1938_fifa_world_cup.txt: 18 matches
Parsed 1950_fifa_world_cup.txt: 22 matches
Parsed 1954_fifa_world_cup.txt: 26 matches
Parsed 1958_fifa_world_cup.txt: 35 matches
Parsed 1962_fifa_world_cup.txt: 32 matches
Parsed 1966_fifa_world_cup.txt: 32 matches
Parsed 1970_fifa_world_cup.txt: 32 matches
Parsed 1974_fifa_world_cup.txt: 38 matches
Parsed 1978_fifa_world_cup.txt: 38 matches
Parsed 1982_fifa_world_cup.txt: 52 matches
Parsed 1986_fifa_world_cup.txt: 52 matches
Parsed 1990_fifa_world_cup.txt: 52 matches
Parsed 1994_fifa_world_cup.txt: 52 matches
Parsed 1998_fifa_world_cup.txt: 64 matches
Parsed 2002_fifa_world_cup.txt: 64 matches
Parsed 2006_fifa_world_cup.txt: 64 matches
Parsed 2010_fifa_world_cup.txt: 64 matches
Parsed 2014_fifa_world_cup.txt: 64 matches
Parsed 2018_fifa_world_cup.txt: 64 matches
Parsed 2022_fifa_world_cup.txt: 64 matches
Total matches: 964


NameError: name 'master_df_' is not defined