In [3]:
import pandas as pd
from pathlib import Path
import re

# Input and output directories, relative to the current working directory
raw_data_path = Path("data/raw")
processed_path = Path("data/processed")
# Make sure the output directory (and any missing parents) exists; a
# pre-existing directory is not an error.
processed_path.mkdir(parents=True, exist_ok=True)

# Building blocks for the fixture-line patterns below.
# A team name is a lazy run of Unicode letters, whitespace, periods,
# apostrophes and hyphens, so accented and apostrophised names are accepted
# (the original ASCII-only class silently dropped such fixtures).
_TEAM_NAME = r"(?:[^\W\d_]|[\s.'\-])+?"
_SCORE = r"\s+(\d+)-(\d+)"

# Preferred form: "Home - Away  X-Y" with whitespace on both sides of the
# separator hyphen. Requiring the whitespace keeps hyphenated team names
# (e.g. "Bosnia-Herzegovina") in one piece instead of splitting at their
# internal hyphen.
_SPACED_MATCH_RE = re.compile(rf"({_TEAM_NAME})\s+-\s+({_TEAM_NAME}){_SCORE}")
# Fallback form: "Home-Away X-Y" with optional whitespace around the
# separator, kept so every line the earlier, laxer pattern accepted still parses.
_COMPACT_MATCH_RE = re.compile(rf"({_TEAM_NAME})\s*-\s*({_TEAM_NAME}){_SCORE}")

# Fixed column order so that a file without any fixtures still yields a
# DataFrame with the same schema as every other file.
_MATCH_COLUMNS = [
    "world_cup_year",
    "home_team",
    "away_team",
    "home_goals",
    "away_goals",
]


# Function to parse one file
def parse_wc_file(file_path):
    """Parse one World Cup results file into a DataFrame of matches.

    The tournament year is taken from the part of the file name before the
    first underscore (e.g. ``1930_fifa_world_cup.txt`` -> ``1930``). Blank
    lines and lines starting with ``#`` or ``[`` are skipped. Every remaining
    line that starts with ``HomeTeam - AwayTeam  X-Y`` becomes one row; any
    other line (such as goal-scorer details) is ignored.

    Args:
        file_path: Path to the text file, as a ``pathlib.Path`` or a string.

    Returns:
        A ``pandas.DataFrame`` with the columns ``world_cup_year``,
        ``home_team``, ``away_team``, ``home_goals`` and ``away_goals``.
        It has zero rows (but the same columns) when no line matches.

    Raises:
        ValueError: If the file name does not begin with an integer year.
        OSError: If the file cannot be opened.
    """
    file_path = Path(file_path)  # accept plain string paths as well
    year = int(file_path.name.split("_")[0])
    matches = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(("#", "[")):
                continue  # skip metadata and empty lines

            # Try the unambiguous spaced separator first, then the lax form.
            m = _SPACED_MATCH_RE.match(line) or _COMPACT_MATCH_RE.match(line)
            if m is None:
                continue  # not a fixture line (e.g. goal scorers)

            home_team, away_team, home_goals, away_goals = m.groups()
            matches.append({
                "world_cup_year": year,
                "home_team": home_team.strip(),
                "away_team": away_team.strip(),
                "home_goals": int(home_goals),
                "away_goals": int(away_goals),
            })

    return pd.DataFrame(matches, columns=_MATCH_COLUMNS)

# Parse every per-tournament file in raw_data_path. sorted() orders the
# paths lexicographically, which for these year-prefixed names is
# chronological order.
all_dfs = []

for file in sorted(raw_data_path.glob("*_fifa_world_cup.txt")):
    df = parse_wc_file(file)
    all_dfs.append(df)
    print(f"Parsed {file.name}: {df.shape[0]} matches")

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list, so fail early with a message that names the pattern and the
# directory that was searched.
if not all_dfs:
    raise FileNotFoundError(
        f"No '*_fifa_world_cup.txt' files found in {raw_data_path.resolve()}"
    )

# Combine all years into one frame with a fresh 0..n-1 index
master_df = pd.concat(all_dfs, ignore_index=True)
print("Total matches in master dataset:", master_df.shape[0])

# Save master CSV (without the index column)
output_file = processed_path / "fifa_world_cup_matches_1930_2022.csv"
master_df.to_csv(output_file, index=False)
print(f"Master CSV saved at: {output_file}")


Parsed 1930_fifa_world_cup.txt: 18 matches
Parsed 1934_fifa_world_cup.txt: 17 matches
Parsed 1938_fifa_world_cup.txt: 18 matches
Parsed 1950_fifa_world_cup.txt: 22 matches
Parsed 1954_fifa_world_cup.txt: 26 matches
Parsed 1958_fifa_world_cup.txt: 35 matches
Parsed 1962_fifa_world_cup.txt: 32 matches
Parsed 1966_fifa_world_cup.txt: 32 matches
Parsed 1970_fifa_world_cup.txt: 32 matches
Parsed 1974_fifa_world_cup.txt: 38 matches
Parsed 1978_fifa_world_cup.txt: 38 matches
Parsed 1982_fifa_world_cup.txt: 52 matches
Parsed 1986_fifa_world_cup.txt: 52 matches
Parsed 1990_fifa_world_cup.txt: 52 matches
Parsed 1994_fifa_world_cup.txt: 52 matches
Parsed 1998_fifa_world_cup.txt: 64 matches
Parsed 2002_fifa_world_cup.txt: 64 matches
Parsed 2006_fifa_world_cup.txt: 64 matches
Parsed 2010_fifa_world_cup.txt: 64 matches
Parsed 2014_fifa_world_cup.txt: 64 matches
Parsed 2018_fifa_world_cup.txt: 64 matches
Parsed 2022_fifa_world_cup.txt: 64 matches
Total matches in master dataset: 964
Master CSV saved 