In [3]:
from pathlib import Path

import pandas as pd
# Load the raw player table. read_csv already returns a fresh DataFrame that
# nothing else references, so no defensive .copy() is needed afterwards.
players = pd.read_csv("../../data-csv/players.csv")

In [4]:
# ------------------------------------------------------
# 2. Filter tournaments
# ------------------------------------------------------
# Keep only male players (female == 0).
# The "female" column is dropped at the end of this cell, so guard on its
# presence: re-running the cell on the already-filtered frame then skips this
# step instead of raising KeyError: 'female'.
if "female" in players.columns:
    players = players[players["female"] == 0].reset_index(drop=True)

target_years = ["2002", "2006", "2010", "2014", "2018", "2022"]


def has_valid_tournament(x, valid_years=None):
    """Return True if a comma-separated year list contains a wanted year.

    Parameters
    ----------
    x : object
        A cell value holding years separated by commas, with optional
        whitespace around each entry (e.g. ``"2002, 2006"``). Any non-string
        value, such as NaN for a missing entry, yields False.
    valid_years : iterable of str, optional
        Years to accept. When omitted, the module-level ``target_years`` is
        used; it is looked up at call time, so later changes to that list
        are honoured.

    Returns
    -------
    bool
        True when at least one entry of ``x`` is one of the accepted years.
    """
    if not isinstance(x, str):
        return False
    wanted = set(target_years if valid_years is None else valid_years)
    # Compare whole, whitespace-stripped tokens so that only exact year
    # strings match (no substring hits).
    return any(token.strip() in wanted for token in x.split(","))

# Row mask: True where list_tournaments contains at least one target year.
in_target_years = players["list_tournaments"].apply(has_valid_tournament)

# Keep the matching rows, renumber them from 0, and discard the gender flag
# (errors="ignore" makes the drop a no-op when the column is absent).
players = (
    players[in_target_years]
    .reset_index(drop=True)
    .drop(columns="female", errors="ignore")
)

In [5]:
# ------------------------------------------------------
# 3. Basic column cleanup
# ------------------------------------------------------
# Parse birth_date as datetime; values that cannot be parsed become NaT
players["birth_date"] = pd.to_datetime(players["birth_date"], errors="coerce")

# Cast whichever position flags are present to the nullable "boolean" dtype
position_cols = ["goal_keeper", "defender", "midfielder", "forward"]
players = players.astype(
    {flag: "boolean" for flag in position_cols if flag in players.columns}
)

In [6]:
# Write the cleaned table to disk. The target folder is created first so the
# export also works on a checkout where data-processed/ does not exist yet
# (to_csv does not create missing parent directories).
output_path = "../../data-processed/players_clean.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
players.to_csv(output_path, index=False)
print(f"Saved → {output_path}")

Saved → ../../data-processed/players_clean.csv
