In [2]:
import pandas as pd

# ------------------------------------------------------
# 1. Load datasets
# ------------------------------------------------------
# Input locations, relative to the notebook's working directory.
matches_path = "../../data-processed/matches_clean.csv"
rankings_path = "../../data-csv/fifa_ranking-2023-07-20.csv"

# Cleaned matches: used below as the source of valid team codes.
matches = pd.read_csv(matches_path)

# Raw FIFA ranking table that the following cells clean and filter.
rankings = pd.read_csv(rankings_path)

In [3]:
# ------------------------------------------------------
# 2. Extract valid country codes
# ------------------------------------------------------
# Distinct team codes seen on each side of a fixture in the matches file.
home_codes, away_codes = (
    matches[side].unique() for side in ("home_team_code", "away_team_code")
)

# A code is valid if it appears as either a home or an away team.
valid_codes = set(home_codes).union(away_codes)

In [6]:
# ------------------------------------------------------
# 3. Clean Ranking Data
# ------------------------------------------------------
# Columns not needed downstream; errors="ignore" skips any that are absent.
cols_to_drop = ["total_points", "previous_points", "rank_change", "confederation"]

# Only rankings published in these calendar years are kept.
target_years = [2002, 2006, 2010, 2014, 2018, 2022]

rankings = (
    rankings
    .drop(columns=cols_to_drop, errors="ignore")
    # Parse rank_date to datetime; format="mixed" infers the format per value,
    # and dayfirst=True prefers day-first when a value is ambiguous.
    .assign(
        rank_date=lambda frame: pd.to_datetime(
            frame["rank_date"], format="mixed", dayfirst=True
        )
    )
    # Keep rows dated in a target year whose country code occurs in the matches data.
    .loc[
        lambda frame: frame["rank_date"].dt.year.isin(target_years)
        & frame["country_abrv"].isin(valid_codes)
    ]
)

In [7]:
# ------------------------------------------------------
# 4. Filter by Date (latest ranking on or before each tournament start)
# ------------------------------------------------------
# Tournament start dates, written day-first (DD/MM/YYYY).
# merge_asof needs its keys sorted, so the frame is sorted on construction.
tournament_starts = pd.DataFrame({
    "tournament_date": pd.to_datetime(
        [
            "31/05/2002",
            "09/06/2006",
            "11/06/2010",
            "12/06/2014",
            "14/06/2018",
            "20/11/2022",
        ],
        dayfirst=True,
    )
}).sort_values("tournament_date")

# Distinct publication dates left in the filtered rankings, sorted for merge_asof.
unique_rank_dates = pd.DataFrame(
    {"rank_date": rankings["rank_date"].unique()}
).sort_values("rank_date")

# direction="backward" pairs each tournament with the latest rank_date that is
# less than or equal to its start date. A tournament with no such date gets NaT.
date_map = pd.merge_asof(
    left=tournament_starts,
    right=unique_rank_dates,
    left_on="tournament_date",
    right_on="rank_date",
    direction="backward",
)

# The publication dates selected above, one per matched tournament.
valid_dates = date_map["rank_date"].unique()

# Keep only the ranking rows published on one of the selected dates.
on_selected_date = rankings["rank_date"].isin(valid_dates)
final_rankings = rankings[on_selected_date]

In [8]:
# ------------------------------------------------------
# 5. Save the clean file
# ------------------------------------------------------
# Write the filtered rankings without the DataFrame index column.
output_path = "../../data-processed/ranking_clean.csv"
final_rankings.to_csv(path_or_buf=output_path, index=False)

# Report where the file went and which ranking dates were selected.
for message in (
    f"Cleaning complete. File saved to {output_path}",
    f"Selected Ranking Dates: {valid_dates}",
):
    print(message)

Cleaning complete. File saved to ../../data-processed/ranking_clean.csv
Selected Ranking Dates: <DatetimeArray>
['2002-05-15 00:00:00', '2006-05-17 00:00:00', '2010-05-26 00:00:00',
 '2014-06-05 00:00:00', '2018-06-07 00:00:00', '2022-10-06 00:00:00']
Length: 6, dtype: datetime64[ns]
