In [2]:
import pandas as pd

# Load raw data
matches = pd.read_csv("../../data-csv/matches.csv")

# Preview
display(matches.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1248 entries, 0 to 1247
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     1248 non-null   int64 
 1   tournament_id              1248 non-null   object
 2   tournament_name            1248 non-null   object
 3   match_id                   1248 non-null   object
 4   match_name                 1248 non-null   object
 5   stage_name                 1248 non-null   object
 6   group_name                 1248 non-null   object
 7   group_stage                1248 non-null   int64 
 8   knockout_stage             1248 non-null   int64 
 9   replayed                   1248 non-null   int64 
 10  replay                     1248 non-null   int64 
 11  match_date                 1248 non-null   object
 12  match_time                 1248 non-null   object
 13  stadium_id                 1248 non-null   object
 14  stadium_

None

In [3]:
# Ensure both columns are strings
matches['match_date'] = matches['match_date'].astype(str)
matches['match_time'] = matches['match_time'].astype(str)

# Combine date and time into one string
datetime_str = matches['match_date'] + ' ' + matches['match_time']

# Convert to datetime
matches['match_datetime'] = pd.to_datetime(datetime_str, format='%Y-%m-%d %H:%M', errors='coerce')

# Preview
matches[['match_date', 'match_time', 'match_datetime']].head()

Unnamed: 0,match_date,match_time,match_datetime
0,1930-07-13,15:00,1930-07-13 15:00:00
1,1930-07-13,15:00,1930-07-13 15:00:00
2,1930-07-14,12:45,1930-07-14 12:45:00
3,1930-07-14,14:50,1930-07-14 14:50:00
4,1930-07-15,16:00,1930-07-15 16:00:00


In [4]:
# Sort by date
matches = matches.sort_values("match_date").reset_index(drop=True)

# Ensure numeric columns are numeric
numeric_cols = [
    "home_team_score",
    "away_team_score"
]
matches[numeric_cols] = matches[numeric_cols].apply(pd.to_numeric, errors="coerce")

In [5]:
# Count missing values
matches.isna().sum()

# Remove rows missing match_time or scores, if any
matches = matches.dropna(subset=["match_time", "home_team_score", "away_team_score"])

In [6]:
# Define the tournaments to keep
tournaments_to_keep = ['WC-2002', 'WC-2006', 'WC-2010', 'WC-2014', 'WC-2018', 'WC-2022']

# Filter the dataframe
matches = matches[matches['tournament_id'].isin(tournaments_to_keep)].reset_index(drop=True)

# Preview
matches['tournament_id'].value_counts()

tournament_id
WC-2014    64
WC-2022    64
WC-2018    64
WC-2010    64
WC-2002    64
WC-2006    64
Name: count, dtype: int64

In [7]:
# Keep only relevant columns
keep_cols = [
    "tournament_id",
    "match_id",
    "stage_name",
    "group_stage",
    "knockout_stage",
    "match_date",
    "match_time",
    "match_datetime",
    "city_name",
    "country_name",
    "home_team_id",
    "home_team_name",
    "home_team_code",
    "away_team_id",
    "away_team_name",
    "away_team_code",
    "home_team_score",
    "away_team_score",
    "home_team_score_margin",
    "away_team_score_margin",
    "extra_time",
    "penalty_shootout"
]

matches_clean = matches[keep_cols].copy()
matches_clean.to_csv("../../data-processed/matches_clean.csv", index=False)
matches_clean.head

<bound method NDFrame.head of     tournament_id   match_id      stage_name  group_stage  knockout_stage  \
0         WC-2014  M-2014-57  quarter-finals            0               1   
1         WC-2022  M-2022-17     group stage            1               0   
2         WC-2022  M-2022-21     group stage            1               0   
3         WC-2014  M-2014-15     group stage            1               0   
4         WC-2014  M-2014-46     group stage            1               0   
..            ...        ...             ...          ...             ...   
379       WC-2022  M-2022-40     group stage            1               0   
380       WC-2022  M-2022-39     group stage            1               0   
381       WC-2014  M-2014-08     group stage            1               0   
382       WC-2022  M-2022-36     group stage            1               0   
383       WC-2022  M-2022-47     group stage            1               0   

     match_date match_time      match_datetim