In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import glob
import os
import warnings
warnings.filterwarnings('ignore')

## Import & Prepare DataFrame

In [2]:
def remove_whitespaces(df: pd.DataFrame) -> None:
    """Remove whitespaces from column names and string values

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to clean
    """
    # Remove whitespace from each column name
    df.columns = df.columns.str.strip()
    
    # Remove whitespace from each string value
    categorical_columns = df.select_dtypes("O").columns
    for column in categorical_columns:
        df[column] = df[column].str.strip()

In [3]:
MATCHES_DATA_PATH = "../data/matches_data/historical_matches.csv"
PLAYERS_DATA_PATH = "../data/players_data/players_all_prepared.csv"

# Load the matches data
matches = pd.read_csv(MATCHES_DATA_PATH)

# Load the players data
players = pd.read_csv(PLAYERS_DATA_PATH)


# Remove whitespaces from column names and string values
remove_whitespaces(matches)
remove_whitespaces(players)

In [4]:
# Matches date to datetime
matches["date"] = pd.to_datetime(matches["date"])
matches["year"] = matches["date"].dt.year

# Filter years in matches to match the given players data dates [2015, 2022]
matches = matches[matches["year"] >= 2014]

# Filter tournaments to keep only ones  having players as in world cup matches
to_keep_tournaments = ["Friendly", 
"AFC Asian Cup qualification",
"EAFF Championship",
"African Cup of Nations qualification",
"FIFA World Cup",
"Kirin Challenge Cup",
"UEFA Euro qualification",
"Superclásico de las Américas",
"Gulf Cup",
"AFC Asian Cup",
"African Cup of Nations",
"FIFA World Cup qualification",
"Copa América",
"Gold Cup",
"Copa América qualification",
"Kirin Cup",
"UEFA Euro",
"Confederations Cup",
"UEFA Nations League",
"CONCACAF Nations League qualification",
"CONCACAF Nations League",
"CONMEBOL–UEFA Cup of Champions"]

In [5]:
## Assert all to_keep torunments are in dataframe
for torn in to_keep_tournaments:
    all_tournaments = matches["tournament"].unique()
    found = matches[matches["tournament"] == torn].count().sum()
    assert found > 0, f"Zero mathces found for tournament {torn}"
    assert (all_tournaments == torn).sum() == 1, f"tournament {torn} NOT found in matches"


In [6]:
# Filter matches to keep only the ones in the tournaments to keep
matches = matches[matches["tournament"].isin(to_keep_tournaments)]

# matches net score
matches["net_score"] = matches["home_score"] - matches["away_score"]

# Keep only desired columns
matches = matches[["year", "home_team", "away_team", "net_score"]]

# Rename columns
matches.rename(columns={"home_team": "1st_team", "away_team": "2nd_team"}, inplace=True)

In [7]:
matches.reset_index(drop=True, inplace=True)
players.reset_index(drop=True, inplace=True)

## Merge Data

In [8]:
## Adjust year value. The FIFA data collected in 2023 represents the year 2022, and this applies to all the data.
## This adjusment is to correctly merge matches with the corresponding players data 
players["year"] -= 1


In [9]:
## get unique positions
all_positions = players["nation_position"].unique()
all_positions

array(['GK', 'RWB', 'LCB', 'SUB', 'LCM', 'RW', 'LW', 'RCM', 'LWB', 'RCB',
       'CB', 'ST', 'RES', 'LB', 'RDM', 'LM', 'RB', 'RM', 'LDM', 'CDM',
       'RS', 'CAM', 'LS', 'LF', 'RF', 'CF', 'CM', 'LAM', 'RAM'],
      dtype=object)

In [10]:
drop_ind = []
for i, row in matches.iterrows():
    year = row["year"]
    team1 = row["1st_team"].lower().strip()
    team2 = row["2nd_team"].lower().strip()

    # Check if team1 is in players data
    team1_players = players[(players["year"] == year) & (players["nationality_name"].str.lower() == team1)]
    if team1_players.empty:
        drop_ind.append(i)
        continue

    # Check if team2 is in players data 
    team2_players = players[(players["year"] == year) & (players["nationality_name"].str.lower()== team2)]
    if team2_players.empty:
        drop_ind.append(i)
        continue

In [11]:
# invalid matches ratio
len(drop_ind)/matches.shape[0]

0.8362068965517241

In [12]:
matches.drop(drop_ind, inplace=True)

In [13]:
grouped_players = players.groupby(["nationality_name", "year", "nation_position"]).mean()

In [14]:
# dataset = matches.copy().reset_index(drop=True)
matches.reset_index(drop=True, inplace=True)
error_count = 0
counter = 0
dataset = pd.DataFrame()
for i, match in matches.iterrows():
    match_dataframe = pd.DataFrame()
    match_dataframe = pd.DataFrame(columns=["1st_team", "2nd_team", "year", "net_score"])
    first_team = match["1st_team"]
    second_team =  match["2nd_team"]
    year = match["year"]
    match_dataframe.loc[0, "1st_team"] = match["1st_team"]
    match_dataframe.loc[0, "2nd_team"] = match["2nd_team"]
    match_dataframe.loc[0, "year"] = match["year"]
    match_dataframe.loc[0, "net_score"] = match["net_score"]
    # print(match_dataframe)
    counter+=1
    for pos in all_positions:
        # new_row = pd.DataFrame()
        try:
            # Get players data for first team
            plyrs_record1 = pd.DataFrame(grouped_players.loc[first_team, year, pos])
            plyrs_record2 = pd.DataFrame(grouped_players.loc[second_team, year, pos])
            plyrs_record1 = plyrs_record1.T.reset_index(drop=True)
            plyrs_record2 = plyrs_record2.T.reset_index(drop=True)
        except:
            plyrs_record2 = pd.DataFrame(np.zeros((1,grouped_players.columns.shape[0])), columns=grouped_players.columns)
            plyrs_record1 = pd.DataFrame(np.zeros((1,grouped_players.columns.shape[0])), columns=grouped_players.columns)

        
        
        plyrs_record1.columns = pos + "_" + plyrs_record1.columns + "_1st"
        plyrs_record2.columns = pos + "_" + plyrs_record2.columns + "_2nd"
        match_dataframe = pd.concat([match_dataframe, plyrs_record1, plyrs_record2], axis=1)
        # dataset_cols = dataset.columns
        # final_ds = pd.concat([pd.DataFrame(dataset.loc[i]).T, plyrs_record2], axis=1, ignore_index=True)
        # final_ds = pd.concat([pd.DataFrame(dataset.loc[i]).T, plyrs_record1], axis=1, ignore_index=True)
        # final_ds.columns = dataset_cols.tolist() + plyrs_record2.columns.tolist() + plyrs_record1.columns.tolist()
        # print(new_row)
    # dataset = pd.concat([dataset, new_row], axis=0, ignore_index=True)
        # break
    dataset = pd.concat([dataset, match_dataframe], axis=0, ignore_index=True)
    # dataset = pd.concat([dataset, pd.DataFrame(match).T], axis=0, ignore_index=True)

    
        
        
        
    

In [15]:
dataset

Unnamed: 0,1st_team,2nd_team,year,net_score,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd
0,Norway,Poland,2014,-3.0,192.0,78.0,23.0,72.0,75.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Australia,Ecuador,2014,-1.0,182.0,84.0,22.0,73.0,79.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Austria,Uruguay,2014,0.0,194.0,85.0,30.0,64.0,64.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Czech Republic,Norway,2014,0.0,196.0,90.0,32.0,85.0,85.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,England,Denmark,2014,1.0,196.0,91.0,27.0,82.0,83.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116,Canada,Uruguay,2022,-2.0,195.0,84.0,34.0,75.0,75.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1117,Ecuador,Japan,2022,0.0,195.0,81.0,35.0,74.0,74.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1118,Iran,Senegal,2022,0.0,194.0,85.0,29.0,74.0,75.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1119,Saudi Arabia,United States,2022,0.0,185.0,79.0,30.0,71.0,71.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
dataset.describe()

Unnamed: 0,GK_height_cm_1st,GK_weight_kg_1st,GK_age_1st,GK_overall_1st,GK_potential_1st,GK_attacking_crossing_1st,GK_attacking_finishing_1st,GK_attacking_heading_accuracy_1st,GK_attacking_short_passing_1st,GK_attacking_volleys_1st,...,RAM_mentality_vision_2nd,RAM_mentality_penalties_2nd,RAM_defending_marking_awareness_2nd,RAM_defending_standing_tackle_2nd,RAM_defending_sliding_tackle_2nd,RAM_goalkeeping_diving_2nd,RAM_goalkeeping_handling_2nd,RAM_goalkeeping_kicking_2nd,RAM_goalkeeping_positioning_2nd,RAM_goalkeeping_reflexes_2nd
count,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,...,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0,1121.0
mean,188.286351,83.785013,29.322926,78.155219,79.416592,15.512043,14.049063,16.19893,33.038359,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,19.293344,10.046463,4.804304,10.078043,10.227172,4.569511,4.948151,5.234044,10.663495,5.147815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,187.0,80.0,27.0,74.0,75.0,12.0,11.0,12.0,27.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,190.0,84.0,30.0,79.0,80.0,15.0,13.0,15.0,32.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,193.0,89.0,32.0,83.0,85.0,18.0,15.0,19.0,38.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,203.0,96.0,44.0,92.0,93.0,25.0,25.0,32.0,60.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
dataset.to_csv("../data/train_data/dataset.csv", index=False)