In [1]:
import pandas as pd
import glob
import os
import warnings
warnings.filterwarnings('ignore')

## Import Historical and World Cup 2022 Data

In [2]:
def remove_whitespaces(df: pd.DataFrame) -> None:
    """Strip leading/trailing whitespace from column names and string values.

    The dataframe is modified in place and nothing is returned. Only values
    that are actual ``str`` instances are stripped; any other value (numbers,
    ``None``/``NaN``, ...) found among the column labels or inside an object
    column is kept unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to clean
    """
    def _strip_if_str(value):
        # Leave non-string values alone: the ``.str`` accessor would replace
        # them with NaN in a mixed-type object column (silent data loss).
        return value.strip() if isinstance(value, str) else value

    # Remove whitespace from each column name (non-string labels are kept).
    df.columns = df.columns.map(_strip_if_str)

    # Remove whitespace from each string value of the object-dtype columns.
    # The selection happens after the rename so the stripped names are used.
    categorical_columns = df.select_dtypes("O").columns
    for column in categorical_columns:
        df[column] = df[column].map(_strip_if_str)

In [4]:
HIST_PLAYERS_DIR = "../data/players_data"
WORLDCUP_PLAYERS_DATA_PATH = "../data/world_cup_2022_data/players_worldcup_2022.xlsx"

# Load the WorldCup players Data
players_worldCup = pd.read_excel(WORLDCUP_PLAYERS_DATA_PATH)

# Load the historical players data.
# glob returns matches in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible from one run to the next.
all_files = sorted(glob.glob(os.path.join(HIST_PLAYERS_DIR, "*.csv")))
temp_list = []

# Only these yearly files are loaded; any other CSV in the directory is skipped.
files_list = ["players_15.csv", "players_16.csv", "players_17.csv", "players_18.csv", "players_19.csv", "players_20.csv", "players_21.csv", "players_22.csv"]

for filename in all_files:
    # os.path.basename uses the platform's path separator rules, unlike a
    # hard-coded split on "/".
    base_name = os.path.basename(filename)
    if base_name not in files_list:
        continue
    temp_df = pd.read_csv(filename, index_col=None, header=0)
    # The two characters before the extension are the two-digit year
    # ("players_15.csv" -> 2015).
    year = int(os.path.splitext(base_name)[0][-2:]) + 2000
    temp_df["year"] = year
    temp_list.append(temp_df)

# Fail with an explicit message instead of pd.concat's "No objects to concatenate".
if not temp_list:
    raise FileNotFoundError(
        f"None of the expected historical player files were found in {HIST_PLAYERS_DIR!r}"
    )

players_hist = pd.concat(temp_list, axis=0, ignore_index=True)

# Remove whitespaces from column names and string values
remove_whitespaces(players_worldCup)
remove_whitespaces(players_hist)

In [5]:
# Tag every World Cup row with the year value 2023.
players_worldCup["year"] = 2023

# Restrict the historical frame to the World Cup columns, in the same order.
# A column missing from the historical data raises a KeyError here.
players_hist = players_hist[list(players_worldCup.columns)]


In [6]:
# Stack the World Cup rows on top of the historical rows under a fresh 0..n-1 index.
players_all = pd.concat(
    [players_worldCup, players_hist],
    axis=0,
    ignore_index=True,
)

## Data Wrangling

In [7]:
# Number of missing values per column (isna is the same check as isnull).
players_all.isna().sum()

long_name                           0
height_cm                           0
weight_kg                           0
age                                 0
overall                             0
potential                           0
sofifa_id                          13
nation_position                133643
nationality_name                    4
attacking_crossing                  0
attacking_finishing                 2
attacking_heading_accuracy          2
attacking_short_passing             2
attacking_volleys                   2
skill_dribbling                     0
skill_curve                         0
skill_fk_accuracy                   0
skill_long_passing                  0
skill_ball_control                  0
movement_acceleration               0
movement_sprint_speed               0
movement_agility                    0
movement_reactions                  0
movement_balance                    0
power_shot_power                    0
power_jumping                       0
power_stamin

In [8]:
# Keep only the rows that have a nation_position, then renumber the index.
filtered_players_all = players_all.loc[players_all["nation_position"].notna()].reset_index(drop=True)

In [9]:
# Distinct nationality values before cleaning, in order of first appearance.
pd.unique(filtered_players_all["nationality_name"])

array(['Qatar', 'Ecuador', '1667', 'Netherlands', 'England', '111115',
       'United States', 'Wales', 'Argentina', 'Atlético Madrid',
       'Atlanta United', 'Saudi Arabia', 'Mexico', 'Poland', 'France',
       'Australia', 'Denmark', 'Tunisia', '1383', 'Germany', 'Spain',
       'FC Barcelona', 'Manchester City', '1411', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Al Nassr',
       'Korea Republic', 'Italy', 'Norway', 'Scotland', 'Hungary',
       'Sweden', 'Austria', 'Finland', 'Czech Republic', 'Ukraine',
       'Greece', 'Northern Ireland', 'Iceland', 'New Zealand', 'China PR',
       'Russia', 'Republic of Ireland', 'Romania', 'Slovenia', 'Egypt',
       'Chile', "Côte d'Ivoire", 'Colombia', 'Turkey', 'Peru',
       'Venezuela', 'Paraguay', 'South Africa', 'Bulgaria', 'Bolivia',
       'India'], dtype=object)

In [10]:
# Numeric codes found in the nationality column, mapped to nation names.
to_rename = {"1667": "Senegal", "111115": "Iran", "1383": "Costa Rica", "1411": "Japan"}
# Club names found in the nationality column; rows carrying them are removed.
clubs_to_drop = [
    "Atlético Madrid",
    "Atlanta United",
    "FC Barcelona",
    "Manchester City",
    "Al Nassr"
]

# Rename some nations.
# The result is assigned back to the column: calling replace(inplace=True) on
# the selected column is a chained in-place update, which leaves the dataframe
# unchanged when pandas Copy-on-Write is active.
filtered_players_all["nationality_name"] = filtered_players_all["nationality_name"].replace(to_rename)

# Drop some clubs
filtered_players_all = filtered_players_all[~filtered_players_all["nationality_name"].isin(clubs_to_drop)]

In [11]:
# Report every (year, nation) group with fewer than 12 rows, together with the
# number of those rows that carry a nation_position.
for year in filtered_players_all["year"].unique():
    temp_df = filtered_players_all.loc[filtered_players_all["year"] == year]
    for nation in temp_df["nationality_name"].unique():
        is_nation = temp_df["nationality_name"] == nation
        has_position = temp_df["nation_position"].notna()
        positions_count = len(temp_df.loc[is_nation & has_position])
        if len(temp_df.loc[is_nation]) >= 12:
            continue
        print(f"Year: {year} | Nation: {nation} | Positions: {positions_count}")


In [12]:
# Distinct nations among the rows tagged with year 2023.
filtered_players_all.loc[filtered_players_all["year"] == 2023, "nationality_name"].unique()

array(['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 'England', 'Iran',
       'United States', 'Wales', 'Argentina', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia',
       'Costa Rica', 'Germany', 'Spain', 'Japan', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Korea Republic'],
      dtype=object)

In [13]:
# Distinct nationality values across all years, in order of first appearance.
pd.unique(filtered_players_all["nationality_name"])

array(['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 'England', 'Iran',
       'United States', 'Wales', 'Argentina', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia',
       'Costa Rica', 'Germany', 'Spain', 'Japan', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Korea Republic',
       'Italy', 'Norway', 'Scotland', 'Hungary', 'Sweden', 'Austria',
       'Finland', 'Czech Republic', 'Ukraine', 'Greece',
       'Northern Ireland', 'Iceland', 'New Zealand', 'China PR', 'Russia',
       'Republic of Ireland', 'Romania', 'Slovenia', 'Egypt', 'Chile',
       "Côte d'Ivoire", 'Colombia', 'Turkey', 'Peru', 'Venezuela',
       'Paraguay', 'South Africa', 'Bulgaria', 'Bolivia', 'India'],
      dtype=object)

In [14]:
# Drop the sofifa_id column by reassignment instead of in place:
# errors="ignore" keeps the cell re-runnable (a second in-place drop would
# raise KeyError), and the frame produced by the earlier filter is not mutated.
filtered_players_all = filtered_players_all.drop(columns="sofifa_id", errors="ignore")

In [15]:
# Sanity check: no missing value may remain anywhere in the prepared frame.
assert not filtered_players_all.isna().any().any()
print(f"Players data shape: {filtered_players_all.shape}")

Players data shape: (9251, 42)


In [16]:
## Export Data
# Write the prepared frame to a CSV in the current working directory,
# without the index column.
filtered_players_all.to_csv(
    path_or_buf="players_all_prepared.csv",
    index=False,
)