In [1]:
import pandas as pd
import glob
import os
import warnings
warnings.filterwarnings('ignore')

## Import Historical and World Cup 2022 Data

In [2]:
def remove_whitespaces(df: pd.DataFrame) -> None:
    """Strip leading/trailing whitespace from column names and string values.

    Only genuine ``str`` objects are stripped. Non-string column labels and
    non-string cells inside object columns (numbers, ``None``/``NaN``, ...)
    are kept as they are. The ``.str`` accessor would instead replace such
    cells with ``NaN`` and raise on non-string column labels.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to clean. It is modified in place.

    Returns
    -------
    None
    """
    def _strip_if_str(value):
        # Anything that is not a plain string is passed through untouched
        return value.strip() if isinstance(value, str) else value

    # Remove whitespace from each column name
    df.columns = df.columns.map(_strip_if_str)

    # Remove whitespace from each string value
    categorical_columns = df.select_dtypes("O").columns
    for column in categorical_columns:
        df[column] = df[column].map(_strip_if_str)

In [3]:
HIST_PLAYERS_DIR = "../data/players_data"
WORLDCUP_PLAYERS_DATA_PATH = "../data/world_cup_2022_data/players_worldcup_2022.xlsx"

# Load the WorldCup players Data
players_worldCup = pd.read_excel(WORLDCUP_PLAYERS_DATA_PATH)

# Load the historical players data.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the concatenated frame reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(HIST_PLAYERS_DIR, "*.csv")))
if not all_files:
    # Fail with an explicit message instead of pd.concat's generic error
    raise FileNotFoundError(f"No historical players CSV files found in {HIST_PLAYERS_DIR!r}")
temp_list = []

for filename in all_files:
    temp_df = pd.read_csv(filename, index_col=None, header=0)
    # The year is read from the last two characters of the file name without
    # its extension (assumes names end in a two-digit year -- TODO confirm).
    file_stem = os.path.splitext(os.path.basename(filename))[0]
    year = int(file_stem[-2:]) + 2000
    temp_df["year"] = year
    temp_list.append(temp_df)

players_hist = pd.concat(temp_list, axis=0, ignore_index=True)

# Remove whitespaces from column names and string values
remove_whitespaces(players_worldCup)
remove_whitespaces(players_hist)

In [4]:
# Tag every World Cup row with the year value 2023
players_worldCup["year"] = 2023

# Restrict the historical frame to exactly the World Cup frame's columns,
# in the same order (a column missing from the historical data raises KeyError)
worldcup_columns = list(players_worldCup.columns)
players_hist = players_hist.loc[:, worldcup_columns]


In [5]:
# Stack the World Cup rows on top of the historical rows and renumber the index
players_all = pd.concat((players_worldCup, players_hist), ignore_index=True)

In [6]:
# players_all.to_excel("temp_player.xlsx", index=False)

In [7]:
# Number of missing values in each column of the combined frame
players_all.isna().sum()

long_name                           0
height_cm                           0
weight_kg                           0
age                                 0
overall                             0
potential                           0
sofifa_id                          13
nation_position                133643
nationality_name                    4
attacking_crossing                  0
attacking_finishing                 2
attacking_heading_accuracy          2
attacking_short_passing             2
attacking_volleys                   2
skill_dribbling                     0
skill_curve                         0
skill_fk_accuracy                   0
skill_long_passing                  0
skill_ball_control                  0
movement_acceleration               0
movement_sprint_speed               0
movement_agility                    0
movement_reactions                  0
movement_balance                    0
power_shot_power                    0
power_jumping                       0
power_stamin

In [8]:
# Distinct nationality_name values, to spot entries that are not country names
pd.unique(players_all["nationality_name"])

array(['Qatar', nan, 'Ecuador', '1667', 'Netherlands', 'England',
       '111115', 'Al Raed', 'United States', 'Wales', 'Argentina',
       'Atlético Madrid', 'Atlanta United', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia', '1383',
       'Germany', 'Spain', 'FC Barcelona', 'Manchester City', '1411',
       'Canada', 'Belgium', 'Morocco', 'Al Ain', 'Croatia', 'Brazil',
       'Serbia', 'Red Star', 'Cameroon', 'Switzerland', 'Portugal',
       'Ghana', 'Uruguay', 'Al Nassr', 'Korea Republic', 'Slovenia',
       'Senegal', 'Egypt', 'Italy', 'Costa Rica', 'Norway', 'Scotland',
       'Algeria', 'Slovakia', 'Hungary', 'Gabon', 'Nigeria', 'Sweden',
       'Austria', 'Montenegro', "Côte d'Ivoire", 'Bosnia and Herzegovina',
       'Finland', 'Greece', 'Armenia', 'Colombia', 'Russia', 'Turkey',
       'Jamaica', 'Czech Republic', 'Chile', 'Ukraine', 'Venezuela',
       'Togo', 'Burkina Faso', 'Northern Ireland', 'Congo DR', 'Israel',
       'Albania', '

In [11]:
# Numeric codes found in nationality_name and the nation each one is mapped to
to_rename = {"1667": "Senegal", "111115": "Iran", "1383": "Costa Rica", "1411": "Japan"}
# Club names that appear in nationality_name; rows carrying them are removed.
# NOTE(review): the unique() listing in this notebook also shows 'Al Raed',
# 'Al Ain' and 'Red Star', which are not listed here -- confirm whether those
# rows should be dropped too.
clubs_to_drop = [
    "Atlético Madrid",
    "Atlanta United",
    "FC Barcelona",
    "Manchester City",
    "Al Nassr"
]

# Rename some nations.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: with pandas
# Copy-on-Write that chained in-place call modifies a temporary object and
# leaves players_all unchanged. The related warning would not be visible
# here because this notebook ignores all warnings.
players_all["nationality_name"] = players_all["nationality_name"].replace(to_rename)

# Drop some clubs
players_all = players_all[~players_all["nationality_name"].isin(clubs_to_drop)]

In [12]:
# Keep only the rows that have a nation_position, then renumber the index from 0
has_nation_position = players_all["nation_position"].notna()
filtered_players_all = players_all.loc[has_nation_position].reset_index(drop=True)

In [14]:
# For every year, print each nation that has fewer than 12 rows in that year,
# together with how many of those rows carry a nation_position.
for year in filtered_players_all["year"].unique():
    # Rows of the current year only
    temp_df = filtered_players_all.loc[filtered_players_all["year"] == year]
    for nation in temp_df["nationality_name"].unique():
        nation_rows = temp_df.loc[temp_df["nationality_name"] == nation]
        positions_count = int(nation_rows["nation_position"].notna().sum())
        squad_size = len(nation_rows)
        if squad_size < 12:
            print(f"Year: {year} | Nation: {nation} | Positions: {positions_count}")


In [18]:
# Distinct nations among the rows whose year is 2023
filtered_players_all.loc[filtered_players_all["year"] == 2023, "nationality_name"].unique()

array(['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 'England', 'Iran',
       'United States', 'Wales', 'Argentina', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia',
       'Costa Rica', 'Germany', 'Spain', 'Japan', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Korea Republic'],
      dtype=object)

In [16]:
# Distinct nations across all years of the filtered frame
pd.unique(filtered_players_all["nationality_name"])

array(['Qatar', 'Ecuador', 'Senegal', 'Netherlands', 'England', 'Iran',
       'United States', 'Wales', 'Argentina', 'Saudi Arabia', 'Mexico',
       'Poland', 'France', 'Australia', 'Denmark', 'Tunisia',
       'Costa Rica', 'Germany', 'Spain', 'Japan', 'Canada', 'Belgium',
       'Morocco', 'Croatia', 'Brazil', 'Serbia', 'Cameroon',
       'Switzerland', 'Portugal', 'Ghana', 'Uruguay', 'Korea Republic',
       'Italy', 'Norway', 'Scotland', 'Hungary', 'Sweden', 'Austria',
       'Finland', 'Czech Republic', 'Ukraine', 'Greece',
       'Northern Ireland', 'Iceland', 'New Zealand', 'China PR', 'Russia',
       'Republic of Ireland', 'Romania', 'Slovenia', 'Egypt', 'Chile',
       "Côte d'Ivoire", 'Colombia', 'Turkey', 'Peru', 'Venezuela',
       'Paraguay', 'South Africa', 'Bulgaria', 'Bolivia', 'India'],
      dtype=object)

In [None]:





]

In [90]:
# Show the rows whose nationality_name is still the raw code '1411'.
# NOTE(review): the rename cell maps "1411" to "Japan", so on a fresh
# top-to-bottom run this presumably returns no rows -- confirm whether this
# inspection cell is still needed.
filtered_players_all.loc[filtered_players_all["nationality_name"].eq('1411')]


Unnamed: 0,long_name,height_cm,weight_kg,age,overall,potential,sofifa_id,nation_position,nationality_name,attacking_crossing,...,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,year
479,川島 永嗣,185,82,39,74,74,180739.0,SUB,1411,17,...,24.0,10,13,12,73.0,72.0,72.0,75.0,74.0,2023
480,山根 視来,178,72,28,72,72,233169.0,SUB,1411,64,...,43.0,67,71,69,13.0,8.0,8.0,8.0,15.0,2023
481,谷口 彰悟,183,75,30,73,73,233225.0,RES,1411,50,...,45.0,78,74,69,8.0,13.0,11.0,14.0,14.0,2023
482,板仓 晃,186,75,25,75,80,233152.0,RCB,1411,34,...,38.0,76,80,75,13.0,6.0,9.0,10.0,12.0,2023
483,長友 佑都,170,68,35,75,75,194359.0,LB,1411,72,...,44.0,73,74,73,12.0,13.0,8.0,13.0,15.0,2023
484,遠藤 航,178,76,29,80,80,232487.0,LDM,1411,55,...,62.0,78,79,73,9.0,13.0,7.0,11.0,5.0,2023
485,柴崎 岳,175,68,30,75,75,232883.0,RDM,1411,65,...,72.0,70,76,74,10.0,13.0,13.0,9.0,6.0,2023
486,堂安 律,172,70,24,75,81,232639.0,SUB,1411,70,...,58.0,39,36,42,8.0,5.0,6.0,13.0,14.0,2023
487,三笘 薫,178,71,25,74,77,255565.0,SUB,1411,60,...,63.0,56,59,54,10.0,13.0,10.0,14.0,9.0,2023
488,南野 拓実,174,67,27,80,80,226627.0,LM,1411,72,...,72.0,34,33,29,9.0,12.0,8.0,15.0,14.0,2023
