In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import warnings
warnings.filterwarnings('ignore')

## Import & Prepare DataFrame

In [2]:
def remove_whitespaces(df: pd.DataFrame) -> None:
    """Strip leading/trailing whitespace from column names and string cells, in place.

    Only values that are real ``str`` instances are stripped. Non-string
    column labels and non-string cells inside object columns (numbers,
    booleans, ``None``/NaN, ...) are left exactly as they are. Going through
    the ``.str`` accessor alone would replace such cells with NaN, or raise
    ``AttributeError`` for an object column that holds no strings at all.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to clean. It is modified in place.

    Returns
    -------
    None
    """
    def _strip_if_str(value):
        # Leave anything that is not text untouched
        return value.strip() if isinstance(value, str) else value

    # Remove whitespace from each column name
    df.columns = df.columns.map(_strip_if_str)

    # Remove whitespace from each string value in the object-dtype columns
    for column in df.select_dtypes("O").columns:
        values = df[column]
        is_text = values.map(lambda cell: isinstance(cell, str))
        if not is_text.any():
            # Nothing to strip; also avoids calling .str on a non-text column
            continue
        # Take the stripped value where the cell is text, the original otherwise
        df[column] = values.str.strip().where(is_text, values)

In [6]:
MATCHES_DATA_PATH = "../data/matches_data/historical_matches.csv"
PLAYERS_DIR = "../data/players_data"

# Load the matches data
matches = pd.read_csv(MATCHES_DATA_PATH)

# Load the players data: every CSV in PLAYERS_DIR is read and stacked.
# glob returns paths in arbitrary order, so sort them to make the row order
# of `players` reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(PLAYERS_DIR, "*.csv")))
if not all_files:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No player CSV files found in {PLAYERS_DIR!r}")

temp_list = []
for filename in all_files:
    temp_df = pd.read_csv(filename, index_col=None, header=0)

    # The last two characters of the file stem are read as a two-digit year
    # and offset by 2000. Working on the base name keeps dots in the directory
    # part of the path from interfering.
    # NOTE(review): assumes every file stem ends in a two-digit year — confirm
    # against the contents of PLAYERS_DIR.
    stem = os.path.splitext(os.path.basename(filename))[0]
    year = 2000 + int(stem[-2:])
    temp_df["year"] = year

    temp_list.append(temp_df)

players = pd.concat(temp_list, axis=0, ignore_index=True)

# Remove whitespaces from column names and string values
remove_whitespaces(matches)
remove_whitespaces(players)

In [7]:
# Display the matches frame as the cell's output (last-expression rich display).
matches

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False
...,...,...,...,...,...,...,...,...,...
44055,2022-09-27,Norway,Serbia,0.0,2.0,UEFA Nations League,Oslo,Norway,False
44056,2022-09-27,Sweden,Slovenia,1.0,1.0,UEFA Nations League,Stockholm,Sweden,False
44057,2022-09-27,Kosovo,Cyprus,5.0,1.0,UEFA Nations League,Pristina,Kosovo,False
44058,2022-09-27,Greece,Northern Ireland,3.0,1.0,UEFA Nations League,Athens,Greece,False


In [8]:
# Parse the date strings and derive the calendar year of every match
matches["date"] = pd.to_datetime(matches["date"])
matches["year"] = matches["date"].dt.year

# Keep matches from 2015 onwards so they line up with the players data
# (described as covering 2015-2022; no upper bound is applied here)
matches = matches.loc[matches["year"].ge(2015)]

# Tournaments to retain: the ones whose squads resemble world cup squads
to_keep_tournaments = [
    "Friendly",
    "AFC Asian Cup qualification",
    "EAFF Championship",
    "African Cup of Nations qualification",
    "FIFA World Cup",
    "Kirin Challenge Cup",
    "UEFA Euro qualification",
    "Superclásico de las Américas",
    "Gulf Cup",
    "AFC Asian Cup",
    "African Cup of Nations",
    "FIFA World Cup qualification",
    "Copa América",
    "Gold Cup",
    "Copa América qualification",
    "Kirin Cup",
    "UEFA Euro",
    "Confederations Cup",
    "UEFA Nations League",
    "CONCACAF Nations League qualification",
    "CONCACAF Nations League",
    "CONMEBOL–UEFA Cup of Champions",
]

In [9]:
## Assert all to_keep tournaments are in dataframe
# The tournament column does not change inside the loop, so compute its
# distinct values once instead of on every iteration.
all_tournaments = matches["tournament"].unique()

for torn in to_keep_tournaments:
    # Number of match rows played in this tournament
    found = (matches["tournament"] == torn).sum()
    assert found > 0, f"Zero mathces found for tournament {torn}"
    # The name must appear exactly once among the distinct tournament values
    assert (all_tournaments == torn).sum() == 1, f"tournament {torn} NOT found in matches"


In [10]:
# Drop every match whose tournament is not in the whitelist
matches = matches.loc[matches["tournament"].isin(to_keep_tournaments)]

In [11]:
# Renumber both frames with a fresh default index, discarding the old labels.
# Rebinding the names (instead of inplace=True) avoids mutating a frame that
# was produced by filtering and keeps the cell safe to re-run.
matches = matches.reset_index(drop=True)
players = players.reset_index(drop=True)

In [16]:
# Number of distinct names in desired_cols (differs from len(desired_cols) if there are duplicates).
# NOTE(review): desired_cols is assigned in the cell that follows this one, so on a
# fresh top-to-bottom run this line raises NameError — move it below that cell.
len(set(desired_cols))

33

In [14]:
# Player columns of interest, grouped by the prefix of the column name
desired_cols = [
    # identity, rating and physical attributes
    "sofifa_id",
    "long_name",
    "overall",
    "potential",
    "age",
    "height_cm",
    "weight_kg",
    "nation_position",
    "nation_jersey_number",
    "preferred_foot",
    # attacking_*
    "attacking_crossing",
    "attacking_finishing",
    "attacking_heading_accuracy",
    "attacking_short_passing",
    "attacking_volleys",
    # skill_*
    "skill_dribbling",
    "skill_curve",
    "skill_fk_accuracy",
    "skill_long_passing",
    "skill_ball_control",
    # movement_*
    "movement_acceleration",
    "movement_sprint_speed",
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    # power_*
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
    # mentality_*
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
    # defending_*
    "defending_marking_awareness",
    "defending_standing_tackle",
    "defending_sliding_tackle",
    # goalkeeping_*
    "goalkeeping_diving",
    "goalkeeping_handling",
    "goalkeeping_kicking",
    "goalkeeping_positioning",
    "goalkeeping_reflexes",
]

# Show the players frame restricted to those columns
players.loc[:, desired_cols]

Unnamed: 0,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,...,mentality_vision,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,85,95,70,91,88,96,93,94,91,96,...,95,75,20,35,24,6,11,15,14,8
1,71,95,90,85,89,85,79,85,70,88,...,81,90,35,42,19,15,6,12,8,10
2,87,95,90,80,86,88,81,84,77,88,...,76,88,24,32,24,7,11,15,14,11
3,85,83,63,86,86,95,88,87,81,95,...,90,93,35,32,29,9,9,15,15,11
4,94,82,55,94,82,88,85,83,93,91,...,94,83,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142074,25,25,41,36,25,25,21,25,33,26,...,22,31,38,40,35,12,6,15,8,6
142075,22,47,43,25,27,29,20,22,24,30,...,39,47,25,25,25,13,11,12,15,11
142076,39,36,29,43,27,46,41,53,41,44,...,41,55,27,34,29,5,6,14,8,13
142077,49,24,57,40,25,27,24,21,34,32,...,21,31,54,52,51,8,14,7,10,9


## Apply Structuring to Merge

In [None]:
# players[(players["nationality_name"] == "Portugal") & (players["year"] == 2016)].to_csv("temp_csv.csv")

In [None]:
years = list(range(2015, 2023))

# Nations whose count (see below) is neither 0 nor 11 in at least one year.
# Only the nation is stored; the year in which it was flagged is not recorded.
temp_list = []

# year -> nations with a count of 0 in that year
zero_dict = {year: [] for year in years}

# Player rows that have a national-team position other than "SUB"
# (presumably the starting lineup — confirm against the data dictionary).
has_lineup_position = players["nation_position"].notna() & (players["nation_position"] != "SUB")

# Distinct sofifa_id per (year, nation), computed in a single pass instead of
# re-filtering the whole players frame for every year/nation pair.
# Pairs with no qualifying rows are absent from the mapping and count as 0.
lineup_counts = (
    players[has_lineup_position]
    .groupby(["year", "nationality_name"])["sofifa_id"]
    .nunique()
    .to_dict()
)

# Every nation present in the players data, in order of first appearance
nations = players["nationality_name"].unique()

for year in years:
    print("-"*20 + str(year) + "-"*20)
    for nation in nations:
        count = lineup_counts.get((year, nation), 0)

        if count not in [0, 11]:
            # Some positioned players exist, but not exactly eleven of them
            temp_list.append(nation)
            print(f"{nation} : {count}")
            print("-"*20)
            print("FLAAAAAAAAAAAAG!!!!!")
        elif count == 0:
            zero_dict[year].append(nation)

--------------------2015--------------------
--------------------2016--------------------
Portugal : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Brazil : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Egypt : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Hungary : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
Côte d'Ivoire : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Greece : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Czech Republic : 7
--------------------
FLAAAAAAAAAAAAG!!!!!
Chile : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
New Zealand : 8
--------------------
FLAAAAAAAAAAAAG!!!!!
Romania : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Paraguay : 8
--------------------
FLAAAAAAAAAAAAG!!!!!
South Africa : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
Bolivia : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
--------------------2017--------------------
--------------------2018--------------------
--------------------2019--------------------
--------------------2020--------------------
----

In [None]:
# NOTE(review): a pre-filter snapshot used to be taken here and is disabled:
# temp_df = matches.copy()

# For every year, remove the matches of that year in which either side is a
# nation listed in zero_dict for that year.
for year, nations in zero_dict.items():
    played_that_year = matches["year"] == year
    involves_nation = matches["home_team"].isin(nations) | matches["away_team"].isin(nations)
    matches = matches[~(played_that_year & involves_nation)]

# Remove the 2016 matches that involve any nation collected in temp_list.
# NOTE(review): the year is hard-coded to 2016 while temp_list does not record
# the year a nation was flagged in — confirm no other year gets flagged.
involves_flagged = matches["home_team"].isin(temp_list) | matches["away_team"].isin(temp_list)
indicies_to_drop = matches[involves_flagged & (matches["year"] == 2016)].index

# Rebind instead of dropping in place: matches is the result of a filter here
matches = matches.drop(indicies_to_drop)

NameError: name 'zero_dict' is not defined

In [None]:
# Two independent copies of the players frame to use in the merge step
players_1, players_2 = players.copy(), players.copy()

In [None]:
# Join player rows to match rows where the player's nation is the away team.
# NOTE(review): "year" is not part of the join keys, so a nation's players from
# every year are paired with that nation's matches from every year — confirm
# whether the year should be matched as well.
data = pd.merge(
    players_2,
    matches,
    left_on="nationality_name",
    right_on="away_team",
)
# TODO: the home-team side of the merge is still disabled:
# data = players_1.merge(data, left_on="nationality_name", right_on="home_team")

In [None]:
# Row count of temp_df relative to the row count of matches.
# NOTE(review): the only live assignment of temp_df visible in this notebook is
# inside the player-loading loop — confirm this is the frame intended here.
len(temp_df) / len(matches)

0.21334418226200164