In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import warnings
warnings.filterwarnings('ignore')

## Import & Prepare DataFrame

In [2]:
def remove_whitespaces(df: pd.DataFrame) -> None:
    """Remove whitespaces from column names and string values

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to clean
    """
    # Remove whitespace from each column name
    df.columns = df.columns.str.strip()
    
    # Remove whitespace from each string value
    categorical_columns = df.select_dtypes("O").columns
    for column in categorical_columns:
        df[column] = df[column].str.strip()

In [3]:
MATCHES_DATA_PATH = "../data/matches_data/historical_matches.csv"
PLAYERS_DIR = "../data/players_data"

# Load the matches data
matches = pd.read_csv(MATCHES_DATA_PATH)

# Load the players data
all_files = glob.glob(os.path.join(PLAYERS_DIR , "*.csv"))
temp_list = []

for filename in all_files:
    temp_df = pd.read_csv(filename, index_col=None, header=0)
    year = int(filename.split(".")[-2][-2:]) + 2000
    temp_df["year"] = year


    temp_list.append(temp_df)

players = pd.concat(temp_list, axis=0, ignore_index=True)

# Remove whitespaces from column names and string values
remove_whitespaces(matches)
remove_whitespaces(players)

In [4]:
# Matches date to datetime
matches["date"] = pd.to_datetime(matches["date"])
matches["year"] = matches["date"].dt.year

# Filter years in matches to match the given players data dates [2015, 2022]
matches = matches[matches["year"] >= 2015]

# Filter tournaments to keep only ones  having players as in world cup matches
to_keep_tournaments = ["Friendly", 
"AFC Asian Cup qualification",
"EAFF Championship",
"African Cup of Nations qualification",
"FIFA World Cup",
"Kirin Challenge Cup",
"UEFA Euro qualification",
"Superclásico de las Américas",
"Gulf Cup",
"AFC Asian Cup",
"African Cup of Nations",
"FIFA World Cup qualification",
"Copa América",
"Gold Cup",
"Copa América qualification",
"Kirin Cup",
"UEFA Euro",
"Confederations Cup",
"UEFA Nations League",
"CONCACAF Nations League qualification",
"CONCACAF Nations League",
"CONMEBOL–UEFA Cup of Champions"]

In [5]:
## Assert all to_keep torunments are in dataframe
for torn in to_keep_tournaments:
    all_tournaments = matches["tournament"].unique()
    found = matches[matches["tournament"] == torn].count().sum()
    assert found > 0, f"Zero mathces found for tournament {torn}"
    assert (all_tournaments == torn).sum() == 1, f"tournament {torn} NOT found in matches"


In [6]:
# Filter matches to keep only the ones in the tournaments to keep
matches = matches[matches["tournament"].isin(to_keep_tournaments)]

In [7]:
matches.reset_index(drop=True, inplace=True)
players.reset_index(drop=True, inplace=True)

## Apply Structuring to Merge

In [8]:
# Get copies to merge
players_1 = players.copy()
players_2 = players.copy()

In [11]:
data = players_1.merge( matches, left_on="nationality_name", right_on="home_team")

In [30]:
players[(players["nationality_name"] == "Portugal") & (players["year"] == 2016)].to_csv("temp_csv.csv")

In [42]:
years = list(range(2015, 2023))
temp_list = []
zero_dict = {2015: [], 
             2016: [], 
             2017: [],
             2018: [],
             2019: [],
             2020: [],
             2021: [],
             2022: []}
for year in years:
    print("-"*20 + str(year) + "-"*20)
    for nation in (players["nationality_name"].unique()):
        
        count = players[(players["nationality_name"] == nation) & (players["year"] == year) & (~players["nation_position"].isna()) & (players["nation_position"]!= "SUB")]["sofifa_id"].nunique()
        
        # assert (count == 11) or (count == 0), f"{count} players for {nation} in {year}"
        
        # print(f"{nation} : {count}")
        # print("-"*20)


        if count not in [0,11]:
            temp_list.append(nation)
            print(f"{nation} : {count}")
            print("-"*20)
            print("FLAAAAAAAAAAAAG!!!!!")
        elif count == 0:
            zero_dict[year].append(nation)

--------------------2015--------------------
--------------------2016--------------------
Portugal : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Brazil : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Egypt : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Hungary : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
Côte d'Ivoire : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Greece : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Czech Republic : 7
--------------------
FLAAAAAAAAAAAAG!!!!!
Chile : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
New Zealand : 8
--------------------
FLAAAAAAAAAAAAG!!!!!
Romania : 10
--------------------
FLAAAAAAAAAAAAG!!!!!
Paraguay : 8
--------------------
FLAAAAAAAAAAAAG!!!!!
South Africa : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
Bolivia : 9
--------------------
FLAAAAAAAAAAAAG!!!!!
--------------------2017--------------------
--------------------2018--------------------
--------------------2019--------------------
--------------------2020--------------------
----

In [61]:
temp_df = matches.copy()
for year in zero_dict.keys():
    temp_df = temp_df[~(temp_df["home_team"].isin(zero_dict[year]) & (temp_df["year"] == year))]
    temp_df = temp_df[~(temp_df["away_team"].isin(zero_dict[year]) & (temp_df["year"] == year))]

indicies_to_drop = temp_df[(temp_df["home_team"].isin(temp_list)) | temp_df["away_team"].isin(temp_list)].loc[temp_df["year"] == 2016].index
temp_df.drop(indicies_to_drop, inplace=True)

In [62]:
temp_df.shape[0]/matches.shape[0]

0.21334418226200164

In [76]:
players[players["nationality_name"] == "Qatar"]

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url,year
64267,234051,https://sofifa.com/player/234051/akram-afif/18...,A. Afif,Akram Hassan Afif,"LM, RM",68,77,1400000.0,7000.0,20,...,42+1,42+1,48+1,15+1,https://cdn.sofifa.net/players/234/051/18_120.png,https://cdn.sofifa.net/teams/2013/60.png,https://cdn.sofifa.net/flags/be.png,,https://cdn.sofifa.net/flags/qa.png,2018
66501,239861,https://sofifa.com/player/239861/abdelkarim-fa...,A. Fadlalla,Abdelkarim Hassan Al Haj Fadlalla,"LB, LWB",66,72,750000.0,3000.0,23,...,65+1,65+1,65+1,15+1,https://cdn.sofifa.net/players/239/861/18_120.png,https://cdn.sofifa.net/teams/2013/60.png,https://cdn.sofifa.net/flags/be.png,,https://cdn.sofifa.net/flags/qa.png,2018
67535,245688,https://sofifa.com/player/245688/yasser-ahmed/...,Y. Ahmed,Ahmed Yasser Mohammedi Abdelrehman,CB,65,70,625000.0,3000.0,23,...,64+1,64+1,61+1,14+1,https://cdn.sofifa.net/players/245/688/18_120.png,https://cdn.sofifa.net/teams/15012/60.png,https://cdn.sofifa.net/flags/es.png,,https://cdn.sofifa.net/flags/qa.png,2018
69451,239862,https://sofifa.com/player/239862/hamza-sanhaji...,H. Sanhaji,Hamza Sanhaji,"ST, LM, RM",63,68,500000.0,3000.0,23,...,35+1,35+1,41+1,14+1,https://cdn.sofifa.net/players/239/862/18_120.png,https://cdn.sofifa.net/teams/2013/60.png,https://cdn.sofifa.net/flags/be.png,,https://cdn.sofifa.net/flags/qa.png,2018
69497,241003,https://sofifa.com/player/241003/ahmad-moein/1...,A. Moein,Ahmed Moein Doozandeh,CM,63,72,550000.0,2000.0,21,...,53+1,53+1,57+1,16+1,https://cdn.sofifa.net/players/241/003/18_120.png,https://cdn.sofifa.net/teams/15012/60.png,https://cdn.sofifa.net/flags/es.png,,https://cdn.sofifa.net/flags/qa.png,2018
70943,239878,https://sofifa.com/player/239878/assim-madibo/...,A. Madibo,Assim Omer Al Haj Madibo,CDM,61,70,350000.0,2000.0,20,...,59+1,59+1,57+1,16+1,https://cdn.sofifa.net/players/239/878/18_120.png,https://cdn.sofifa.net/teams/2013/60.png,https://cdn.sofifa.net/flags/be.png,,https://cdn.sofifa.net/flags/qa.png,2018
99115,234051,https://sofifa.com/player/234051/akram-afif/17...,A. Afif,Akram Hassan Afif,"LW, ST",69,79,1500000.0,10000.0,19,...,41+1,41+1,48+1,15+1,https://cdn.sofifa.net/players/234/051/17_120.png,https://cdn.sofifa.net/teams/459/60.png,https://cdn.sofifa.net/flags/es.png,,https://cdn.sofifa.net/flags/qa.png,2017
105773,234043,https://sofifa.com/player/234043/fahad-al-abdu...,F. Al Abdulrahman,Fahad Ali Shonain Al Abdulrahman,LB,62,69,325000.0,2000.0,21,...,56+1,56+1,61+1,16+1,https://cdn.sofifa.net/players/234/043/17_120.png,https://cdn.sofifa.net/teams/2013/60.png,https://cdn.sofifa.net/flags/be.png,,https://cdn.sofifa.net/flags/qa.png,2017


In [75]:
"D. Clair"
players[(players["year"] == 2022) & (players["nationality_name"] == "Canada")]["long_name"].unique()

array(['Alphonso Boyle Davies', 'Jonathan Christian David',
       'Atiba Hutchinson', 'Milan Borjan', 'Stephen Antunes Eustáquio',
       'Cyle Christopher Larin', 'Scott Harry Nathaniel Arfield',
       'Lucas Daniel Cavallini', 'Maxime Crépeau', 'David Junior Hoilett',
       'Jonathan Osorio', 'Samuel Piette', 'Richie Mamah Laryea',
       'Mark-Anthony Kaye', 'Doneil Jor-Dee Ashley Henry',
       'David Wotherspoon', 'Samuel Ayomide Adekugbe',
       'Cristián Daniel Gutiérrez Zúñiga', 'Zachary Brault-Guillard',
       'Tajon Buchanan', 'Dayne St. Clair', 'Kamal Anthony Miller',
       'Tesho Akindele', 'Caniggia Ginola Elva', 'Russell Teibert',
       'Ricardo José Araújo Ferreira', 'Manjrekar James',
       'Liam Alan Millar', 'Ayomide Akinola', 'Alistair Johnston',
       'Scott Fitzgerald Kennedy', 'Steven de Sousa Vitória',
       'Tyler Geoffrey Pasher', 'Liam Fraser', 'Derek Cornelius',
       'Jayson Leutwiler', 'Raheem Edwards', 'Ashtone Morgan',
       'James Pantemis', 

In [67]:
temp_df["home_team"].unique()

array(['Cameroon', 'Sweden', 'Australia', 'Finland', 'Cape Verde',
       'Yemen', 'Chile', 'DR Congo', 'Timor-Leste', 'India', 'Brunei',
       'Mongolia', 'Nepal', 'Pakistan', 'Botswana', 'Denmark', 'Germany',
       'Scotland', 'Eswatini', 'France', 'Bulgaria', 'Netherlands',
       'Brazil', 'Dominica', 'Hungary', 'Republic of Ireland',
       'Northern Ireland', 'Italy', 'South Korea', 'Peru', 'Portugal',
       'Switzerland', 'United States', 'Lesotho', 'Bangladesh',
       'Singapore', 'Norway', 'Turkey', 'Laos', 'Mexico', 'Wales',
       'Argentina', 'Colombia', 'Russia', 'Slovenia', 'Ecuador',
       'Myanmar', 'Poland', 'Uruguay', 'Bolivia', 'Greece', 'Romania',
       'England', 'Venezuela', 'Djibouti', 'Czech Republic',
       'North Korea', 'Paraguay', 'Belgium', 'Spain', 'Austria',
       'Martinique', 'Catalonia', 'Maldives', 'Guadeloupe',
       'Sint Maarten', 'China PR', 'Canada', 'Seychelles', 'Galicia',
       'Corsica', 'Northern Mariana Islands', 'Taiwan', 'Tahiti

In [13]:
players.shape[1] + matches.shape[1]

121

In [7]:
pd.merge([players])

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url,year
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png,2022
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png,2022
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png,2022
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png,2022
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142074,220806,https://sofifa.com/player/220806/ellis-redman/...,E. Redman,Ellis Redman,CB,41,61,20000.0,2000.0,17,...,41,41,40,,https://cdn.sofifa.net/players/220/806/15_120.png,https://cdn.sofifa.net/teams/112254/60.png,https://cdn.sofifa.net/flags/gb-eng.png,,https://cdn.sofifa.net/flags/gb-wls.png,2015
142075,225509,https://sofifa.com/player/225509/aaron-collins...,A. Collins,Aaron Graham John Collins,ST,41,50,30000.0,2000.0,17,...,31,31,32,,https://cdn.sofifa.net/players/225/509/15_120.png,https://cdn.sofifa.net/teams/112254/60.png,https://cdn.sofifa.net/flags/gb-eng.png,,https://cdn.sofifa.net/flags/gb-wls.png,2015
142076,201197,https://sofifa.com/player/201197/paul-tisdale/...,P. Tisdale,Paul Tisdale,"CM, CAM",40,40,,2000.0,41,...,33,33,34,,https://cdn.sofifa.net/players/201/197/15_120.png,https://cdn.sofifa.net/teams/143/60.png,https://cdn.sofifa.net/flags/gb-eng.png,,https://cdn.sofifa.net/flags/mt.png,2015
142077,217591,https://sofifa.com/player/217591/piotr-zemlo/1...,P. Żemło,Piotr Żemło,"LM, LB",40,50,15000.0,2000.0,18,...,53-3,53-3,51-1,,https://cdn.sofifa.net/players/217/591/15_120.png,https://cdn.sofifa.net/teams/1873/60.png,https://cdn.sofifa.net/flags/pl.png,,https://cdn.sofifa.net/flags/pl.png,2015
