# Name matching
## context:
We want to try models using the players statistics of a game in order to predict that game's outcome. </br>
the players stats that we will use are the FIFA player's stats, found at [this url](https://www.kaggle.com/stefanoleone992/fifa-22-complete-player-dataset "Kaggle Fifa complete dataset").</br>
The games results with the players appearance can be found at [this url](https://data.world/dcereijo/player-scores).
This dataset includes 40k+ games with 20k+ players from 300+ clubs</br></br>
The main focus of this notebook is to link the players from the latter dataset to those in the FIFA dataset, in order to build our model training dataframe. </br>
This is not a trivial task as the names are not always properly registered or simply not registered the same way, are not present from one dataset to another, and several players have the exact same name or a very similar name.

In [1]:
!pip install unidecode -q
!pip install python-Levenshtein -q
!pip install fuzzywuzzy -q
!pip install "tqdm>=4.9.0"



In [1]:
import pandas as pd
import numpy as np
import unidecode
from fuzzywuzzy import fuzz #this amazing library uses levenshtein distance (string similarity scores) to match similar strings
import inspect
from tqdm import tqdm
import re
tqdm.pandas()
pd.options.display.max_columns = None

In [8]:
#Use this function into a function that is itself into a list comprehension to retrieve the name of a variable passed as an argument of the function
#to use this into a function, delete one '.f_back', to use it in the script, delete two
def retrieve_name_list(var):
    callers_local_vars = inspect.currentframe().f_back.f_back.f_back.f_locals.items()
    return [var_name for var_name, var_val in callers_local_vars if var_val is var][0]

def retrieve_name(var):
    callers_local_vars = inspect.currentframe().f_back.f_back.f_locals.items()
    return [var_name for var_name, var_val in callers_local_vars if var_val is var][0]

#example
a=2
b=3

def foo(bar):
    return retrieve_name_list(bar)

def foo2(bar):
    return retrieve_name(bar)

print(foo2(a))
[foo(x) for x in [a,b]]


a


['a', 'b']

# Data

In [9]:
#read data
#data source url: https://data.world/dcereijo/player-scores
appearance = pd.read_csv('https://query.data.world/s/xemgpklltd3hlau4swg2vafdctgacf')

In [10]:
clubs = pd.read_csv('https://query.data.world/s/bmpof22nmwcl7dc4s5kf5l2pjf6l62')

In [11]:
leagues = pd.read_csv('https://query.data.world/s/zmlqmpvqs4atuxn3rsdkdqv5wa6c5o')

In [12]:
games = pd.read_csv('https://query.data.world/s/ntedgrx2r6shpsvskopamknbnl7sfk')

In [13]:
players = pd.read_csv('https://query.data.world/s/jyeqrkxvhxmqxzqfac2s6kquuxrfuo')

In [14]:
games.season.unique()

array([2014, 2013, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [15]:
display(clubs.loc[clubs.name.str.contains('ogc-'),:])
clubs.loc[clubs.name.str.contains('ogc-'),'name'] = 'ogc-nice'
clubs.loc[clubs.name.str.contains('ogc-'),'pretty_name'] = 'OGC Nice'
clubs.loc[clubs.name.str.contains('ogc-'),:]

Unnamed: 0,club_id,name,pretty_name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,url
341,417,ogc-nizza,Ogc Nizza,FR1,212.45,27,24.2,18,66.7,8,Allianz Riviera,36178,£-22.32m,Christophe Galtier,https://www.transfermarkt.co.uk/ogc-nizza/star...


Unnamed: 0,club_id,name,pretty_name,domestic_competition_id,total_market_value,squad_size,average_age,foreigners_number,foreigners_percentage,national_team_players,stadium_name,stadium_seats,net_transfer_record,coach_name,url
341,417,ogc-nice,OGC Nice,FR1,212.45,27,24.2,18,66.7,8,Allianz Riviera,36178,£-22.32m,Christophe Galtier,https://www.transfermarkt.co.uk/ogc-nizza/star...


In [16]:
players.name = players.name.str.replace("-"," ").str.replace('dj','d')

In [17]:
fifa22 = pd.read_csv('FIFA_stats/FIFA22.csv')
fifa21 = pd.read_csv('FIFA_stats/FIFA21.csv')
fifa20 = pd.read_csv('FIFA_stats/FIFA20.csv')
fifa19 = pd.read_csv('FIFA_stats/FIFA19.csv')
fifa18 = pd.read_csv('FIFA_stats/FIFA18.csv')
fifa17 = pd.read_csv('FIFA_stats/FIFA17.csv')
fifa16 = pd.read_csv('FIFA_stats/FIFA16.csv')
fifa15 = pd.read_csv('FIFA_stats/FIFA15.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [18]:
#replace columns name for consistency
fifa22.rename(columns={'club_name' : 'club'}, inplace = True)
fifa21.rename(columns={'club_name' : 'club'}, inplace = True)

#set Nan values in the club columns to an unmistakingly special string in order to apply string operation on the columns
fifa22.loc[fifa22.club.apply(type) != str , 'club'] = 'ZZZZZZ'
fifa21.loc[fifa21.club.apply(type) != str , 'club'] = 'ZZZZZZ'

In [19]:
#removes diacritics, hyphens, apostrophes on player names and club. 
#"dj" is replaced to "d" to avoid recurrent ambiguity related to eastern european names
def decode_fifa_list(fifalist):
    for fifa_df in fifalist:
        fifa_df.short_name = fifa_df.short_name.apply(unidecode.unidecode)
        fifa_df.long_name = fifa_df.long_name.apply(unidecode.unidecode)
        fifa_df.club = fifa_df.club.apply(unidecode.unidecode)

        fifa_df.short_name = fifa_df.short_name.str.lower().str.replace("'","").str.replace("-"," ").str.replace('dj','d')
        fifa_df.long_name = fifa_df.long_name.str.lower().str.replace("'","").str.replace("-"," ").str.replace('dj','d')
        
        
    return None

fifa_list = [fifa22,fifa21,fifa20,fifa19,fifa18,fifa17,fifa16,fifa15]
decode_fifa_list(fifa_list)

In [20]:
#players name columns is already preprocessed, execpt for that 'dj' issue
players.name = players.name.str.replace('dj','d')

In [74]:
link=players.loc[:,['player_id','name']]
link['name_compare'] = None
link['fuzz_score'] = None
link.head(1)

Unnamed: 0,player_id,name,name_compare,fuzz_score
0,38790,dmitri golubov,,


In [75]:
#ajoute la liste des clubs avec lesquels les joueurs ont joué un match
link['clubs'] = link.player_id.progress_apply(lambda x:
                    appearance.loc[appearance.player_id == x,'player_club_id'].unique() if type(x) == int
                                     else [appearance.loc[appearance.player_id == y,'player_club_id'].unique() for y in x]
                                    )

100%|██████████| 22432/22432 [00:28<00:00, 794.57it/s]


In [76]:
# Supprime les joueurs n'ayant jamais joué un seul match (~4800 joueurs quand même)
print(link.loc[ link.clubs.apply(len) == 0 ,:].shape)
link.drop(link.loc[ link.clubs.apply(len) == 0 ,:].index, inplace = True)
link.reset_index(drop = True,inplace = True)

(4793, 5)


In [77]:
#returns a list of the best matching names index, the corresponding fuzz.token_set_ratio max value and the corresponding names
def search_name(x,fifa_N):
    m = fifa_N.long_name.apply(lambda y: fuzz.token_set_ratio(x,y)).values   
    #each line in the fifa dataset is assign a similatiy score with the selected name from the players dataset
    
    return (n := np.argwhere(m == m.max()).squeeze()) , m.max()#, fifa_N.long_name[n] 
#returns 3 values: 1. the index of the best matching name(s) 2. the corresponding similarity score 3. the corresponding names

In [78]:
def search_all_names(fifa_N):
    k = link.name.progress_apply(lambda x:search_name(x,fifa_N))
    display(k.head(10))
    link.loc[:,retrieve_name(fifa_N)] = k.apply(lambda x : x[0])
    link.fuzz_score = k.apply(lambda x : x[1])
    #link.iloc[:10,:].name_compare = k.apply(lambda x : x[2])
    return link.loc[link.fuzz_score > 90,'fuzz_score'].shape



In [42]:
#applies the above functions to a fifa dataset.
#### WARNING, this takes ~1h to compute ####
[search_all_names(x) for x in [fifa21]]
link.to_csv('fifa21_res.csv')

100%|██████████| 17639/17639 [1:23:18<00:00,  3.53it/s]


0                            ([1694, 4286], 65)
1                            ([2434, 3514], 80)
2                                   (17443, 71)
3                                   (15840, 73)
4                                     (831, 69)
5                                    (3003, 75)
6                                    (4842, 62)
7    ([187, 316, 3650, 9040, 11898, 16374], 70)
8                                     (463, 71)
9                                   (10388, 79)
Name: name, dtype: object

In [None]:
search_all_names(fifa20)
link.to_csv('fifa20_res.csv')

 50%|████▉     | 8741/17639 [39:19<39:39,  3.74it/s]  

In [None]:
search_all_names(fifa19)
link.to_csv('fifa19_res.csv')

In [None]:
search_all_names(fifa18)
link.to_csv('fifa18_res.csv')

In [None]:
search_all_names(fifa17)
link.to_csv('fifa17_res.csv')

In [83]:
#link.to_csv('fifa22_res.csv')

## name filtering
#### 1. remove names with similarity score <90, which are unlikely to be correct

In [47]:
link = pd.read_csv('fifa21_res.csv', index_col=0)
link.head(1)

Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21
0,38790,dmitri golubov,,65,[28095],[1694 4286]


In [48]:
#link.drop(columns = ['Unnamed: 0','Unnamed: 0.1'], inplace = True)
print(link.shape)
link = link.loc[link.fuzz_score > 90,:]
link.reset_index(drop = True,inplace = True)
print(link.shape)
link.head()

(17639, 6)
(7948, 6)


Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21
0,73048,ivan martic,,100,[276],4665
1,37633,guillermo rodriguez,,100,[276],11159
2,54921,massimo volta,,100,[1429],6417
3,108198,sebastian ernst,,100,[42],7828
4,110885,florian ballas,,100,[42],6736


#### Reading a .csv tranforms lists into strings. next we apply some preprocessing to transform string of list into list again 

In [50]:
import ast
def transform_int_column(x):
    if x[0] == r'[':
        res = re.sub(r'(\d)\s+(\d)',r'\1 , \2',x.replace('\n',''))
        res = ast.literal_eval(res) #tranforme un string d'une liste en liste
        res = [int(y) for y in res]
    else:
        res = int(x)
    return res
    
link.fifa21 = link.fifa21.apply(transform_int_column)

In [51]:
link.loc[link.fifa21.apply(type)!=int,'name_compare'] = \
link.loc[link.fifa21.apply(type)!=int,'fifa21'].apply(lambda x : [fifa21.long_name[y]for y in x])


In [52]:
link['clubs'] = link.player_id.progress_apply(lambda x:
                    appearance.loc[appearance.player_id == x,'player_club_id'].unique() if type(x) == int
                                     else [appearance.loc[appearance.player_id == y,'player_club_id'].unique() for y in x]
                                    )

100%|██████████| 7948/7948 [00:10<00:00, 760.93it/s]


In [54]:
#let's append the club from the players dataset to the clubs obtained from appearance dataset, if that club is not already in that list
link.clubs = link.progress_apply(lambda x: 
                                 np.append(x['clubs'],players.loc[players.player_id == x['player_id'],'current_club_id']) 
                                 if players.loc[players.player_id == x['player_id'],'current_club_id'].values[0] not in x['clubs']
                                 else x['clubs']
                                 ,axis = 1)


100%|██████████| 7948/7948 [00:04<00:00, 1655.47it/s]


#### 2. for names with multiple matches, we can check the clubs in which the player have played

We will compare the club names of the fifa 'candidates' to the list of clubs we can somehow link to the target player:</br>
from the appearance dataset, we have every player for every games with the club ID of those players. Therefore, we can have a partial player's club appartenance history, given that the player has played games...</br>
We can also add to that list the players current club found the player's dataset</br>
It is assumed that a club having multiple players with the very same name is very unlikely

In [56]:
# This is our way of accessing the names with multiple matches. These are pandas Series type, we will change the into numpy arrays
link.loc[link.fifa21.apply(type)!=int,:].head(5)

Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21
8,49899,william,"[raphael william anjos rochedo, daniel william...",100,[28095],"[268, 410, 438, 1051, 1424, 2468, 2775, 2785, ..."
10,23345,barreto,"[diogo nathan peixe barreto, michael barreto, ...",100,[416],"[5975, 6580, 6748, 9159, 11470, 12453, 16334, ..."
12,167501,fabio tavares,"[fabio henrique tavares, fabio tavares]",100,"[3060, 5219]","[41, 17891]"
14,218470,cassio,"[joelinton cassio apolinario de lira, cassio a...",100,[21957],"[1838, 2062, 15376]"
15,130963,rudy,"[sebastian rudy, rudy alejandro cardozo fernan...",100,[128],"[950, 4857, 8709]"


In [57]:
#Replace club_id with club names
link.clubs = link.clubs.progress_apply(lambda x:
                                      [clubs.loc[clubs.club_id == y, 'name'] for y in x])
link.clubs = link.clubs.apply(lambda x: list(np.concatenate(x).flat))


100%|██████████| 7948/7948 [00:06<00:00, 1312.87it/s]


In [58]:
#adds the clubs corresponding to the FIFA indexes
link['fifa21_clubs'] = link.fifa21.progress_apply(lambda x : 
                                         fifa21.loc[x,'club']
                                        )

100%|██████████| 7948/7948 [00:00<00:00, 28578.23it/s]


In [59]:
#last cell created pandas series where we would have preffered arrays. Let's fix this:
link.loc[link.fifa21_clubs.apply(type) != str,'fifa21_clubs'] = link.loc[link.fifa21_clubs.apply(type) != str,'fifa21_clubs'].apply(lambda x : x.values)
link.loc[link.fifa21_clubs.apply(type) != str,:].head(1)


Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21,fifa21_clubs
8,49899,william,"[raphael william anjos rochedo, daniel william...",100,[fk-ufa],"[268, 410, 438, 1051, 1424, 2468, 2775, 2785, ...","[Gremio, Southampton, Real Betis Balompie, Cel..."


In [60]:
print(link.loc[(link.clubs.apply(len) == 0)&(link.name_compare.apply(type)!= str),:].shape)
print(link.loc[(link.clubs.apply(len) == 0)&(link.fuzz_score<100),:].shape)

#there are some club_id that cannot be found in the clubs dataset
#Remove rows that have no clubs and have a similarity score inferior to 100 (max similarity), and multiple matches.

link.drop(link.loc[(link.clubs.apply(len) == 0)&(link.name_compare.apply(type)!= str),:].index, inplace = True)
link.drop(link.loc[(link.clubs.apply(len) == 0)&(link.fuzz_score<100),:].index, inplace = True)


print(link.loc[(link.clubs.apply(len) == 0)&(link.name_compare.apply(type)!= str),:].shape)
print(link.loc[(link.clubs.apply(len) == 0)&(link.fuzz_score<100),:].shape)

(0, 7)
(0, 7)
(0, 7)
(0, 7)


<b> The next 2 cells goal is to check the name similarity between 2 club lists.</br>
    - The first cell returns a LIST of the max similarity score between all of fifa players clubs and ANY of the clubs in which the "target" player has played.</br>
    - The second cell returns the position of the max similarity score in that list, if this score is >90. This should be the right player.


In [65]:
link['club_corr'] = None


In [66]:
link.loc[link.fifa21.apply(type) != int,'club_corr'] = link.loc[link.fifa21.apply(type) != int,:].progress_apply(lambda x:
                                                   [max([fuzz.token_set_ratio(z,y) for z in x['clubs']]) for y in x['fifa21_clubs']]
                                                   ,axis = 1
                                                  )            

100%|██████████| 673/673 [00:00<00:00, 2135.63it/s]


In [67]:
link.loc[link.fifa21.apply(type) != int,'club_corr'] = link.loc[link.fifa21.apply(type) != int,:].club_corr.apply(lambda x : 
                     np.argmax(x) if max(x)>90 else None
                     )

<b> At this point, the names with multiple matches and no corresponding club names can be removed

In [68]:
link_fifa21 = link.loc[(link.club_corr.notnull()) | (link.fifa21.apply(type) == int),:].copy()

In [70]:
link_fifa21.loc[~link_fifa21.club_corr.isnull(),:].head(2)

Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21,fifa21_clubs,club_corr
491,24633,eduardo,"[eduardo antonio salvio, joao mario naval da c...",100,[shakhtar-donetsk],"[289, 449, 689, 1167, 1275, 1428, 1493, 1617, ...","[Boca Juniors, Inter, SD Huesca, Sporting CP, ...",27
541,238717,pablo perez,"[pablo javier perez, pablo perez rodriguez]",100,[sporting-gijon],"[2800, 8908]","[Newell's Old Boys, Real Sporting de Gijon]",1


In [71]:
link_fifa21.loc[~link_fifa21.club_corr.isnull(),'fifa21'] = \
link_fifa21.loc[~link_fifa21.club_corr.isnull(),:].apply(lambda x:
                                                         x['fifa21'][int(x['club_corr'])]
                                                         ,axis = 1)

link_fifa21.loc[~link_fifa21.club_corr.isnull(),'name_compare'] = \
link_fifa21.loc[~link_fifa21.club_corr.isnull(),:].apply(lambda x:
                                                         x['name_compare'][int(x['club_corr'])]
                                                         ,axis = 1)

link_fifa21.loc[~link_fifa21.club_corr.isnull(),'fifa21_clubs'] = \
link_fifa21.loc[~link_fifa21.club_corr.isnull(),:].apply(lambda x:
                                                         x['fifa21_clubs'][int(x['club_corr'])]
                                                         ,axis = 1)

link_fifa21.loc[~link_fifa21.club_corr.isnull(),:].head(2)

Unnamed: 0,player_id,name,name_compare,fuzz_score,clubs,fifa21,fifa21_clubs,club_corr
491,24633,eduardo,vitor eduardo da silva matos,100,[shakhtar-donetsk],6311,Shakhtar Donetsk,27
541,238717,pablo perez,pablo perez rodriguez,100,[sporting-gijon],8908,Real Sporting de Gijon,1


In [72]:
link_fifa21.to_csv('link_fifa21.csv')

In [73]:
link_fifa21.shape

(7517, 8)

In [351]:
gameID = games.game_id[35]
game_appearance= appearance.loc[appearance.game_id == gameID,:]
game_appearance_playerID = appearance.loc[appearance.game_id == gameID,'player_id'].values
game_appearance.apply(lambda x :
                      x['player_id'] in link_fifa22.player_id.values
                     , axis = 1
                     ).value_counts()[0]

1

In [358]:
appearance.head(1)

Unnamed: 0,player_id,game_id,appearance_id,competition_id,player_club_id,goals,assists,minutes_played,yellow_cards,red_cards
0,52453,2483937.0,2483937_52453,RU1,28095,0,0,90,0,0


In [362]:
tt = appearance.loc[:,['game_id','competition_id']].groupby('game_id').count()

In [367]:
tt.loc[tt.competition_id>22,:]

Unnamed: 0_level_0,competition_id
game_id,Unnamed: 1_level_1
2457642.0,27
2458528.0,28
2458586.0,33
2459586.0,28
2459587.0,28
...,...
3694723.0,31
3700967.0,32
3700970.0,31
3700972.0,31


In [337]:
#Some games are not in appearance: Almost 4k out of 40k

print(games.shape)

games_id = games.game_id
appearance_id = appearance.game_id.unique()
games.loc[games_id.apply(lambda x : x not in appearance_id),:].shape

(42592, 15)


(3798, 15)

In [339]:
#let's just get rid of those games
games.drop(games.loc[games_id.apply(lambda x : x not in appearance_id),:].index, inplace = True)
games.reset_index(drop = True,inplace = True)
games.shape

(38794, 15)

In [352]:
g = games.game_id.progress_apply(lambda y:
                                 appearance.loc[appearance.game_id == y,:].apply(lambda x :
                                                                                 x['player_id'] in link_fifa22.player_id.values
                                                                                 , axis = 1).value_counts()[0]
                                )

  0%|          | 35/38794 [00:00<02:56, 219.37it/s]


KeyError: 0