This notebook combines the fbref and transfermarkt data. It excludes anyone who has played fewer than 675 minutes. It also excludes players who are no longer in the in-scope leagues.

In [None]:
import pandas as pd
import numpy as np
import os
import fuzzymatcher
from scrape_utils import get_fbref_player_dob
import tqdm
import random
import time

In [None]:
translations = str.maketrans({'á': 'a',
                              'ý': 'y',
                              'š': 's',
                              'Â': 'A',
                              'ó': 'o',
                              'í': 'i',
                              'Č': 'c',
                              'ç': 'c',
                              'ú': 'u',
                              'é': 'e',
                              'ć': 'c',
                              'ñ': 'n',
                              'Đ': 'D',
                              'ê': 'e',
                              'Ž': 'Z',
                              'ł': 'l',
                              'É': 'E',
                              'ã': 'a',
                              'Ł': 'L',
                              'Á': 'A',
                              'ž': 'z',
                              'ø': 'o',
                              'ź': 'z',
                              'č': 'c'})

# Loading fbref data

In [None]:
# Load the combined fbref player statistics.
fbref_path = os.path.join('data', 'fbref_clean', 'fbref_combined.parquet')
df = pd.read_parquet(fbref_path)

# Matching key: the player name without apostrophes and with accented
# characters replaced by plain letters (see `translations`).
df['player_name'] = df['player'].str.replace("'", "").str.translate(translations)

# Some squads share the same short name in different leagues; give each one
# a fuller, unambiguous name: (short name, league, replacement).
squad_renames = [('Vitória', 'brazil1', 'Esporte Clube Vitória'),
                 ('Vitória', 'portugal1', 'Vitória Guimarães SC'),
                 ('Atlético', 'mexico1', 'Atlético de San Luis')]
for short_name, league_code, full_name in squad_renames:
    is_target = (df['squad'] == short_name) & (df['league'] == league_code)
    df.loc[is_target, 'squad'] = full_name

# Keep only players with 7.5 or more 90s (at least 675 minutes) who played in the latest season (2024) in the in-scope leagues

In [None]:
# Keep rows from the 2024 season with at least 675 minutes played (7.5 x 90).
played_enough = df['playing_time_min'].ge(675)
in_latest_season = df['year'].eq(2024)
df = df.loc[played_enough & in_latest_season].copy()

# Loading transfermarkt data

In [None]:
# Load the cleaned transfermarkt player table.
tm_path = os.path.join('data', 'transfermarkt_clean', 'players_transfermarket.parquet')
df_tm = pd.read_parquet(tm_path)

# Year of birth, used together with the name as a matching key.
df_tm['born'] = df_tm['dob'].dt.year

# Same name normalisation as for the fbref data: drop apostrophes and replace
# accented characters with plain letters.
df_tm['player_name'] = df_tm['player'].str.replace("'", "").str.translate(translations)

# Deduplicate transfermarkt data

In [None]:
# Keep a single row per transfermarkt player: order by the 'joined' column
# and retain the last row for each player_url.
# NOTE(review): rows with a missing 'joined' value sort last by default and
# would therefore be the ones kept - confirm this is intended.
df_tm = (df_tm.sort_values('joined')
              .drop_duplicates('player_url', keep='last')
              .copy())

# Get common squad name

In [None]:
# Transfermarkt club names mapped by hand to the spelling used by fbref.
# These are replaced before the fuzzy join on club names below.
team_replace = {'Borussia Mönchengladbach': "M'gladbach",
                'Stade Brestois 29': 'Brest',
                'Stade Rennais FC': 'Rennes',
                'Queens Park Rangers': 'QPR',
                'CA Boca Juniors': 'Boca Juniors',
                'Manchester United': 'Manchester Utd',
                'Racing Santander': 'Racing Sant',
                'New York City FC': 'NYCFC',
                'AA Argentinos Juniors': 'Arg Juniors',
                'Club Athletico Paranaense': 'Ath Paranaense',
                'Paris Saint-Germain': 'Paris S-G',
                'Club Atlético Platense': 'Platense',
                'West Bromwich Albion': 'West Brom',
                'Sheffield Wednesday': 'Sheffield Weds',
                'Sheffield United': 'Sheffield Utd',
                'West Ham United': 'West Ham',
                'Philadelphia Union': 'Philadelphia',
                'Wolverhampton Wanderers': 'Wolves',
                'Los Angeles FC': 'LAFC',
                'Sporting Kansas City': 'Sporting KC',
                'Club Atlético Belgrano': 'Belgrano',
                '1.FC Nuremberg': 'Nürnberg',
                'Club Atlético Tigre': 'Tigre',
                'Atlético de Madrid': 'Atlético Madrid',
                'Nottingham Forest': "Nott'ham Forest",
                'Athletic Bilbao': 'Athletic Club',
                'Real Salt Lake City': 'RSL',
                }
df_tm['team_name'] = df_tm['team_name'].replace(team_replace)

# One row per distinct club name on each side, then fuzzy-join the fbref
# squads (left) onto the transfermarkt team names (right).
df_team1 = df[['squad']].drop_duplicates().reset_index(drop=True)
df_team2 = df_tm[['team_name']].drop_duplicates().reset_index(drop=True)
df_team_match = fuzzymatcher.fuzzy_left_join(
    df_team1,
    df_team2,
    left_on=['squad'],
    right_on=['team_name'],
)

# Sanity checks: both duplicate counts should be zero and the two numbers of
# distinct names should agree.
n_dup_team_name = df_team_match['team_name'].duplicated(keep=False).sum()
n_dup_squad = df_team_match['squad'].duplicated(keep=False).sum()
print('Check no team duplicates:', n_dup_team_name, n_dup_squad)
n_team_name = df_team_match['team_name'].nunique()
n_squad = df_team_match['squad'].nunique()
print('Check number of teams', n_team_name, n_squad)

# Attach the fbref squad name to every transfermarkt player row.
squad_lookup = df_team_match[['squad', 'team_name']]
df_tm = df_tm.merge(squad_lookup, on='team_name')

# Matching fbref data to transfermarkt data

In [None]:
# Fuzzy-join fbref players (left) to transfermarkt players (right) on the
# normalised name and the year of birth.
fbref_cols = ['player_link', 'player_name', 'born', 'squad']
tm_cols = ['player_url', 'player_name', 'born', 'squad']
df_player_match = fuzzymatcher.fuzzy_left_join(
    df[fbref_cols],
    df_tm[tm_cols],
    left_on=['player_name', 'born'],
    right_on=['player_name', 'born'],
    left_id_col='player_link',
    right_id_col='player_url',
)

# Rules for splitting matches and non-matches.
score = df_player_match['best_match_score']
born_left = df_player_match['born_left']
born_right = df_player_match['born_right']
squad_left = df_player_match['squad_left']
squad_right = df_player_match['squad_right']
# Rule 1: same squad, birth years within 4 of each other (or unknown on the
# fbref side) and a score above -0.04.
born_close_or_unknown = ((born_left - born_right).abs() <= 4) | born_left.isnull()
mask_match1 = (squad_left == squad_right) & born_close_or_unknown & (score > -0.04)
# Rule 2: different squad, identical birth year and a score of at least 0.1.
mask_match2 = (squad_left != squad_right) & (born_left == born_right) & (score >= 0.1)

# fbref players whose automatic match was found to be wrong when checking.
mismatched = ['/en/players/916728da/Diego-Gomez',
              '/en/players/6388eae1/Luismi',
              '/en/players/185084a1/Luiz-Felipe',
              '/en/players/63969d01/Marlon',
              '/en/players/06d4b2e4/Vitinho',
              '/en/players/74d4cec6/Vitinha',
              '/en/players/02bcaf8e/Bilal-Brahimi',
              '/en/players/c9ebe876/Dudu',
              '/en/players/35f270b7/Mateus-Cocao',
              '/en/players/5475a510/Gabriel-Bares',
              ]
is_known_mismatch = df_player_match['player_link'].isin(mismatched)
accepted = (mask_match1 | mask_match2) & ~is_known_mismatch

# When several fbref players point at the same transfermarkt player, keep
# only the pair with the highest score.
matched = (df_player_match[accepted]
           .sort_values('best_match_score')
           .drop_duplicates('__id_right', keep='last')
           .copy())
not_matched = df[~df['player_link'].isin(matched['__id_left'])]
print('Number matched:', len(matched), 'Number unmatched:', len(not_matched))
n_dup_left = matched['__id_left'].duplicated(keep=False).sum()
n_dup_right = matched['__id_right'].duplicated(keep=False).sum()
print('Check no player duplicates:', n_dup_left, n_dup_right)

# For the players still unmatched, get the more detailed biographical data from fbref (player name and dob)

In [None]:
# Relative fbref profile paths of the unmatched players, and the matching
# absolute URLs.
links = not_matched['player_link'].tolist()
links2 = ['https://fbref.com{}'.format(path) for path in links]

In [None]:
# commented out as only need to run once
#names_list = []
#dob_list = []
#squad_list = []
#for url in tqdm.tqdm(links2):
#    name, dob, squad = get_fbref_player_dob(url)
#    names_list.append(name)
#    dob_list.append(dob)
#    squad_list.append(squad)
#    time.sleep(random.randint(10, 30))
#df_name_dob = pd.DataFrame({'player_link': links,
#                            'player': names_list,
#                            'dob': dob_list,
#                            'squad': squad_list})
#df_name_dob.to_parquet(os.path.join('data', 'fbref', 'fbref_name_dob.parquet'))

In [None]:
# Load the saved fbref names and dates of birth.
# if this fails you probably need to run the cell above without comments
name_dob_path = os.path.join('data', 'fbref', 'fbref_name_dob.parquet')
df_name_dob = pd.read_parquet(name_dob_path)

# Merge on the more detailed biographical data

In [None]:
# Second pass: fuzzy-join the detailed fbref records (left) to transfermarkt
# (right) on player name and full date of birth.
tm_dob_cols = ['player_url', 'player_name', 'dob', 'squad']
df_player_match2 = fuzzymatcher.fuzzy_left_join(
    df_name_dob,
    df_tm[tm_dob_cols],
    left_on=['player', 'dob'],
    right_on=['player_name', 'dob'],
    left_id_col='player_link',
    right_id_col='player_url',
)

# Rules for splitting matches and non-matches: the dates of birth must be
# identical and, in addition, either the score is positive or the squads agree.
same_dob = df_player_match2['dob_left'] == df_player_match2['dob_right']
positive_score = df_player_match2['best_match_score'] > 0
same_squad = df_player_match2['squad_left'] == df_player_match2['squad_right']
# Players whose automatic match was wrong (mismatched).
known_mismatch = df_player_match2['player'].isin(['William Sands',
                                                  'Gonzalo Requena',
                                                  'Frankie Amaya'])
mask_match = same_dob & (positive_score | same_squad) & ~known_mismatch

matched2 = df_player_match2.loc[mask_match].copy()
not_matched2 = df_player_match2.loc[~mask_match].copy()
print('Number matched:', len(matched2), 'Number unmatched:', len(not_matched2))

# Manually get unmatched players

In [None]:
# Hand-curated matches for the players the automatic matching could not (or
# did not correctly) link. `fb_links` and `tm_links` are parallel lists: the
# i-th fbref player link corresponds to the i-th transfermarkt player link.
fb_links = ['/en/players/0841f1d8/Jorge-Zarfino',
            '/en/players/d4808594/Jon-Magunacelaya',
            '/en/players/f4187ac3/Leonel-Picco',
            '/en/players/bae360fd/Gyan-de-Regt',
            '/en/players/d21268c5/Mattia-Maita',
            '/en/players/c533d4d3/Anthony-Roncaglia',
            '/en/players/3828f130/Javier-Guerra',
            '/en/players/640b4170/Alejandro-Sancris',
            '/en/players/8b364abd/Igor-Marques',
            '/en/players/db867065/Hildeberto-Pereira',
            '/en/players/68796d61/Shinji-Okazaki',
            '/en/players/868c0f75/Marcelo-Barovero',
            '/en/players/71c980b7/Zaid-Romero',
            '/en/players/6ebe6cc1/Luan-Garcia',
            '/en/players/d0f01217/Jose-Hugo',
            '/en/players/960a4473/William-Sands',
            '/en/players/4292d55e/Tomas-Fernandez',
            '/en/players/cc5c5985/Joao-Pedro',
            '/en/players/cdf65d12/Marlon',
            '/en/players/d75471eb/Joao-Pedro-Pepe',
            '/en/players/188537d3/Aboubakary-Kante',
            '/en/players/39e9c09a/Walter-Montoya',
            '/en/players/0a7d98ae/Santiago-Lopez',
            '/en/players/abe841b4/Angelo-Marchese',
            '/en/players/0f978661/Gonzalo-Requena',
            '/en/players/14b61b88/Rodrigo-Saravia',
            '/en/players/8c4e56cb/Francisco-Jesus-Crespo-Garcia',
            '/en/players/4abe30fa/Fidel-Barajas',
            '/en/players/6fefa986/Carlos-Eduardo',
            '/en/players/d9b9b71a/Eulanio-Angelo-Chipela-Gomes',
            '/en/players/2083298b/Alix-Vinicius',
            '/en/players/80dbbd10/Simeon-Nwankwo',
            '/en/players/218d336c/Dairon-Asprilla',
            '/en/players/a1216563/Emiliano-Rigoni',
            '/en/players/56fca564/Daniel-Bandeira',
            '/en/players/8d3d3b0a/Giorgos-Giakoumakis',
            '/en/players/4f8a9f70/Kervin-Arriaga',
            '/en/players/3ea1862e/Miguel-Magalhaes',
            '/en/players/91c2ee76/Matheusinho',
            '/en/players/7687b9dd/Jo',
            '/en/players/3e9ae1db/Julian-Carranza',
            '/en/players/0db5d2c8/Mathias-Jorgensen',
            '/en/players/cfb29823/Danny-Ward',
            '/en/players/071b6ba9/Goncalo-Rodrigues',
            '/en/players/ebbc1dfb/Ricardo-Guimaraes',
            '/en/players/3778339c/Aidan-Morris',
            '/en/players/7942a27b/Mascarenhas',
            '/en/players/b333b014/Jose-Luis-Rodriguez',
            '/en/players/2e8bd00c/Frankie-Amaya',
            '/en/players/0a9a98f7/Jose-Garcia',
            '/en/players/2c56a792/Nicolas-Gonzalez',
            '/en/players/4c2a65a8/Manuel-Manu',
            '/en/players/4e2da7f7/Jose-Carlos',
            '/en/players/a2b25407/Fawaz-Al-Sqoor',
            '/en/players/2475cf9b/Juan-Brandariz',
            '/en/players/e3510f7e/David-Pastor',
            '/en/players/70cf63ca/Rafa-Silva',
            '/en/players/6a8a7af8/Vinicius-Souza',
            '/en/players/3024e383/Jose-Angel',
            '/en/players/4d395e2b/Jack-Rudoni',
            '/en/players/27dc1378/Jose-Luis-Garcia-Vaya',
            '/en/players/12207ec1/Gonzalo-Escobar',
            '/en/players/91e7fc9a/Matias-Garcia',
            '/en/players/b7b6ff9b/Nunez',
            '/en/players/06d4b2e4/Vitinho',
            '/en/players/74d4cec6/Vitinha',
            '/en/players/c9ebe876/Dudu',
           ]
tm_links = ['/giovanni-zarfino/profil/spieler/208022',
            '/jon-magunazelaia/profil/spieler/625655',
            # leading '/' was missing here, unlike every other transfermarkt
            # link in this list, so this player could never be joined
            '/leonel-picco/profil/spieler/663077',
            '/gyan-de-regt/profil/spieler/747243',
            '/mattia-maita/profil/spieler/167470',
            '/anthony-roncaglia/profil/spieler/651979',
            '/javi-guerra/profil/spieler/834764',
            '/alex-sancris/profil/spieler/560705',
            '/igor-formiga/profil/spieler/688690',
            '/berto/profil/spieler/290384',
            '/shinji-okazaki/profil/spieler/79642',
            '/marcelo-barovero/profil/spieler/55142',
            '/zaid-romero/profil/spieler/741142',
            '/luan/profil/spieler/178984',
            '/ze-hugo/profil/spieler/940479',
            '/will-sands/profil/spieler/393327',
            '/tomas-fernandez/profil/spieler/613364',
            '/jptm/profil/spieler/1083970',
            '/marlon/profil/spieler/273236',
            '/pepe/profil/spieler/606716',
            '/abou-kante/profil/spieler/315095',
            '/walter-montoya/profil/spieler/267420',
            '/santiago-lopez/profil/spieler/1000687',
            '/angelo-marchese/profil/spieler/1264276',
            '/gonzalo-requena/profil/spieler/1120902',
            '/rodrigo-saravia/profil/spieler/750753',
            '/pejino/profil/spieler/534379',
            '/fidel-barajas/profil/spieler/994267',
            '/cadu/profil/spieler/1083073',
            '/nanu/profil/spieler/291399',
            '/alix/profil/spieler/880217',
            '/simy/profil/spieler/194549',
            '/dairon-asprilla/profil/spieler/260381',
            '/emiliano-rigoni/profil/spieler/282544',
            '/dani-silva/profil/spieler/372445',
            '/georgios-giakoumakis/profil/spieler/234850',
            '/kervin-arriaga/profil/spieler/526050',
            '/maga/profil/spieler/729140',
            '/matheuzinho/profil/spieler/365368',
            '/jotm/profil/spieler/401931',
            '/julian-carranza/profil/spieler/491707',
            '/zanka/profil/spieler/52059',
            '/danny-ward/profil/spieler/124172',
            '/guga/profil/spieler/258034',
            '/guima/profil/spieler/305425',
            '/aidan-morris/profil/spieler/513968',
            '/masca/profil/spieler/606720',
            '/puma/profil/spieler/425028',
            '/frankie-amaya/profil/spieler/575368',
            '/jose-juan-manriquez/profil/spieler/352552',
            '/nico-gonzalez/profil/spieler/466805',
            '/manu/profil/spieler/699592',
            '/ze-carlos/profil/spieler/617679',
            '/fawaz-al-sqoor/profil/spieler/364450',
            '/chumi/profil/spieler/333361',
            '/pastor/profil/spieler/740498',
            '/rafa/profil/spieler/238055',
            '/vini-souza/profil/spieler/663581',
            '/cote/profil/spieler/87469',
            '/jack-rudoni/profil/spieler/662476',
            '/pepelu/profil/spieler/328480',
            '/gonzalo-escobar/profil/spieler/440377',
            '/matias-garcia/profil/spieler/267393',
            '/alvaro-nunez/profil/spieler/626953',
            '/vitinho/profil/spieler/670965',
            '/vitinha/profil/spieler/586853',
            '/dudu/profil/spieler/691811',
           ]
# One row per manual match, using the same column names as the automatic
# matches so that the tables can be concatenated.
matched3 = pd.DataFrame({'player_link': fb_links, 'player_url': tm_links})

# Combine matches into a single table of links

In [None]:
# Stack the manual matches and the two rounds of automatic matches into one
# lookup table of fbref link -> transfermarkt link.
link_cols = ['player_link', 'player_url']
all_matched = pd.concat([matched3, matched2[link_cols], matched[link_cols]])
all_matched = all_matched.rename(columns={'player_link': 'player_link_fbref',
                                          'player_url': 'player_link_tm'})

# Sanity checks: both duplicate counts should be zero.
n_dup_fbref = all_matched.duplicated('player_link_fbref', keep=False).sum()
print('Check no duplicated fbref links:', n_dup_fbref)
n_dup_tm = all_matched.duplicated('player_link_tm', keep=False).sum()
print('Check no duplicated transfermarkt links:', n_dup_tm)
print('Number of matched:', len(all_matched), '/ Number of original fbref:', len(df))

# Filter matched datasets

In [None]:
# Restrict both tables to the players that appear in the table of matches.
in_matches_tm = df_tm['player_url'].isin(all_matched['player_link_tm'])
in_matches_fbref = df['player_link'].isin(all_matched['player_link_fbref'])
df_tm = df_tm.loc[in_matches_tm].copy()
df = df.loc[in_matches_fbref].copy()
# note a few players have moved out of the leagues so aren't in the transfermarkt data any more
print(len(df_tm), len(df))

# Calculate age at 1st July 2024

In [None]:
# Age in years (one decimal place) on 1st July 2024, computed from the number
# of whole days since the date of birth divided by 365.25.
reference_date = pd.Timestamp('2024-07-01')
days_since_birth = (reference_date - df_tm['dob']).dt.days
df_tm['age_at_2024_07_01'] = (days_since_birth / 365.25).round(1)

# Format and save dataframe

In [None]:
# Transfermarkt side: drop the helper / unwanted columns, suffix every
# remaining column with '_tm' (player_url ends up as player_link_tm) and
# attach the fbref link through the table of matches.
tm_drop_cols = ['team_name', 'player_name', 'born',
                'player', 'contract', 'year', 'current_club']
df_tm = (df_tm.drop(columns=tm_drop_cols)
              .rename(columns={'player_url': 'player_link'})
              .add_suffix('_tm')
              .merge(all_matched, how='inner', validate='1:1', on='player_link_tm'))

# fbref side: drop the matching key, rename the link column and join the
# transfermarkt columns on, one row per player.
df = (df.drop(columns='player_name')
        .rename(columns={'player_link': 'player_link_fbref'})
        .merge(df_tm, how='inner', on='player_link_fbref', validate='1:1'))

# Save the combined table.
df.to_parquet(os.path.join('data', 'fbref_tm_combined.parquet'))

In [None]:
# Print every column of the combined table with its dtype and non-null count.
df.info(verbose=True, show_counts=True)