In [None]:
import pandas as pd
import numpy as np
import os
import fuzzymatcher
from scrape_utils import get_fbref_player_dob
import time

# Loading fbref data

In [None]:
# Read the combined fbref table and add an apostrophe-free copy of the player
# names; `player_name` is the column later used as the fuzzy-matching key.
fbref_path = os.path.join('data', 'fbref_combined.parquet')
df = pd.read_parquet(fbref_path)
df['player_name'] = df['player'].str.replace("'", "")

# Loading transfermarkt data

In [None]:
# Read the transfermarkt players table and derive the two matching keys:
# year of birth (from the dob datetime) and an apostrophe-free player name.
tm_path = os.path.join('data', 'players_transfermarket.parquet')
df_tm = pd.read_parquet(tm_path)
df_tm['born'] = df_tm['dob'].dt.year
df_tm['player_name'] = df_tm['player'].str.replace("'", "")

# Split on the year column (compared against string values). Both masks are
# computed on the full table, before df_tm is narrowed to the 2019 rows.
is_2020 = df_tm['year'] == '2020'
is_2019 = df_tm['year'] == '2019'
df_tm_2020 = df_tm.loc[is_2020].copy()
df_tm = df_tm.loc[is_2019].copy()

# Deduplicate transfermarkt data

In [None]:
# Keep a single row per transfermarkt player: after sorting by `joined`,
# the last row for each player_link is the one that is retained.
df_tm = (
    df_tm
    .sort_values('joined')
    .drop_duplicates('player_link', keep='last')
)

# Adding on fbref squad name to transfermarkt data

In [None]:
# Replace some transfermarkt team names before matching (presumably with the
# fbref spelling, so the fuzzy match below can pair them up -- confirm).
team_replace = {'Borussia Mönchengladbach': "M'gladbach",
                'Stade Rennais FC': 'Rennes',
                'AC Milan': 'Milan',
                'Inter Milan': 'Inter',
                'Wolverhampton Wanderers': 'Wolves'}
# Assign the result back rather than calling replace(inplace=True) on the
# `df_tm.team_name` accessor: that form mutates an intermediate Series and is
# not guaranteed to update df_tm (it does not under pandas copy-on-write).
df_tm['team_name'] = df_tm['team_name'].replace(team_replace)

# Fuzzy-match the distinct fbref squad names to the distinct transfermarkt
# team names (left join: one row per fbref squad).
df_team1 = pd.DataFrame(df.squad.drop_duplicates().reset_index(drop=True))
df_team2 = pd.DataFrame(df_tm.team_name.drop_duplicates().reset_index(drop=True))
df_team_match = fuzzymatcher.fuzzy_left_join(df_team1,
                                             df_team2,
                                             left_on=['squad'],
                                             right_on=['team_name'])
# Only the team_name -> squad lookup is needed from the match result.
df_team_match = df_team_match[['team_name', 'squad']].copy()

# Add the fbref squad name to both transfermarkt tables (inner merge, so rows
# whose team_name has no entry in the lookup are dropped).
# NOTE(review): team_replace is applied to df_tm only. If df_tm_2020 uses the
# same original team names, its rows for the five replaced teams will not find
# a match here and will be dropped -- confirm whether that is intended.
df_tm = df_tm.merge(df_team_match, on='team_name')
df_tm_2020 = df_tm_2020.merge(df_team_match, on='team_name')

# Matching fbref top-5 table to transfermarkt data

In [None]:
# Fuzzy-match fbref players to transfermarkt players on player name and
# year of birth, using player_link as the id column on both sides.
fbref_match_cols = ['player_name', 'player_link', 'pos', 'squad', 'age', 'born']
df_player_match = fuzzymatcher.fuzzy_left_join(
    df[fbref_match_cols],
    df_tm,
    left_on=['player_name', 'born'],
    right_on=['player_name', 'born'],
    left_id_col='player_link',
    right_id_col='player_link',
)

# Building blocks for the accept/reject rules.
match_score = df_player_match['best_match_score']
same_squad = df_player_match['squad_left'] == df_player_match['squad_right']
different_squad = df_player_match['squad_left'] != df_player_match['squad_right']
born_gap = (df_player_match['born_left'] - df_player_match['born_right']).abs()
born_close_or_unknown = (born_gap <= 4) | df_player_match['born_left'].isnull()
same_born = df_player_match['born_left'] == df_player_match['born_right']

# Rule 1: same squad, birth years within 4 (or fbref birth year missing),
# and a score above a slightly negative threshold.
mask_match1 = same_squad & born_close_or_unknown & (match_score > -0.04)
# Rule 2: different squads, so require an identical birth year and a
# higher score.
mask_match2 = different_squad & same_born & (match_score >= 0.1)

# Where several fbref players matched the same transfermarkt player, keep
# only the highest-scoring pair (last row after an ascending sort).
matched = (
    df_player_match[mask_match1 | mask_match2]
    .sort_values('best_match_score')
    .drop_duplicates('__id_right', keep='last')
)
not_matched = df[~df['player_link'].isin(matched['__id_left'])]
print('Number matched:', len(matched), 'Number unmatched:', len(not_matched))

# For the players that are still unmatched, get the more detailed biographical data from fbref (player name and date of birth)

In [None]:
# Detailed fbref biographical data (player name and dob) for the players that
# the first pass could not match.
#
# The scrape pauses 30 seconds per player, so its result is cached in a
# parquet file and the scrape only runs when that file does not exist yet.
# This keeps the notebook runnable from a fresh checkout without leaving the
# scraping code commented out.
name_dob_path = os.path.join('data', 'fbref_name_dob.parquet')
if not os.path.exists(name_dob_path):
    links = not_matched.player_link.tolist()
    links2 = [f'https://fbref.com{s}' for s in links]
    names_list = []
    dob_list = []
    for url in links2:
        # get_fbref_player_dob returns a (name, dob) pair for a player page
        name, dob = get_fbref_player_dob(url)
        names_list.append(name)
        dob_list.append(dob)
        time.sleep(30)  # pause between requests
    pd.DataFrame({'player_link': links,
                  'player': names_list,
                  'dob': dob_list}).to_parquet(name_dob_path)

# Always load from the cache so the dtypes are the same whether or not the
# scrape ran in this session.
# NOTE(review): an existing cache is used as-is; delete the file to force a
# re-scrape if the set of unmatched players has changed.
df_name_dob = pd.read_parquet(name_dob_path)

# Merge on the more detailed biographical data

In [None]:
# Second pass: fuzzy-match the scraped fbref name / date of birth against
# the transfermarkt player name / date of birth.
df_player_match2 = fuzzymatcher.fuzzy_left_join(
    df_name_dob,
    df_tm,
    left_on=['player', 'dob'],
    right_on=['player_name', 'dob'],
    left_id_col='player_link',
    right_id_col='player_link',
)

# Accept a pair only when the score is positive and both dates of birth
# are identical; everything else is left for manual matching.
positive_score = df_player_match2['best_match_score'].gt(0)
same_dob = df_player_match2['dob_left'].eq(df_player_match2['dob_right'])
mask_match = positive_score & same_dob
matched2 = df_player_match2.loc[mask_match].copy()
not_matched2 = df_player_match2.loc[~mask_match].copy()
print('Number matched:', len(matched2), 'Number unmatched:', len(not_matched2))

# I have manually found the links for the remaining (10) unmatched players through Google searches / string lookups

In [None]:
# Manually matched players as (fbref link, transfermarkt link) pairs.
# Keeping each pair on one line makes it harder to misalign the two sides.
manual_pairs = [
    ('/en/players/31a1288d/Antonio-Cortes', '/antonin/profil/spieler/610336'),
    ('/en/players/3423f250/Raphael-Dias-Belloli', '/raphinha/profil/spieler/411295'),
    ('/en/players/af245161/Fabricio', '/fabri/profil/spieler/45882'),
    ('/en/players/189cee7b/Javier-Hernandez', '/chicharito/profil/spieler/50935'),
    ('/en/players/0eb03d5b/Vukasin-Jovanovic', '/vukasin-jovanovic/profil/spieler/264140'),
    ('/en/players/0db5d2c8/Mathias-Jorgensen', '/zanka/profil/spieler/52059'),
    ('/en/players/c6e8cf1f/Sasa-Lukic', '/sasa-lukic/profil/spieler/245056'),
    ('/en/players/79443529/Dusan-Vlahovic', '/dusan-vlahovic/profil/spieler/357498'),
    ('/en/players/3ae14ed1/Trezeguet', '/trezeguet/profil/spieler/234189'),
    ('/en/players/e7fcf289/Florian-Wirtz', '/florian-wirtz/profil/spieler/598577'),
]
# The two flat lists are still exposed under their original names.
fb_links = [fbref_link for fbref_link, _ in manual_pairs]
tm_links = [tm_link for _, tm_link in manual_pairs]
matched3 = pd.DataFrame(manual_pairs, columns=['player_link_left', 'player_link_right'])

# Combine matches into a single table of links

In [None]:
# Stack the manual, second-pass and first-pass matches into one table of
# fbref link <-> transfermarkt link pairs.
link_cols = ['player_link_left', 'player_link_right']
all_matched = pd.concat(
    [matched3, matched2[link_cols], matched[link_cols]]
).rename(columns={'player_link_left': 'player_link_fbref',
                  'player_link_right': 'player_link_tm'})

# Sanity checks: both counts should be 0 and the symmetric difference
# should be the empty set.
n_dup_fbref = all_matched.duplicated('player_link_fbref', keep=False).sum()
n_dup_tm = all_matched.duplicated('player_link_tm', keep=False).sum()
link_mismatch = set(all_matched['player_link_fbref']) ^ set(df['player_link'])
print('Check no duplicated fbref links:', n_dup_fbref)
print('Check no duplicated transfermarkt links:', n_dup_tm)
print('Check there are the same players in each dataset (i.e. the symmetric difference is empty):',
      link_mismatch)

# Join on transfermarkt details to fbref data and save as a parquet file

In [None]:
# Keep only matched transfermarkt players, taking the 2020 row when one
# exists and falling back to the 2019 row otherwise.
matched_tm_links = all_matched['player_link_tm']
df_tm_2020 = df_tm_2020[df_tm_2020['player_link'].isin(matched_tm_links)].copy()

is_matched = df_tm['player_link'].isin(matched_tm_links)
has_2020_row = df_tm['player_link'].isin(df_tm_2020['player_link'])
df_tm_2019_only = df_tm[is_matched & ~has_2020_row].copy()
df_tm = pd.concat([df_tm_2020, df_tm_2019_only])

# Manually fill in missing dob

In [None]:
# Fill in the date of birth for Florian Wirtz, but only on rows where it is
# actually missing.
is_wirtz = df_tm['player'] == 'Florian Wirtz'
dob_missing = df_tm['dob'].isnull()
df_tm.loc[is_wirtz & dob_missing, 'dob'] = pd.to_datetime('2003-05-03')

# Calculate age in September

In [None]:
# Age in years on 1 September 2020: whole days since the date of birth
# divided by 365.25, rounded to one decimal place.
reference_date = pd.to_datetime('2020-09-01')
days_since_birth = (reference_date - df_tm['dob']).dt.days
df_tm['age_at_2020_09_01'] = np.round(days_since_birth / 365.25, 1)

# Format and save dataframe

In [None]:
# --- transfermarkt side -----------------------------------------------------
# Drop the helper / unused columns, make `year` numeric, and suffix every
# column with `_tm` so the source of each column is clear after the join.
tm_unused_cols = ['team_name', 'player_name', 'transfer_details', 'born', 'signed_from_link']
df_tm = df_tm.drop(columns=tm_unused_cols)
df_tm['year'] = pd.to_numeric(df_tm['year'], errors='coerce')
df_tm = df_tm.add_suffix('_tm')
# Attach the fbref link for each transfermarkt player (one row per player).
df_tm = df_tm.merge(all_matched, how='inner', validate='1:1', on='player_link_tm')

# --- fbref side -------------------------------------------------------------
# NOTE(review): the `year` -> `market_value_year` rename is applied to the
# fbref table, not to df_tm (whose `year` column ends up as `year_tm`). This
# looks like it may have been meant for df_tm -- confirm before changing, as
# it would alter the saved column names.
df = (
    df
    .rename(columns={'year': 'market_value_year',
                     'player_link': 'player_link_fbref'})
    .drop(columns='player_name')
)

# --- join and save ----------------------------------------------------------
# Right join keeps every fbref row; validate guards against duplicated links.
df = df_tm.merge(df, how='right', on='player_link_fbref', validate='1:1')
output_path = os.path.join('data', 'fbref_tm_combined.parquet')
df.to_parquet(output_path)