<a href="https://colab.research.google.com/github/aritrartira/Similar-Players-using-fbref-data/blob/main/Fbref_Similar_Player_and_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection

In [None]:
# File names to change if needed
# These are base names only: the code below builds each path as "<root><name>.csv".
# raw_*   -> tables as scraped and merged
# final_* -> the same data with per-90 and possession-adjusted columns added
raw_nongk = 'Raw FBRef 2022-2023'
raw_gk = 'Raw FBRef GK 2022-2023'
final_nongk = 'Final FBRef 2022-2023'
final_gk = 'Final FBRef GK 2022-2023'

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path
import time

# Output directory prefix for every CSV written below.
# NOTE: this is the PARENT of the current working directory (not the folder this
# file lives in), with backslashes normalised to '/' and a trailing '/' appended
# so it can be prefixed directly to a file name.
root = str(Path(os.getcwd()).parents[0]).replace('\\','/')+'/'

# This section creates the programs that gather data from FBRef.com
def _get_table(soup):
    """Return the first ``<table>`` element of the parsed page ``soup``.

    Raises IndexError when the page contains no table at all.
    """
    tables = soup.find_all('table')
    return tables[0]

def _get_opp_table(soup):
    """Return the second ``<table>`` element of the parsed page ``soup``.

    Raises IndexError when the page contains fewer than two tables.
    """
    tables = soup.find_all('table')
    return tables[1]

def _parse_row(row):
    """Return the whitespace-stripped text of every ``<td>`` cell in ``row``.

    Cells are returned in document order; a row without ``<td>`` cells
    yields an empty list.
    """
    return [cell.text.strip() for cell in row.find_all('td')]

def get_df(path):
    """Download the page at ``path`` and return its first table as a DataFrame.

    Parameters
    ----------
    path : str
        URL of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` of the first table's ``<tbody>``. Column labels are
        the texts of the page's ``<th scope="col">`` headings with the first one
        dropped. Cell values are the stripped cell texts (strings); empty
        strings are replaced by the integer 0. Rows that contain no ``<td>``
        cells come out empty and are left for the caller to drop.

    Raises
    ------
    requests.exceptions.RequestException
        When the request times out or the server answers with an error status.
    IndexError
        When the downloaded page contains no table.
    """
    URL = path
    # Wait before every request (presumably to stay under the site's rate limit).
    time.sleep(4)
    # A timeout keeps a stalled connection from hanging the whole run.
    page = requests.get(URL, timeout=30)
    # Surface a blocked / rate-limited / missing page as an HTTP error here,
    # instead of as an opaque IndexError when no table is found below.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    table = _get_table(soup)

    # Headings are collected from the whole page, not just from `table`.
    # The first heading is dropped because the rows below only collect <td>
    # cells (presumably the leading cell of each row is a <th> -- TODO confirm).
    headings = [th.get_text() for th in soup.find_all("th", scope="col")][1:]

    rows = table.find('tbody').find_all('tr')
    data = [headings] + [_parse_row(row) for row in rows]

    data = pd.DataFrame(data)
    # Promote the first row (the headings) to column labels, then remove it.
    data = data.rename(columns=data.iloc[0])
    data = data.reindex(data.index.drop(0))
    data = data.replace('',0)
    return data

# this section gets the raw tables from FBRef.com
# One URL per player-level stat category; each is downloaded with get_df(), which
# returns the first table of the page.
# NOTE(review): these URLs point at the 2021-2022 season while the output file
# names at the top of the file say 2022-2023 -- confirm which season is intended.

standard = "https://fbref.com/en/comps/Big5/2021-2022/stats/players/2021-2022-Big-5-European-Leagues-Stats"
shooting = "https://fbref.com/en/comps/Big5/2021-2022/shooting/players/2021-2022-Big-5-European-Leagues-Stats"
passing = "https://fbref.com/en/comps/Big5/2021-2022/passing/players/2021-2022-Big-5-European-Leagues-Stats"
pass_types = "https://fbref.com/en/comps/Big5/2021-2022/passing_types/players/2021-2022-Big-5-European-Leagues-Stats"
gsca = "https://fbref.com/en/comps/Big5/2021-2022/gca/players/2021-2022-Big-5-European-Leagues-Stats"
defense = "https://fbref.com/en/comps/Big5/2021-2022/defense/players/2021-2022-Big-5-European-Leagues-Stats"
poss = "https://fbref.com/en/comps/Big5/2021-2022/possession/players/2021-2022-Big-5-European-Leagues-Stats"
misc = "https://fbref.com/en/comps/Big5/2021-2022/misc/players/2021-2022-Big-5-European-Leagues-Stats"

# Each call sleeps for a few seconds before requesting the page (see get_df).
df_standard = get_df(standard)
df_shooting = get_df(shooting)
df_passing = get_df(passing)
df_pass_types = get_df(pass_types)
df_gsca = get_df(gsca)
df_defense = get_df(defense)
df_poss = get_df(poss)
df_misc = get_df(misc)

# Put every raw table into the same order (player name, then squad) and renumber
# its rows 0..n-1. The tables are joined row-by-row on that index further down,
# so without this step you will run into issues with players who play minutes
# for 2 clubs in a season.

df_standard = df_standard.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_shooting = df_shooting.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_passing = df_passing.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_pass_types = df_pass_types.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_gsca = df_gsca.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_defense = df_defense.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_poss = df_poss.sort_values(['Player', 'Squad']).reset_index(drop=True)
df_misc = df_misc.sort_values(['Player', 'Squad']).reset_index(drop=True)

# Now the fun part... merging all raw tables into one.
# Change any column name you want to change:
# Example --   'Gls': 'Goals'  changes column "Gls" to be named "Goals", etc.
#
# How this works:
# * Columns are picked by POSITION (iloc), so every slice below depends on the
#   exact column layout of the scraped tables.
# * df.join() matches rows on the index, i.e. on the row order established by
#   the sort + reset_index step.
# * Each join is followed immediately by a rename. rename() acts on every column
#   that currently carries the given label, so generic labels such as 'Cmp' or
#   'Att' must be renamed before the next group with the same labels is joined.

# Identity columns plus two individually selected columns from the standard table.
df = df_standard.iloc[:, 0:10]
df = df.join(df_standard.iloc[:, 13])
df = df.join(df_standard.iloc[:, 26])
df = df.rename(columns={'G-PK': 'npGoals'})
# Shooting.
df = df.join(df_shooting.iloc[:,8:25])
df = df.rename(columns={'Gls': 'Goals', 'Sh': 'Shots', 'SoT': 'SoT', 'SoT%': 'SoT%', 'Sh/90': 'Sh/90', 'SoT/90': 'SoT/90', 'G/Sh': 'G/Sh', 'G/SoT': 'G/SoT', 'Dist': 'AvgShotDistance', 'FK': 'FKShots', 'PK': 'PK', 'PKatt': 'PKsAtt', 'xG': 'xG', 'npxG': 'npxG', 'npxG/Sh': 'npxG/Sh', 'G-xG': 'G-xG', 'np:G-xG': 'npG-xG'})

# Passing: joined in five slices so the repeated 'Cmp'/'Att'/'Cmp%' labels can be
# given total / short / medium / long names one group at a time.
df = df.join(df_passing.iloc[:,8:13])
df = df.rename(columns={'Cmp': 'PassesCompleted', 'Att': 'PassesAttempted', 'Cmp%': 'TotCmp%', 'TotDist': 'TotalPassDist', 'PrgDist': 'ProgPassDist', })
df = df.join(df_passing.iloc[:,13:16])
df = df.rename(columns={'Cmp': 'ShortPassCmp', 'Att': 'ShortPassAtt', 'Cmp%': 'ShortPassCmp%', })
df = df.join(df_passing.iloc[:,16:19])
df = df.rename(columns={'Cmp': 'MedPassCmp', 'Att': 'MedPassAtt', 'Cmp%': 'MedPassCmp%', })
df = df.join(df_passing.iloc[:,19:22])
df = df.rename(columns={'Cmp': 'LongPassCmp', 'Att': 'LongPassAtt', 'Cmp%': 'LongPassCmp%', })
df = df.join(df_passing.iloc[:,22:30])
df = df.rename(columns={'Ast': 'Assists', 'xA': 'xA', 'A-xA': 'A-xA', 'KP': 'KeyPasses', '1/3': 'Final1/3Cmp', 'PPA': 'PenAreaCmp', 'CrsPA': 'CrsPenAreaCmp', 'Prog': 'ProgPasses', })

# Pass types, then pass outcomes ('Out' is renamed differently in each group).
df = df.join(df_pass_types.iloc[:, 9:28])
df = df.rename(columns={'Live': 'LivePass', 'Dead': 'DeadPass', 'FK': 'FKPasses', 'TB': 'ThruBalls', 'Press': 'PassUnderPress', 'Sw': 'Switches', 'Crs': 'Crs', 'CK': 'CK', 'In': 'InSwingCK', 'Out': 'OutSwingCK', 'Str': 'StrCK', 'Ground': 'Ground', 'Low': 'Low', 'High': 'High', 'Left': 'Left', 'Right': 'Right', 'Head': 'Head', 'TI': 'ThrowIn', 'Other': 'Other', })
df = df.join(df_pass_types.iloc[:, 29:33])
df = df.rename(columns={'Off': 'PassesToOff', 'Out': 'PassesOOB', 'Int': 'PassesInt', 'Blocks': 'PassesBlocked', })

# Shot- and goal-creating actions.
# NOTE: the dict literal below repeats the keys 'PassLive', 'PassDead', 'Drib',
# 'Sh', 'Fld' and 'Def'. Python keeps only the LAST value for a repeated key, so
# both groups of those columns end up with the GCA* names (duplicate labels).
# The "final file" sections further down rename them after the CSV round trip.
df = df.join(df_gsca.iloc[:, 8:24])
df = df.rename(columns={'SCA': 'SCA', 'SCA90': 'SCA90', 'PassLive': 'SCAPassLive', 'PassDead': 'SCAPassDead', 'Drib': 'SCADrib', 'Sh': 'SCASh', 'Fld': 'SCAFld', 'Def': 'SCADef', 'GCA': 'GCA', 'GCA90': 'GCA90', 'PassLive': 'GCAPassLive', 'PassDead': 'GCAPassDead', 'Drib': 'GCADrib', 'Sh': 'GCASh', 'Fld': 'GCAFld', 'Def': 'GCADef', })

# Defensive actions.
# NOTE: same pattern -- 'Tkl', 'Def 3rd', 'Mid 3rd' and 'Att 3rd' appear twice in
# the dict, so only 'DrbTkl' and the *Press names are applied, to every column
# carrying those labels.
df = df.join(df_defense.iloc[:,8:31])
df = df.rename(columns={'Tkl': 'Tkl', 'TklW': 'TklWinPoss', 'Def 3rd': 'Def3rdTkl', 'Mid 3rd': 'Mid3rdTkl', 'Att 3rd': 'Att3rdTkl', 'Tkl': 'DrbTkl', 'Att': 'DrpPastAtt', 'Tkl%': 'DrbTkl%', 'Past': 'DrbPast', 'Press': 'Press', 'Succ': 'PressSucc', '%': 'PressSucc%', 'Def 3rd': 'Def3rdPress', 'Mid 3rd': 'Mid3rdPress', 'Att 3rd': 'Att3rdPress', 'Blocks': 'Blocks', 'Sh': 'ShBlocks', 'ShSv': 'ShSvBlocks', 'Pass': 'PassBlocks', 'Int': 'Int', 'Tkl+Int': 'Tkl+Int', 'Clr': 'Clr', 'Err': 'Err', })

# Possession; the last possession column is joined separately so its 'Prog'
# label can get its own name.
df = df.join(df_poss.iloc[:,8:31])
df = df.rename(columns={'Touches': 'Touches', 'Def Pen': 'DefPenTouch', 'Def 3rd': 'Def3rdTouch', 'Mid 3rd': 'Mid3rdTouch', 'Att 3rd': 'Att3rdTouch', 'Att Pen': 'AttPenTouch', 'Live': 'LiveTouch', 'Succ': 'SuccDrb', 'Att': 'AttDrb', 'Succ%': 'DrbSucc%', '#Pl': 'PlDrbPast', 'Megs': 'Megs', 'Carries': 'Carries', 'TotDist': 'TotCarryDist', 'PrgDist': 'PrgCarryDist', 'Prog': 'ProgCarries', '1/3': 'CarriesToFinal3rd', 'CPA': 'CarriesToPenArea', 'Mis': 'CarryMistakes', 'Dis': 'Disposesed', 'Targ': 'PassTarget', 'Rec': 'ReceivedPass', 'Rec%': 'Receive%', })
df = df.join(df_poss.iloc[:,31])
df = df.rename(columns={'Prog': 'ProgPassesRec'})

# Miscellaneous (cards, fouls, aerial duels, ...), in two slices.
df = df.join(df_misc.iloc[:, 8:14])
df = df.rename(columns={'CrdY': 'Yellows', 'CrdR': 'Reds', '2CrdY': 'Yellow2', 'Fls': 'Fls', 'Fld': 'Fld', 'Off': 'Off', })
df = df.join(df_misc.iloc[:,17:24])
df = df.rename(columns={'PKwon': 'PKwon', 'PKcon': 'PKcon', 'OG': 'OG', 'Recov': 'Recov', 'Won': 'AerialWins', 'Lost': 'AerialLoss', 'Won%': 'AerialWin%', })

# Make sure to drop all blank rows (FBRef's tables have several)
df.dropna(subset = ["Player"], inplace=True)

# Turn the minutes column (position 9) into plain digits, '1,500' -> '1500',
# otherwise it can't be converted to a number.
# This is one positional, whole-column assignment. The previous per-row form
# `df.iloc[i][9] = ...` assigned into a temporary row object, which pandas does
# not guarantee to write back into df (and never does under copy-on-write).
# astype(str) also copes with the integer 0 that get_df() puts in empty cells.
df.iloc[:, 9] = df.iloc[:, 9].astype(str).str.replace(',', '', regex=False)

# Everything from the minutes column onwards is numeric data.
df.iloc[:,9:] = df.iloc[:,9:].apply(pd.to_numeric)

# Save the file to the root location
df.to_csv("%s%s.csv" %(root, raw_nongk), index=False)


##################################################################################
############################## GK SECTION ########################################
##################################################################################

# Goalkeeper tables (basic and advanced).
# NOTE(review): unlike the outfield URLs above, these have no season segment in
# the path -- confirm they return the same season as the outfield data.
gk = "https://fbref.com/en/comps/Big5/keepers/players/Big-5-European-Leagues-Stats"
advgk = "https://fbref.com/en/comps/Big5/keepersadv/players/Big-5-European-Leagues-Stats"

df_gk = get_df(gk)
df_advgk = get_df(advgk)

# Same ordering as the outfield tables: player name, then squad.
df_gk.sort_values(['Player', 'Squad'], ascending=[True, True], inplace=True)
df_advgk.sort_values(['Player', 'Squad'], ascending=[True, True], inplace=True)

df_gk = df_gk.reset_index(drop=True)
df_advgk = df_advgk.reset_index(drop=True)

###############################################################################

# Start from the raw outfield file and keep only rows whose position contains "GK".
# reset_index().iloc[:,1:] renumbers the rows and drops the old index column.
df = pd.read_csv("%s%s.csv" %(root, raw_nongk))
df = df[df['Pos'].str.contains("GK")].reset_index().iloc[:,1:]
# Apply the same filter to the keeper table; Pos is cast to str first so that
# .str.contains works on every row.
df_gk['Pos'] = df_gk['Pos'].astype(str)
df_gk = df_gk[df_gk['Pos'].str.contains('GK')]
df_gk = df_gk.reset_index().iloc[:,1:]
df_gk = df_gk.rename(columns={'PKatt':'PKsFaced'})

# The join is positional (row i of df with row i of df_gk).
# NOTE(review): this assumes both filtered frames list exactly the same keepers
# in the same order -- nothing here checks it.
# lsuffix/rsuffix only apply to column labels present on both sides.
df = df.join(df_gk.iloc[:, 11:26].astype(float), lsuffix='.1', rsuffix='.2')
# NOTE: 'Save%' appears twice in this dict; the last value ('PKSave%') wins.
df = df.rename(columns={'GA': 'GA', 'GA90': 'GA90', 'SoTA': 'SoTA', 'Saves': 'Saves', 'Save%': 'Save%', 'W': 'W', 'D': 'D', 'L': 'L', 'CS': 'CS', 'CS%': 'CS%', 'PKsFaced': 'PKsFaced', 'PKA': 'PKA', 'PKsv': 'PKsv', 'PKm': 'PKm', 'Save%': 'PKSave%', })

df_advgk['Pos'] = df_advgk['Pos'].astype(str)
df_advgk = df_advgk[df_advgk['Pos'].str.contains('GK')]
df_advgk = df_advgk.reset_index().iloc[:,1:]
# NOTE: 'Att', 'Launch%' and 'AvgLen' are repeated keys here as well, so only
# their last values ('GoalKicksAtt', 'GoalKicksLaunch%', 'AvgLen') are applied.
df_advgk = df_advgk.rename(columns={'PKA': 'PKGA', 'FK': 'FKGA', 'CK': 'CKGA', 'OG': 'OGA', 'PSxG': 'PSxG', 'PSxG/SoT': 'PSxG/SoT', 'PSxG+/-': 'PSxG+/-', 'PSxG+/- /90': 'PSxG+/- /90', 'Cmp': 'LaunchCmp', 'Att': 'LaunchAtt', 'Cmp%': 'LaunchPassCmp%', 'Att': 'PassAtt', 'Thr': 'PassThr', 'Launch%': 'PassesLaunch%', 'AvgLen': 'AvgLenLaunch', 'Att': 'GoalKicksAtt', 'Launch%': 'GoalKicksLaunch%', 'AvgLen': 'AvgLen', 'Opp': 'OppCrs', 'Stp': 'StpCrs', 'Stp%': 'CrsStp%', '#OPA': '#OPA', '#OPA/90': '#OPA/90', 'AvgDist': 'AvgDistOPA', })

df = df.join(df_advgk.iloc[:,9:33].astype(float), lsuffix='.1', rsuffix='.2')

# Save the raw keeper file next to the raw outfield file.
df.to_csv("%s%s.csv" %(root,raw_gk), index=False)


##################################################################################
##################### Final file for outfield data ###############################
##################################################################################

# Two copies of the raw outfield file: `df` stays untouched, `df_90s` is turned
# into per-90 values and then attached to `df` under '<name>Per90' labels.
df = pd.read_csv("%s%s.csv" % (root, raw_nongk))
df_90s = pd.read_csv("%s%s.csv" % (root, raw_nongk))

# Number of full 90-minute matches played.
df_90s['90s'] = df_90s['Min'].div(90)

# Divide the stat columns (positions 10..148) by the 90s played.
# NOTE(review): the upper bound is hard-coded; confirm it matches the number of
# columns in the raw file (the last stat column may currently be left undivided).
for i in range(10, 149):
    df_90s.iloc[:, i] = df_90s.iloc[:, i].div(df_90s['90s'])

# Keep everything from position 10 on (this includes the helper '90s' column).
df_90s = df_90s.iloc[:, 10:].add_suffix('Per90')
df_new = df.join(df_90s)

## Kept in case FBRef ever reports age as "24-231" (years-days) again:
# for i in range(len(df_new)):
#     df_new['Age'][i] = df_new['Age'][i][:2]

df_new.to_csv("%s%s.csv" % (root, final_nongk), index=False)


##################################################################################
##################### Final file for keeper data #################################
##################################################################################

# Same per-90 treatment as the outfield file, applied to the raw keeper file:
# `df` stays untouched, `df_90s` is converted and attached as '<name>Per90'.
df = pd.read_csv("%s%s.csv" % (root, raw_gk))
df_90s = pd.read_csv("%s%s.csv" % (root, raw_gk))

# Number of full 90-minute matches played.
df_90s['90s'] = df_90s['Min'].div(90)

# Divide the stat columns (positions 10..188) by the 90s played.
# NOTE(review): the upper bound is hard-coded; confirm it matches the number of
# columns in the raw keeper file.
for i in range(10, 189):
    df_90s.iloc[:, i] = df_90s.iloc[:, i].div(df_90s['90s'])

# Keep everything from position 10 on (this includes the helper '90s' column).
df_90s = df_90s.iloc[:, 10:].add_suffix('Per90')
df_new = df.join(df_90s)

## Kept in case FBRef ever reports age as "24-231" (years-days) again:
# for i in range(len(df_new)):
#     df_new['Age'][i] = df_new['Age'][i][:2]

df_new.to_csv("%s%s.csv" % (root, final_gk), index=False)


##################################################################################
################ Download team data, for possession-adjusting ####################
##################################################################################

# Squad-level pages: standard stats and possession stats.
standard = "https://fbref.com/en/comps/Big5/stats/squads/Big-5-European-Leagues-Stats"
poss = "https://fbref.com/en/comps/Big5/possession/squads/Big-5-European-Leagues-Stats"

df_standard = get_df(standard).reset_index(drop=True)
df_poss = get_df(poss).reset_index(drop=True)

# Work on the first 30 columns of the standard table.
df = df_standard.iloc[:, 0:30]

# Touches per 90 for each team: possession-table column 5 divided by column 4.
# Rows of the two squad tables are paired by position. The new column is written
# through position 30, i.e. directly after the 30 columns selected above.
df['TeamTouches90'] = 0.0
for i in range(len(df)):
    team_touches = float(df_poss.iloc[i, 5])
    team_90s = float(df_poss.iloc[i, 4])
    df.iloc[i, 30] = team_touches / team_90s

# Remove the thousands separator from the minutes, as for the player tables.
for j in range(len(df)):
    df.at[j, 'Min'] = df.at[j, 'Min'].replace(',', '')

# Columns from position 7 onwards hold numeric data.
df.iloc[:, 7:] = df.iloc[:, 7:].apply(pd.to_numeric)
df.to_csv("%s%s TEAMS.csv" % (root, final_nongk), index=False)


##################################################################################
################ Download opposition data, for possession-adjusting ##############
##################################################################################

opp_poss = "https://fbref.com/en/comps/Big5/possession/squads/Big-5-European-Leagues-Stats"

# NOTE(review): this is the same URL as the team possession page, and get_df()
# always reads the FIRST table of a page. _get_opp_table (second table) is never
# called anywhere, so 'Opp Touches' may in fact hold each team's own touches --
# confirm which table is intended before relying on this column.
df_opp_poss = get_df(opp_poss)
df_opp_poss = df_opp_poss.reset_index(drop=True)

############################################

df = df_opp_poss.iloc[:, 0:30]
df = df.rename(columns={'Touches':'Opp Touches'})
df = df.reset_index()

#############################################

df1 = pd.read_csv("%s%s TEAMS.csv"%(root, final_nongk))

# Copy the touches over by row position, in a single column assignment.
# The previous per-cell form `df1['Opp Touches'][i] = ...` was a chained
# assignment, which pandas does not guarantee to write back into df1 (and which
# silently does nothing under copy-on-write).
df1['Opp Touches'] = pd.to_numeric(df['Opp Touches'].iloc[:len(df1)]).to_numpy()
df1 = df1.rename(columns={'Min':'Team Min'})
df1.to_csv("%s%s TEAMS.csv" %(root, final_nongk), index=False)


##################################################################################
################ Make the final, complete, outfield data file ####################
##################################################################################

df = pd.read_csv("%s%s.csv" %(root, final_nongk))
teams = pd.read_csv("%s%s TEAMS.csv" %(root, final_nongk))

# Attach each row's team context (possession, opposition touches, team minutes,
# team touches per 90) using that ROW'S OWN squad.
# The previous loop searched by player name and used the first match, so a
# player listed for two clubs received the first club's values on both rows.
team_lookup = teams.drop_duplicates(subset='Squad').set_index('Squad')
unknown_squads = df.loc[~df['Squad'].isin(team_lookup.index), 'Squad'].unique()
if len(unknown_squads):
    # Fail loudly rather than writing rows with silently missing team data.
    raise KeyError("Squads missing from the TEAMS file: %s" % list(unknown_squads))

df['AvgTeamPoss'] = df['Squad'].map(team_lookup['Poss'])
df['OppTouches'] = df['Squad'].map(team_lookup['Opp Touches'])
df['TeamMins'] = df['Squad'].map(team_lookup['Team Min'])
df['TeamTouches90'] = df['Squad'].map(team_lookup['TeamTouches90'])

# All of these are the possession-adjusted columns: the per-90 value is divided
# by the share of possession the team did NOT have (100 - AvgTeamPoss) and
# multiplied by 30.
# 'TklWinPoss' is now adjusted from its own column; it used to be computed from
# 'DrbTklPer90' (copy-paste slip), which made it a duplicate of pAdjDrbTklPer90.
out_of_possession = 100 - df['AvgTeamPoss']
for stat in ['Tkl+Int', 'PressSucc', 'Clr', 'ShBlocks', 'Int', 'DrbTkl',
             'TklWinPoss', 'DrbPast', 'AerialWins', 'AerialLoss', 'DrpPastAtt']:
    df['pAdj' + stat + 'Per90'] = (df[stat + 'Per90'] / out_of_possession) * 30

# A couple of touch-adjusted ones
df['TouchCentrality'] = (df['TouchesPer90']/df['TeamTouches90'])*100
df['Tkl+IntOppTouch'] = df['Tkl+Int'] /(df['OppTouches']*(df['Min']/df['TeamMins']))*600
df['pAdjTouchesPer90'] = (df['TouchesPer90']/(df['AvgTeamPoss']))*30
df['pAdjCarriesPer90'] = (df['Carries']/(df['Touches']))*60


# Finally, rename some of the columns that I always found were acting up:
# the first set of GCA* labels becomes SCA*, and the '.1' set becomes GCA*.
df = df.rename(columns={'GCAPassLive': 'SCAPassLive', 'GCAPassDead': 'SCAPassDead', 'GCADrib': 'SCADrib', 'GCASh': 'SCASh', 'GCAFld': 'SCAFld', 'GCADef': 'SCADef', 'GCAPassLive.1': 'GCAPassLive', 'GCAPassDead.1': 'GCAPassDead', 'GCADrib.1': 'GCADrib', 'GCASh.1': 'GCASh', 'GCAFld.1': 'GCAFld', 'GCADef.1': 'GCADef', })

# Now we'll add the players' actual positions, from @jaseziv, into the file
# (left merge on the player name, so every row of df is kept).
tm_pos = pd.read_csv('https://github.com/griffisben/Soccer-Analyses/blob/main/TransfermarktPositions-Jase_Ziv83.csv?raw=true')
df = pd.merge(df, tm_pos, on ='Player', how ='left')

df.to_csv("%s%s.csv" %(root, final_nongk), index=False)


##################################################################################
################ Make the final, complete, keepers data file #####################
##################################################################################

df = pd.read_csv("%s%s.csv" %(root, final_gk))
teams = pd.read_csv("%s%s TEAMS.csv" %(root, final_nongk))

# Attach each row's team context (possession, opposition touches, team minutes,
# team touches per 90) using that ROW'S OWN squad.
# The previous loop searched by player name and used the first match, so a
# keeper listed for two clubs received the first club's values on both rows.
team_lookup = teams.drop_duplicates(subset='Squad').set_index('Squad')
unknown_squads = df.loc[~df['Squad'].isin(team_lookup.index), 'Squad'].unique()
if len(unknown_squads):
    # Fail loudly rather than writing rows with silently missing team data.
    raise KeyError("Squads missing from the TEAMS file: %s" % list(unknown_squads))

df['AvgTeamPoss'] = df['Squad'].map(team_lookup['Poss'])
df['OppTouches'] = df['Squad'].map(team_lookup['Opp Touches'])
df['TeamMins'] = df['Squad'].map(team_lookup['Team Min'])
df['TeamTouches90'] = df['Squad'].map(team_lookup['TeamTouches90'])

# Same thing, makes pAdj stats for the GK file: the per-90 value is divided by
# the share of possession the team did NOT have and multiplied by 30.
# 'TklWinPoss' is now adjusted from its own column; it used to be computed from
# 'DrbTklPer90' (copy-paste slip), which made it a duplicate of pAdjDrbTklPer90.
out_of_possession = 100 - df['AvgTeamPoss']
for stat in ['Tkl+Int', 'PressSucc', 'Clr', 'ShBlocks', 'Int', 'DrbTkl',
             'TklWinPoss', 'DrbPast', 'AerialWins', 'AerialLoss', 'DrpPastAtt']:
    df['pAdj' + stat + 'Per90'] = (df[stat + 'Per90'] / out_of_possession) * 30

df['TouchCentrality'] = (df['TouchesPer90']/df['TeamTouches90'])*100
df['pAdj#OPAPer90'] = (df['#OPAPer90'] / out_of_possession) * 30
df['Tkl+IntOppTouch'] = df['Tkl+Int'] /(df['OppTouches']*(df['Min']/df['TeamMins']))*600
df['pAdjTouchesPer90'] = (df['TouchesPer90']/(df['AvgTeamPoss']))*30
df['pAdjCarriesPer90'] = (df['Carries']/(df['Touches']))*60 # 60 touches is roughly average for MFs in a match


# The first set of GCA* labels becomes SCA*, and the '.1' set becomes GCA*.
df = df.rename(columns={'GCAPassLive': 'SCAPassLive', 'GCAPassDead': 'SCAPassDead', 'GCADrib': 'SCADrib', 'GCASh': 'SCASh', 'GCAFld': 'SCAFld', 'GCADef': 'SCADef', 'GCAPassLive.1': 'GCAPassLive', 'GCAPassDead.1': 'GCAPassDead', 'GCADrib.1': 'GCADrib', 'GCASh.1': 'GCASh', 'GCAFld.1': 'GCAFld', 'GCADef.1': 'GCADef', })

# Just adding the main positions to the GK too, but of course, they will all be GK lol. Keeps other program variables clean
tm_pos = pd.read_csv('https://github.com/griffisben/Soccer-Analyses/blob/main/TransfermarktPositions-Jase_Ziv83.csv?raw=true')
df = pd.merge(df, tm_pos, on ='Player', how ='left')

df.to_csv("%s%s.csv" %(root, final_gk), index=False)
print('Done :) Files are located at  %s' %root)

# Code

## Variables

In [None]:
# Names of the per-90 feature columns used for similarity search and clustering.
# Each entry is a base stat name from the final outfield file with the 'Per90'
# suffix appended; the order is significant because it fixes the column order of
# the feature matrix.
all_cols = [stat + 'Per90' for stat in (
    # goals and shooting
    'npGoals', 'npxG+xA', 'Goals', 'Shots', 'SoT', 'SoT%', 'Sh/90', 'SoT/90',
    'G/Sh', 'G/SoT', 'AvgShotDistance', 'FKShots', 'PK', 'PKsAtt', 'xG', 'npxG',
    'npxG/Sh', 'G-xG', 'npG-xG',
    # passing
    'PassesCompleted', 'PassesAttempted', 'TotCmp%', 'TotalPassDist', 'ProgPassDist',
    'ShortPassCmp', 'ShortPassAtt', 'ShortPassCmp%',
    'MedPassCmp', 'MedPassAtt', 'MedPassCmp%',
    'LongPassCmp', 'LongPassAtt', 'LongPassCmp%',
    'Assists', 'xA', 'A-xA', 'KeyPasses',
    'Final1/3Cmp', 'PenAreaCmp', 'CrsPenAreaCmp', 'ProgPasses',
    # pass types and outcomes
    'LivePass', 'DeadPass', 'FKPasses', 'ThruBalls', 'PassUnderPress', 'Switches',
    'Crs', 'CK', 'InSwingCK', 'OutSwingCK', 'StrCK',
    'Ground', 'Low', 'High', 'Left', 'Right', 'Head', 'ThrowIn', 'Other',
    'PassesToOff', 'PassesOOB', 'PassesInt', 'PassesBlocked',
    # shot- and goal-creating actions (duplicated labels carry a '.1' suffix)
    'SCA', 'SCA90',
    'GCAPassLive', 'GCAPassDead', 'GCADrib', 'GCASh', 'GCAFld', 'GCADef',
    'GCA', 'GCA90',
    'GCAPassLive.1', 'GCAPassDead.1', 'GCADrib.1', 'GCASh.1', 'GCAFld.1', 'GCADef.1',
    # defensive actions
    'DrbTkl', 'TklWinPoss', 'Def3rdPress', 'Mid3rdPress', 'Att3rdPress',
    'DrbTkl.1', 'DrpPastAtt', 'DrbTkl%', 'DrbPast',
    'Press', 'PressSucc', 'PressSucc%',
    'Def3rdPress.1', 'Mid3rdPress.1', 'Att3rdPress.1',
    'Blocks', 'ShBlocks', 'ShSvBlocks', 'PassBlocks',
    'Int', 'Tkl+Int', 'Clr', 'Err',
    # possession
    'Touches', 'DefPenTouch', 'Def3rdTouch', 'Mid3rdTouch', 'Att3rdTouch',
    'AttPenTouch', 'LiveTouch',
    'SuccDrb', 'AttDrb', 'DrbSucc%', 'PlDrbPast', 'Megs',
    'Carries', 'TotCarryDist', 'PrgCarryDist', 'ProgCarries',
    'CarriesToFinal3rd', 'CarriesToPenArea', 'CarryMistakes', 'Disposesed',
    'PassTarget', 'ReceivedPass', 'Receive%', 'ProgPassesRec',
    # miscellaneous
    'Yellows', 'Reds', 'Yellow2', 'Fls', 'Fld', 'Off',
    'PKwon', 'PKcon', 'OG', 'Recov',
    'AerialWins', 'AerialLoss', 'AerialWin%',
)]

# Split the features by whether the name contains a '%' sign.
perc_cols = [col for col in all_cols if '%' in col]

non_perc_cols = [col for col in all_cols if '%' not in col]

## Utility funcs

In [None]:
def normalize(v):
    """Scale ``v`` to unit Euclidean length.

    A vector whose norm is zero cannot be scaled and is returned unchanged
    (the very same object).
    """
    length = np.linalg.norm(v)
    return v if length == 0 else v / length

## Make Dataset

In [None]:
# Load the finished outfield file.
# NOTE(review): this is a hard-coded absolute path at the filesystem root rather
# than a path built from `root` and `final_nongk` -- confirm the file is there.
df = pd.read_csv("/Final FBRef 2022-2023.csv")

In [None]:
# Keep only rows with at least 1000 minutes played. The original index labels
# are kept, so later code addresses rows by position (iloc), not by label.
df = df.loc[df['Min'] >= 1000]
# df.loc[df["Squad"] == "Real Madrid"]
df

In [None]:
# Feature matrix: one row per remaining player, one column per entry of
# `all_cols` (defined in the variables cell), in that order. This replaces a
# second verbatim copy of the same 140 column names, so the feature set is now
# maintained in one place.
X = df[all_cols].to_numpy()

In [None]:
X

## Return 5 most similar players

In [None]:
from difflib import get_close_matches
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read a (possibly misspelled) player name from the user.
keyword = input()

# Fuzzy-match it against the player names in df. The result is a list of the
# closest names, best match first; it is empty when nothing is close enough.
search_result = get_close_matches(keyword, df['Player'])
search_result

In [None]:
# Stop with a clear message when the fuzzy search found nothing, instead of
# failing on search_result[0] with a bare IndexError.
if not search_result:
    raise ValueError("No player name close enough to the search keyword was found")

# Row position of the best-matching player. When the name occurs on several
# rows the last one is used, as before.
matching_rows = [pos for pos, name in enumerate(df['Player']) if name == search_result[0]]
key_index = matching_rows[-1]

# Cosine distance from every other player's feature vector to the selected one
# (0 = same direction, i.e. most similar).
cos_sims = [
    (df['Player'].iloc[i], distance.cosine(X[i], X[key_index]))
    for i in range(X.shape[0])
    if i != key_index
]

cos_sims.sort(key = lambda x : x[1])

# The five most similar players, as promised by the section title
# (previously only the first two were shown).
cos_sims[:5]

## Clustering

### Basic

In [None]:
from itertools import combinations
import numpy as np
from sklearn.cluster import KMeans

def computeClusterCentroids(X, y):
    """Return the mean feature vector of every cluster.

    Parameters
    ----------
    X : 2-D array with one sample per row.
    y : 1-D array holding the cluster label of each row of ``X``.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to the column-wise mean of the rows
        of ``X`` carrying that label.
    """
    return {
        label: np.mean(X[y == label, :], axis = 0)
        for label in np.unique(y)
    }

def computeNumerator(X, y, cluster_centroids):
    """Return the total Euclidean distance of all samples to their own centroid.

    Parameters
    ----------
    X : 2-D array with one sample per row.
    y : 1-D array holding the cluster label of each row of ``X``.
    cluster_centroids : dict mapping a label to that cluster's centroid vector.

    Returns
    -------
    The sum, over every label in ``cluster_centroids``, of the distances between
    that cluster's members and its centroid (0 when the dict is empty).
    """
    total = 0
    for label, centroid in cluster_centroids.items():
        members = X[y == label, :]
        squared_offsets = np.power(members - centroid, 2)
        # Row-wise Euclidean distance, then summed over the cluster's members.
        total = total + np.sum(np.sqrt(np.sum(squared_offsets, axis = 1)))
    return total

def computeDenominator(cluster_centroids):
    """Return the summed Euclidean distance between every pair of centroids.

    ``cluster_centroids`` maps a label to that cluster's centroid vector.
    With fewer than two centroids there are no pairs and the result is 0.
    """
    total = 0
    for first, second in combinations(cluster_centroids.keys(), 2):
        gap = cluster_centroids[first] - cluster_centroids[second]
        total += np.sqrt(np.sum(np.power(gap, 2)))
    return total

def computeXieBini(X, y):
    """Return a compactness / separation score for the clustering ``y`` of ``X``.

    The score is the total sample-to-centroid distance (computeNumerator)
    divided by the total pairwise centroid distance (computeDenominator), so
    lower values mean tighter, better separated clusters.
    NOTE(review): this is a ratio in the spirit of the Xie-Beni index, not its
    textbook formula -- confirm this is intended.
    """
    centroids = computeClusterCentroids(X, y)
    within = computeNumerator(X, y, centroids)
    between = computeDenominator(centroids)
    return within / between

def make_clusters(X, low, high, random_state=None):
    """Cluster ``X`` with KMeans, choosing the cluster count by best score.

    Every cluster count from ``low`` to ``high`` (inclusive) is fitted once and
    scored with computeXieBini; the labels of the lowest-scoring fit are
    returned. On ties the smallest cluster count wins.

    Previously the chosen cluster count was fitted a second time and the labels
    of that new fit were returned. As KMeans starts from random centres, those
    labels were not the ones that had been scored. The scored labels are now
    kept and returned directly, which also saves one fit.

    Parameters
    ----------
    X : 2-D array with one sample per row.
    low, high : int
        Inclusive range of cluster counts to try.
    random_state : int or None, optional
        Passed to KMeans; set it to make runs reproducible. The default keeps
        the previous unseeded behaviour.

    Returns
    -------
    1-D array with the cluster label of each row of ``X``.

    Raises
    ------
    ValueError
        When ``low`` is greater than ``high`` (nothing to try).
    """
    best_score = None
    best_labels = None
    for n in range(low, high + 1, 1):
        # Progress output: the cluster count currently being fitted.
        print(n)
        model = KMeans(n_clusters=n, random_state=random_state)
        labels = model.fit_predict(X)
        score = computeXieBini(X, labels)
        if best_score is None or score < best_score:
            best_score = score
            best_labels = labels

    if best_labels is None:
        raise ValueError("make_clusters needs low <= high, got low=%r, high=%r" % (low, high))
    return best_labels

In [None]:
# Fixed-size alternative, kept for reference:
# model = KMeans(n_clusters=12)
# y = model.fit_predict(X)

# Cluster the feature matrix, letting make_clusters try every cluster count
# from 50 to 55 (inclusive) and keep the best-scoring one.
y = make_clusters(X, 50, 55)

In [None]:
# Group the player names by cluster label: {label: [player, player, ...]}.
# Row i of the label array corresponds to row i (by position) of df.
clusters_dict = {}

for i in range(len(y)):
    clusters_dict.setdefault(y[i], []).append(df['Player'].iloc[i])

In [None]:
clusters_dict

### Clustering With MLE PCA

In [None]:
from sklearn.decomposition import PCA

# Reduce the feature matrix with PCA, letting the 'mle' option pick the number
# of components automatically.
# NOTE: X is overwritten, so every later cell that reads X works on the reduced
# matrix, not on the original per-90 features.
pca = PCA(n_components='mle')
X = pca.fit_transform(X)

X.shape

In [None]:
# Cluster the current X, trying 30 to 35 clusters (inclusive).
y = make_clusters(X, 30, 35)

# Group the player names by cluster label: {label: [player, player, ...]}.
clusters_dict = {}

for i in range(len(y)):
    clusters_dict.setdefault(y[i], []).append(df['Player'].iloc[i])

In [None]:
clusters_dict

### Clustering with n Component PCA

In [None]:
# Number of principal components to keep.
n = 100

from sklearn.decomposition import PCA

# NOTE(review): this transforms whatever X currently holds. If the MLE-PCA cell
# above was run first, X is already a reduced matrix rather than the original
# features -- confirm that chaining the two reductions is intended.
pca = PCA(n_components=n)
X = pca.fit_transform(X)

X.shape

In [None]:
# Cluster the current X, trying 30 to 35 clusters (inclusive).
y = make_clusters(X, 30, 35)

# Group the player names by cluster label: {label: [player, player, ...]}.
clusters_dict = {}

for i in range(len(y)):
    clusters_dict.setdefault(y[i], []).append(df['Player'].iloc[i])

In [None]:
clusters_dict

### Clustering With Gaussian Random Projection

In [None]:
from sklearn.random_projection import GaussianRandomProjection

# Project the current X with a Gaussian random projection (eps=0.99).
# Unlike the PCA cells, the result goes into X_new and X itself is left as is.
transformer = GaussianRandomProjection(eps=0.99)
X_new = transformer.fit_transform(X)

X_new.shape

In [None]:
# Cluster the randomly projected data, trying 30 to 35 clusters (inclusive).
y = make_clusters(X_new, 30, 35)

# Group the player names by cluster label: {label: [player, player, ...]}.
clusters_dict = {}

for i in range(len(y)):
    clusters_dict.setdefault(y[i], []).append(df['Player'].iloc[i])

In [None]:
clusters_dict

### Clustering with Normalized data with MLE PCA

In [None]:
import numpy as np

# Rebuild the feature matrix from the original data, this time without the
# percentage columns, and scale every player's row to unit length.
X = df[non_perc_cols].to_numpy()

for i, player_vector in enumerate(X):
    X[i] = normalize(player_vector)

X.shape

In [None]:
from sklearn.decomposition import PCA

# Reduce the row-normalised feature matrix with PCA, letting the 'mle' option
# pick the number of components. X is overwritten with the reduced matrix.
pca = PCA(n_components='mle')
X = pca.fit_transform(X)

X.shape

In [None]:
# Cluster the current X, trying 30 to 35 clusters (inclusive).
y = make_clusters(X, 30, 35)

# Group the player names by cluster label: {label: [player, player, ...]}.
clusters_dict = {}

for i in range(len(y)):
    clusters_dict.setdefault(y[i], []).append(df['Player'].iloc[i])

clusters_dict