# Soccer player scouting through K-Means clustering using PCA

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading in webscraped data from fbref.com

In [13]:
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/aaronwoodward1/Soccer_Analytics/main/fbref_outfield_players.csv')
df.head()

  df = pd.read_csv('https://raw.githubusercontent.com/aaronwoodward1/Soccer_Analytics/main/fbref_outfield_players.csv')


Unnamed: 0,Player,Nation,Position,Squad,Age,Born,MP,Starts,Min,No_90s,...,CrdY2,Fls_Comm,Offsides,PK_Won,PK_Conv,Own_Goal,Ball_Recoveries,Aerial Duels_Won,Aerial Duels_Won%,League
0,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,1237,13.7,...,0.0,12.0,2.0,0.0,1.0,0.0,75.0,5.0,31.3,Premier League
1,Brenden Aaronson,us USA,"MF,FW",Union Berlin,22,2000,30,14,1267,14.1,...,1.0,15.0,5.0,0.0,0.0,0.0,88.0,13.0,44.8,Bundesliga
2,Paxten Aaronson,us USA,MF,Eint Frankfurt,19,2003,7,1,101,1.1,...,0.0,6.0,0.0,0.0,0.0,0.0,5.0,3.0,100.0,Bundesliga
3,Keyliane Abdallah,fr FRA,FW,Marseille,17,2006,1,0,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ligue 1
4,Yunis Abdelhamid,ma MAR,DF,Reims,35,1987,31,31,2781,30.9,...,0.0,26.0,0.0,0.0,0.0,1.0,149.0,61.0,62.2,Ligue 1


## Inspecting the dataframe

In [None]:
df.info()

## Data engineering "Age" column

In [3]:
# Splititng the Age in years and days
df[['Age_yrs','Age_days']] = df.Age.apply(lambda x: pd.Series(str(x).split("-")))

# We just want Age in years with no hyphens, so we'll drop the original "Age" column and the newly "Age_days" columns. 
df = df.drop(['Age','Age_days'], axis=1)

# Rename the "Age_yrs" column as "Age"
df = df.rename(columns={'Age_yrs':'Age'})

# Convert "Age" into an integer datatype so that we can filter the dataframe by age.
df['Age'] = df['Age'].astype('int')


## Filtering by players that are 25 and younger

In [5]:
df = df[df['Age'] < 26]

## Observing the different positions in the dataframe

In [7]:
df['Position'].unique()

## Filtering by position

In [9]:
positions = ['MF','MF,DF']
df = df[df['Position'].isin(positions)]

## Filtering eligible players (players that we want in our dataset) by number of per 90 minutes played

In [11]:
df = df[df['No_90s']>=5.0]

## Shape of the filtered dataframe

In [10]:
df.shape

(416, 143)

## Creating a DataFrame with only stat categories that are more relevant for a certain position

In [29]:
# Slicing the DataFrame based on statistical categories relevant to central midfielders
cenmf_df = df[['Player','No_90s','npxG+xAG_90', 'Prg_Carr', 'Prg_Pass','Prg_Pass_Rec',
                   'Take-Ons_Succ','Take-Ons_Succ%','Carries_PrgDist','Carries_1/3',
                   'Medium_Cmp','Medium_Cmp%','Long_Cmp','Long_Cmp%','KP','Pass_Fin_3rd',
                   'Pass Types_Sw','SCA_90','GCA_90','Tkls','Tkls_Won','Drib_Tkl','Int',
                   'Ball_Recoveries']]


In [30]:
cenmf_df.head(10)

Unnamed: 0,Player,No_90s,npxG+xAG_90,Prg_Carr,Prg_Pass,Prg_Pass_Rec,Take-Ons_Succ,Take-Ons_Succ%,Carries_PrgDist,Carries_1/3,Medium_Cmp,Medium_Cmp%,Long_Cmp,Long_Cmp%,KP,Pass_Fin_3rd,Pass Types_Sw,SCA_90,GCA_90,Tkls,Tkls_Won,Drib_Tkl,Int,Ball_Recoveries
5,Salis Abdul Samed,16.9,0.08,9,78,20,7.0,36.8,1683.0,33.0,330.0,91.7,41.0,75.9,6.0,87.0,1,1.6,0.18,21.0,14.0,8.0,12.0,89.0
28,Yacine Adli,15.6,0.16,30,125,26,22.0,68.8,2144.0,27.0,487.0,91.2,120.0,67.8,22.0,149.0,16,3.97,0.51,34.0,17.0,13.0,20.0,94.0
35,Lucien Agoume,8.6,0.17,7,35,9,7.0,87.5,590.0,3.0,120.0,88.9,31.0,73.8,9.0,36.0,2,2.43,0.12,23.0,18.0,9.0,7.0,48.0
69,Carles Aleñá,11.3,0.25,12,23,45,6.0,21.4,402.0,14.0,114.0,82.6,38.0,64.4,18.0,31.0,4,3.18,0.18,24.0,8.0,10.0,4.0,53.0
84,Sergi Altimira,6.3,0.07,10,31,8,6.0,54.5,465.0,4.0,110.0,94.8,30.0,83.3,4.0,26.0,4,2.22,0.0,15.0,8.0,11.0,9.0,41.0
85,Edson Álvarez,26.4,0.07,20,91,26,27.0,64.3,2227.0,27.0,401.0,90.1,87.0,65.9,12.0,87.0,9,1.21,0.08,80.0,39.0,31.0,42.0,164.0
87,Hugo Álvarez,8.5,0.3,36,34,80,16.0,55.2,1108.0,15.0,117.0,87.3,8.0,29.6,19.0,19.0,1,3.66,0.24,14.0,9.0,9.0,5.0,47.0
126,Oliver Arblaster,10.5,0.07,13,43,4,17.0,68.0,634.0,12.0,146.0,88.0,26.0,52.0,11.0,37.0,2,2.2,0.19,23.0,14.0,7.0,15.0,52.0
146,Kristjan Asllani,8.7,0.18,5,32,8,3.0,60.0,1030.0,7.0,284.0,94.0,78.0,65.0,8.0,47.0,4,2.18,0.57,7.0,2.0,2.0,11.0,57.0
153,Valentin Atangana Edoa,6.2,0.1,10,21,17,2.0,25.0,424.0,11.0,90.0,92.8,10.0,83.3,2.0,14.0,0,0.97,0.0,19.0,13.0,5.0,6.0,28.0


## Creating Stats per 90 variables for more important stat categories and incorporating them in the position dataframe

In [31]:
#Per 90 conversion for important variables
cenmf_df['Prg_Carr_90'] = cenmf_df['Prg_Carr']/cenmf_df['No_90s']
cenmf_df['Prg_Pass_90'] = cenmf_df['Prg_Pass']/cenmf_df['No_90s']
cenmf_df['Prg_Pass_Rec_90'] = cenmf_df['Prg_Pass_Rec']/cenmf_df['No_90s']
cenmf_df['Take-Ons_Succ_90'] = cenmf_df['Take-Ons_Succ']/cenmf_df['No_90s']
cenmf_df['Carries_PrgDist_90'] = cenmf_df['Carries_PrgDist']/cenmf_df['No_90s']
cenmf_df['Carries_1/3_90'] = cenmf_df['Carries_1/3']/cenmf_df['No_90s']
cenmf_df['Medium_Cmp_90'] = cenmf_df['Medium_Cmp']/cenmf_df['No_90s']
cenmf_df['Long_Cmp_90'] = cenmf_df['Long_Cmp']/cenmf_df['No_90s']
cenmf_df['KP_90'] = cenmf_df['KP']/cenmf_df['No_90s']
cenmf_df['Pass_Fin_3rd_90'] = cenmf_df['Pass_Fin_3rd']/cenmf_df['No_90s']
cenmf_df['Pass Types_Sw_90'] = cenmf_df['Pass Types_Sw']/cenmf_df['No_90s']
cenmf_df['Tkls_90'] = cenmf_df['Tkls']/cenmf_df['No_90s']
cenmf_df['Tkls_Won_90'] = cenmf_df['Tkls_Won']/cenmf_df['No_90s']
cenmf_df['Drib_Tkl_90'] = cenmf_df['Drib_Tkl']/cenmf_df['No_90s']
cenmf_df['Int_90'] = cenmf_df['Int']/cenmf_df['No_90s']
cenmf_df['Ball_Recoveries_90'] = cenmf_df['Ball_Recoveries']/cenmf_df['No_90s']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cenmf_df['Prg_Carr_90'] = cenmf_df['Prg_Carr']/cenmf_df['No_90s']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cenmf_df['Prg_Pass_90'] = cenmf_df['Prg_Pass']/cenmf_df['No_90s']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cenmf_df['Prg_Pass_Rec_90'] = cenmf_df['Prg_Pass_Rec']/cenmf_df['No_90s']

In [32]:
#Finalizing dataframe for analysis
cenmf_df = cenmf_df[['Player','No_90s','npxG+xAG_90', 'Prg_Carr_90', 'Prg_Pass_90','Prg_Pass_Rec_90',
               'Take-Ons_Succ_90','Take-Ons_Succ%','Carries_PrgDist_90','Carries_1/3_90',
               'Medium_Cmp_90','Medium_Cmp%','Long_Cmp_90','Long_Cmp%','KP_90',
               'Pass_Fin_3rd_90','Pass Types_Sw_90','SCA_90','GCA_90','Tkls_90',
               'Tkls_Won_90','Drib_Tkl_90','Int_90','Ball_Recoveries_90']]

cenmf_df.head(10)

Unnamed: 0,Player,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
5,Salis Abdul Samed,16.9,0.08,0.532544,4.615385,1.183432,0.414201,36.8,99.585799,1.952663,19.526627,91.7,2.426036,75.9,0.35503,5.147929,0.059172,1.6,0.18,1.242604,0.828402,0.473373,0.710059,5.266272
28,Yacine Adli,15.6,0.16,1.923077,8.012821,1.666667,1.410256,68.8,137.435897,1.730769,31.217949,91.2,7.692308,67.8,1.410256,9.551282,1.025641,3.97,0.51,2.179487,1.089744,0.833333,1.282051,6.025641
35,Lucien Agoume,8.6,0.17,0.813953,4.069767,1.046512,0.813953,87.5,68.604651,0.348837,13.953488,88.9,3.604651,73.8,1.046512,4.186047,0.232558,2.43,0.12,2.674419,2.093023,1.046512,0.813953,5.581395
69,Carles Aleñá,11.3,0.25,1.061947,2.035398,3.982301,0.530973,21.4,35.575221,1.238938,10.088496,82.6,3.362832,64.4,1.59292,2.743363,0.353982,3.18,0.18,2.123894,0.707965,0.884956,0.353982,4.690265
84,Sergi Altimira,6.3,0.07,1.587302,4.920635,1.269841,0.952381,54.5,73.809524,0.634921,17.460317,94.8,4.761905,83.3,0.634921,4.126984,0.634921,2.22,0.0,2.380952,1.269841,1.746032,1.428571,6.507937
85,Edson Álvarez,26.4,0.07,0.757576,3.44697,0.984848,1.022727,64.3,84.356061,1.022727,15.189394,90.1,3.295455,65.9,0.454545,3.295455,0.340909,1.21,0.08,3.030303,1.477273,1.174242,1.590909,6.212121
87,Hugo Álvarez,8.5,0.3,4.235294,4.0,9.411765,1.882353,55.2,130.352941,1.764706,13.764706,87.3,0.941176,29.6,2.235294,2.235294,0.117647,3.66,0.24,1.647059,1.058824,1.058824,0.588235,5.529412
126,Oliver Arblaster,10.5,0.07,1.238095,4.095238,0.380952,1.619048,68.0,60.380952,1.142857,13.904762,88.0,2.47619,52.0,1.047619,3.52381,0.190476,2.2,0.19,2.190476,1.333333,0.666667,1.428571,4.952381
146,Kristjan Asllani,8.7,0.18,0.574713,3.678161,0.91954,0.344828,60.0,118.390805,0.804598,32.643678,94.0,8.965517,65.0,0.91954,5.402299,0.45977,2.18,0.57,0.804598,0.229885,0.229885,1.264368,6.551724
153,Valentin Atangana Edoa,6.2,0.1,1.612903,3.387097,2.741935,0.322581,25.0,68.387097,1.774194,14.516129,92.8,1.612903,83.3,0.322581,2.258065,0.0,0.97,0.0,3.064516,2.096774,0.806452,0.967742,4.516129


In [None]:
cenmf_df.shape

In [None]:
# cenmf_df_3d = cenmf_df.copy()

In [18]:
from sklearn import preprocessing

player_names = cenmf_df['Player'].tolist()

cenmf_df = cenmf_df.drop(['Player'], axis = 1)

x =  cenmf_df.values
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
X_norm = pd.DataFrame(x_scaled)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
reduced = pd.DataFrame(pca.fit_transform(X_norm))
reduced.head()

In [None]:

from sklearn.cluster import KMeans

wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 101)
    kmeans.fit(reduced)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.xlabel('Number of clusters (K)')
plt.ylabel('WCSS')


In [None]:
# Let's try K=6

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=6)
kmeans = kmeans.fit(reduced)

labels = kmeans.predict(reduced)
clusters = kmeans.labels_.tolist()

In [None]:
reduced['cluster'] = clusters
reduced['name'] = player_names
reduced.columns = ['x','y','cluster','name']
reduced.head()

In [None]:
names = ['Rodri', 'Declan Rice', 'Conor Gallagher','Paulinha']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set(style="white")

ax = sns.lmplot(x="x", y="y", hue='cluster', data = reduced, legend=False,
                   fit_reg=False, scatter_kws={"s": 10})

# texts = []
# for i in names:
#     for x, y, s in zip(reduced.x, reduced.y, reduced.name):
#         texts.append(plt.text(i, x, y, s,))

# for i, name in enumerate(reduced.names):
#     ax.text(x[i], y[i], name)

ax.set(ylim=(-1.5, 1.5))

# for i in range(len(names)
               
# ax.annotate(names in reduced['names'])

# plt.tick_params(labelsize=7)
# plt.xlabel("PC 1", fontsize = 14)
# plt.ylabel("PC 2", fontsize = 14)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.legend()
plt.title('KMeans clustering - Central Midfielders',size=20)

# s ="@ricardoandreom\n"
# date = datetime.today()
# d = str(date.strftime('%Y-%m-%d'))
# plt.text(-1.1,-2.3, s, fontdict=None, fontsize=12, fontweight='heavy')
# plt.text(-1.1,-2.35, d, fontdict=None, fontsize=12, fontweight='heavy')

# plt.text(-1.1,-2.3, fontdict=None, fontsize=7)
# plt.text(-1.1,-2.35,fontdict=None, fontsize=7)
# date = datetime.today()

plt.show()

In [None]:
reduced.loc('Rodri')

In [None]:
### Player recommendation system

In [None]:
reduced1 = reduced.copy()

In [None]:
reduced1.head()

In [None]:
reduced1 = reduced1.rename(columns={'name':'Player'})

In [None]:
reduced1.columns

In [None]:
df.columns

In [None]:
# reduced1 = pd.merge(df[['Player', 'Squad', 'League', 'Age', 'No_90s', 'Nation']],reduced1, on='Player', how='left')


In [None]:
#Cosine similarity

In [19]:
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
reduced1.head()

NameError: name 'reduced1' is not defined

In [None]:
#Adding index

reduced1.reset_index(inplace=True)
reduced1.head()

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# x = reduced1.values

# scaler = MinMaxScaler()
# reduced1 = scaler.fit_transform(reduced1)

# reduced1.head()

In [None]:
#Model Building

#Cosine Similarity

In [None]:
player_names = reduced1['Player'].tolist()

x = reduced1.drop(['Player'], axis = 1)

x.head()

In [None]:


similarity_score = cosine_similarity(x)
print(similarity[:10])

In [None]:
player_name=input(' Enter player name : ')

In [None]:
print(player_names[:15])

In [None]:
#find close match
find_close_match = difflib.get_close_matches(player_name, player_names, cutoff=0.4)

In [None]:
print(find_close_match)

In [None]:
close_match=find_close_match[0]

In [None]:
print(close_match)

In [None]:
#Find index of the player based on player name

index_of_player = reduced1[reduced1['Player']==close_match]['index'].values[0]
print(index_of_player)

In [None]:
# Filtering the players that have high similarity scores

sorted_similar_players=sorted(similarity_score,key=lambda x:x[1], reverse=True)
print(sorted_similar_players[:10])


In [None]:
#Print the name of similar players

print('Similar players : \n')

i=1

for player in sorted_similar_players:
    index=player[0]
    player_from_index=reduced1[reduced1['index']==index]['Player'].values[0]
    if (i<11):
        print(i,' ',player_from_index)
        i+=1

In [None]:

player_name=input(' Enter player name : ')

player_names = reduced1['Player'].tolist()

find_close_match = difflib.get_close_matches(player_name, player_names, cutoff=0.4)

close_match=find_close_match[0]

index_of_player = reduced1[reduced1['Player']==close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_player]))

sorted_similar_players=sorted(similarity_score,key=lambda x:x[1], reverse=True)

print('Similar players : \n')

i=1

for player in sorted_similar_players:
    index=player[0]
    player_from_index=reduced1[reduced1['index']==index]['Player'].values[0]
    if (i<11):
        print(i,' ',player_from_index)
        i+=1

In [None]:
### Without PCA

In [33]:
cenmf_df.reset_index(inplace=True)

In [34]:
cenmf_df.head()

Unnamed: 0,index,Player,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
0,5,Salis Abdul Samed,16.9,0.08,0.532544,4.615385,1.183432,0.414201,36.8,99.585799,1.952663,19.526627,91.7,2.426036,75.9,0.35503,5.147929,0.059172,1.6,0.18,1.242604,0.828402,0.473373,0.710059,5.266272
1,28,Yacine Adli,15.6,0.16,1.923077,8.012821,1.666667,1.410256,68.8,137.435897,1.730769,31.217949,91.2,7.692308,67.8,1.410256,9.551282,1.025641,3.97,0.51,2.179487,1.089744,0.833333,1.282051,6.025641
2,35,Lucien Agoume,8.6,0.17,0.813953,4.069767,1.046512,0.813953,87.5,68.604651,0.348837,13.953488,88.9,3.604651,73.8,1.046512,4.186047,0.232558,2.43,0.12,2.674419,2.093023,1.046512,0.813953,5.581395
3,69,Carles Aleñá,11.3,0.25,1.061947,2.035398,3.982301,0.530973,21.4,35.575221,1.238938,10.088496,82.6,3.362832,64.4,1.59292,2.743363,0.353982,3.18,0.18,2.123894,0.707965,0.884956,0.353982,4.690265
4,84,Sergi Altimira,6.3,0.07,1.587302,4.920635,1.269841,0.952381,54.5,73.809524,0.634921,17.460317,94.8,4.761905,83.3,0.634921,4.126984,0.634921,2.22,0.0,2.380952,1.269841,1.746032,1.428571,6.507937


In [35]:
cenmf_df = cenmf_df.drop('index', axis=1)

In [36]:
cenmf_df.head()

Unnamed: 0,Player,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
0,Salis Abdul Samed,16.9,0.08,0.532544,4.615385,1.183432,0.414201,36.8,99.585799,1.952663,19.526627,91.7,2.426036,75.9,0.35503,5.147929,0.059172,1.6,0.18,1.242604,0.828402,0.473373,0.710059,5.266272
1,Yacine Adli,15.6,0.16,1.923077,8.012821,1.666667,1.410256,68.8,137.435897,1.730769,31.217949,91.2,7.692308,67.8,1.410256,9.551282,1.025641,3.97,0.51,2.179487,1.089744,0.833333,1.282051,6.025641
2,Lucien Agoume,8.6,0.17,0.813953,4.069767,1.046512,0.813953,87.5,68.604651,0.348837,13.953488,88.9,3.604651,73.8,1.046512,4.186047,0.232558,2.43,0.12,2.674419,2.093023,1.046512,0.813953,5.581395
3,Carles Aleñá,11.3,0.25,1.061947,2.035398,3.982301,0.530973,21.4,35.575221,1.238938,10.088496,82.6,3.362832,64.4,1.59292,2.743363,0.353982,3.18,0.18,2.123894,0.707965,0.884956,0.353982,4.690265
4,Sergi Altimira,6.3,0.07,1.587302,4.920635,1.269841,0.952381,54.5,73.809524,0.634921,17.460317,94.8,4.761905,83.3,0.634921,4.126984,0.634921,2.22,0.0,2.380952,1.269841,1.746032,1.428571,6.507937


In [37]:
cenmf_df = cenmf_df.rename(columns={'level_0':'index'})

In [38]:
cenmf_df.reset_index(inplace=True)

In [39]:
cenmf_df.head()

Unnamed: 0,index,Player,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
0,0,Salis Abdul Samed,16.9,0.08,0.532544,4.615385,1.183432,0.414201,36.8,99.585799,1.952663,19.526627,91.7,2.426036,75.9,0.35503,5.147929,0.059172,1.6,0.18,1.242604,0.828402,0.473373,0.710059,5.266272
1,1,Yacine Adli,15.6,0.16,1.923077,8.012821,1.666667,1.410256,68.8,137.435897,1.730769,31.217949,91.2,7.692308,67.8,1.410256,9.551282,1.025641,3.97,0.51,2.179487,1.089744,0.833333,1.282051,6.025641
2,2,Lucien Agoume,8.6,0.17,0.813953,4.069767,1.046512,0.813953,87.5,68.604651,0.348837,13.953488,88.9,3.604651,73.8,1.046512,4.186047,0.232558,2.43,0.12,2.674419,2.093023,1.046512,0.813953,5.581395
3,3,Carles Aleñá,11.3,0.25,1.061947,2.035398,3.982301,0.530973,21.4,35.575221,1.238938,10.088496,82.6,3.362832,64.4,1.59292,2.743363,0.353982,3.18,0.18,2.123894,0.707965,0.884956,0.353982,4.690265
4,4,Sergi Altimira,6.3,0.07,1.587302,4.920635,1.269841,0.952381,54.5,73.809524,0.634921,17.460317,94.8,4.761905,83.3,0.634921,4.126984,0.634921,2.22,0.0,2.380952,1.269841,1.746032,1.428571,6.507937


In [40]:
# Normalization

from sklearn.preprocessing import MinMaxScaler

player_names = cenmf_df['Player'].tolist()

x = cenmf_df.drop(['Player'], axis = 1)



In [41]:
x.head()

Unnamed: 0,index,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
0,0,16.9,0.08,0.532544,4.615385,1.183432,0.414201,36.8,99.585799,1.952663,19.526627,91.7,2.426036,75.9,0.35503,5.147929,0.059172,1.6,0.18,1.242604,0.828402,0.473373,0.710059,5.266272
1,1,15.6,0.16,1.923077,8.012821,1.666667,1.410256,68.8,137.435897,1.730769,31.217949,91.2,7.692308,67.8,1.410256,9.551282,1.025641,3.97,0.51,2.179487,1.089744,0.833333,1.282051,6.025641
2,2,8.6,0.17,0.813953,4.069767,1.046512,0.813953,87.5,68.604651,0.348837,13.953488,88.9,3.604651,73.8,1.046512,4.186047,0.232558,2.43,0.12,2.674419,2.093023,1.046512,0.813953,5.581395
3,3,11.3,0.25,1.061947,2.035398,3.982301,0.530973,21.4,35.575221,1.238938,10.088496,82.6,3.362832,64.4,1.59292,2.743363,0.353982,3.18,0.18,2.123894,0.707965,0.884956,0.353982,4.690265
4,4,6.3,0.07,1.587302,4.920635,1.269841,0.952381,54.5,73.809524,0.634921,17.460317,94.8,4.761905,83.3,0.634921,4.126984,0.634921,2.22,0.0,2.380952,1.269841,1.746032,1.428571,6.507937


In [42]:
x = x.values

scaler = MinMaxScaler()
df_norm = scaler.fit_transform(x)
df_norm = pd.DataFrame(df_norm)
# df_norm.head()

In [43]:
df_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,0.0,0.290244,0.094118,0.099525,0.234701,0.089429,0.138417,0.368,0.355791,0.41386,0.488914,0.857494,0.170315,0.772955,0.072371,0.373764,0.01984,0.135965,0.153846,0.132705,0.254965,0.201622,0.254438,0.411989
1,0.00241,0.258537,0.188235,0.359395,0.502982,0.131708,0.471276,0.688,0.512734,0.364804,0.901478,0.845209,0.655371,0.63773,0.287475,0.760554,0.343891,0.395833,0.435897,0.37778,0.354847,0.354938,0.459402,0.50905
2,0.004819,0.087805,0.2,0.152116,0.191616,0.07745,0.272005,0.875,0.22733,0.059291,0.29225,0.788698,0.278872,0.737896,0.213327,0.289273,0.077975,0.226974,0.102564,0.507246,0.738292,0.445736,0.291667,0.452267
3,0.007229,0.153659,0.294118,0.198462,0.030971,0.334308,0.177439,0.214,0.090376,0.256072,0.155862,0.633907,0.256599,0.580968,0.324711,0.162548,0.118688,0.309211,0.153846,0.363237,0.208934,0.376926,0.126844,0.338365
4,0.009639,0.031707,0.082353,0.296643,0.258805,0.096989,0.318264,0.545,0.248912,0.122537,0.415999,0.933661,0.385463,0.896494,0.129426,0.284085,0.212885,0.203947,0.0,0.43048,0.423679,0.74368,0.511905,0.570696


In [44]:
similarity = cosine_similarity(df_norm)
print(similarity[:10])

[[1.         0.90060284 0.85821416 ... 0.84676497 0.77919244 0.83935495]
 [0.90060284 1.         0.85785452 ... 0.8427954  0.79679128 0.82101827]
 [0.85821416 0.85785452 1.         ... 0.79532646 0.84897506 0.80940246]
 ...
 [0.85748949 0.89187037 0.9355973  ... 0.78289908 0.83873895 0.80936243]
 [0.87160016 0.9186496  0.76545897 ... 0.79803039 0.72609834 0.77143609]
 [0.87256604 0.76503378 0.89397765 ... 0.78370362 0.76977071 0.81884159]]


In [66]:
player_name=input(' Enter player name : ')

player_names = cenmf_df['Player'].tolist()

find_close_match = difflib.get_close_matches(player_name, player_names, cutoff=0.4)

close_match=find_close_match[0]

index_of_player = cenmf_df[cenmf_df['Player']==close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_player]))

sorted_similar_players=sorted(similarity_score,key=lambda x:x[1], reverse=True)

print('Similar players : \n')

i=1

for player in sorted_similar_players:
    index=player[0]
    player_from_index=cenmf_df[cenmf_df['index']==index]['Player'].values[0]
    # sim_score_index = similarity_score.values[0]
    if (i<11):
        print(i,' ',player_from_index,' ',round(similarity_score[index][1]*100, 1),'%')
        i+=1

 Enter player name :  Conor Gallagher


Similar players : 

1   Conor Gallagher   100.0 %
2   James Garner   97.4 %
3   Maxence Caqueret   97.4 %
4   Matheus Henrique   97.3 %
5   Bruno Guimarães   97.0 %
6   Hugo Magnetti   96.9 %
7   Yangel Herrera   96.9 %
8   Douglas Luiz   96.6 %
9   Vitaly Janelt   96.6 %
10   Fran Beltrán   96.4 %


Unnamed: 0,index,Player,No_90s,npxG+xAG_90,Prg_Carr_90,Prg_Pass_90,Prg_Pass_Rec_90,Take-Ons_Succ_90,Take-Ons_Succ%,Carries_PrgDist_90,Carries_1/3_90,Medium_Cmp_90,Medium_Cmp%,Long_Cmp_90,Long_Cmp%,KP_90,Pass_Fin_3rd_90,Pass Types_Sw_90,SCA_90,GCA_90,Tkls_90,Tkls_Won_90,Drib_Tkl_90,Int_90,Ball_Recoveries_90
