Imports

In [1]:
import pandas as pd 
import utils
import chromadb

In [2]:
# Relative path to the cleaned FBref data directory handed to the loader below.
PATH_TO_FILES = "../data/fbref/cleaned/"
# Project helper; later cells index its result by table name
# ("shooting", "possession", "goal_and_shot_creation") and treat each value
# as a pandas DataFrame.
dataframes = utils.get_all_attributes(PATH_TO_FILES)

Query similar players by one characteristic

In [3]:
# Build an in-memory Chroma collection from the shooting table and select the
# reference player whose nearest neighbours are queried in the next cell.
df = dataframes["shooting"]

# Identifying columns: kept for display, excluded from the feature vectors.
info_columns = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']
df_player_information = df[info_columns]

# Chroma ids are strings; the index labels are used so that query hits can be
# joined back onto `df` by index afterwards.
ids = df.index.astype(str).to_list()
# Every remaining column is one dimension of a player's vector; missing
# values are replaced by 0.
# NOTE(review): values are used unscaled, so large-magnitude columns presumably
# dominate the cosine distance - confirm this is intended.
attributes = df.drop(columns=info_columns).fillna(0)
embeddings = attributes.values.tolist()

chroma_client = chromadb.Client()
# get_or_create_collection + upsert keep this cell re-runnable in the same
# kernel: create_collection raises if "player-db-1" already exists.
collection = chroma_client.get_or_create_collection(
    name="player-db-1",
    metadata={"hnsw:space": "cosine"},  # rank neighbours by cosine distance
)
collection.upsert(
    embeddings=embeddings,
    ids=ids,
)

# Row position (used with iloc) of the reference player.
target_id = 633

target = attributes.iloc[target_id]
target_embedding = target.values.tolist()

display(df.iloc[target_id])

Season              2022-2023
League             Bundesliga
Team               RB Leipzig
Player     Christopher Nkunku
Nation                    FRA
Pos                     FW,MF
Age                      24.0
90s                      21.1
Gls                      16.0
Sh                       70.0
SoT                      25.0
SoT%                     35.7
Sh/90                    3.32
SoT/90                   1.19
G/Sh                     0.19
G/SoT                    0.52
Dist                     15.9
PK                        3.0
PKatt                     4.0
FK                        7.0
xG                       14.4
npxG                     11.3
npxG/Sh                  0.16
G-xG                      1.6
np:G-xG                   1.7
Name: 633, dtype: object

In [4]:
# Ask the collection for the 20 stored vectors closest to the target vector.
query_player = target_embedding

results = collection.query(query_embeddings=query_player, n_results=20)

# Results are nested per query; the first entry belongs to the vector above.
query_idx = list(results.get('ids')[0])
distances = results['distances'][0]

# (1 - distance) * 100 for every returned neighbour.
similarity_percentage = [(1 - distance) * 100 for distance in distances]

result_dic = {
    'id': query_idx,
    'similarity percentage': similarity_percentage,
    'similarity': distances,  # raw distance values as returned by the query
}
# Ids come back as strings; cast them to int so they line up with df's index.
result_df = (
    pd.DataFrame(result_dic)
    .astype({'id': int})
    .set_index('id')
)

# Right join on the index keeps exactly the query hits, enriched with the
# player rows; the smallest distance is listed first.
merged_data = df.merge(result_df, left_index=True, right_index=True, how='right')
merged_data.sort_values('similarity', ascending=True).head(20)

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,90s,Gls,Sh,...,PK,PKatt,FK,xG,npxG,npxG/Sh,G-xG,np:G-xG,similarity percentage,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
633,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,21.1,16.0,70.0,...,3.0,4.0,7.0,14.4,11.3,0.16,1.6,1.7,99.999994,5.960464e-08
2368,2022-2023,SeriaA,Milan,Olivier Giroud,FRA,FW,35.0,23.8,13.0,77.0,...,3.0,3.0,6.0,12.7,10.4,0.14,0.3,-0.4,99.63879,0.003612101
3673,2022-2023,PrimeiraLiga,Boavista,Yusupha Njie,GAM,FW,28.0,23.6,13.0,71.0,...,1.0,3.0,1.0,13.1,10.7,0.16,-0.1,1.3,99.589998,0.004100025
220,2022-2023,EPL,Brentford,Ivan Toney,ENG,FW,26.0,32.8,20.0,87.0,...,6.0,7.0,9.0,18.7,13.2,0.15,1.3,0.8,99.544567,0.004554331
2456,2022-2023,SeriaA,Juventus,Dušan Vlahović,SRB,FW,22.0,21.4,10.0,65.0,...,2.0,3.0,4.0,9.7,7.3,0.11,0.3,0.7,99.503601,0.004963994
3104,2022-2023,EreDivisie,Heerenveen,Sydney van Hooijdonk,NED,FW,22.0,28.9,16.0,90.0,...,1.0,1.0,7.0,14.0,13.1,0.15,2.0,1.9,99.388719,0.006112814
3448,2022-2023,PrimeiraLiga,Porto,Mehdi Taremi,IRN,FW,30.0,29.0,22.0,88.0,...,7.0,8.0,3.0,20.1,13.8,0.16,1.9,1.2,99.37945,0.006205499
2919,2022-2023,EreDivisie,PSV Eindhoven,Xavi Simons,NED,"FW,MF",19.0,31.1,19.0,91.0,...,2.0,2.0,0.0,14.9,13.3,0.15,4.1,3.7,99.2688,0.007312
2095,2022-2023,Ligue1,Strasbourg,Habib Diallo,SEN,FW,27.0,32.0,20.0,90.0,...,3.0,4.0,1.0,17.8,14.5,0.17,2.2,2.5,99.266559,0.007334411
736,2022-2023,Bundesliga,Eint Frankfurt,Randal Kolo Muani,FRA,FW,23.0,29.2,15.0,75.0,...,2.0,2.0,0.0,13.2,11.6,0.16,1.8,1.4,99.242407,0.007575929


Query similar players by two characteristics

In [5]:
# Build an in-memory Chroma collection from the combined shooting and
# possession tables and select the reference player for the next cell.
df1 = dataframes["shooting"]
df2 = dataframes["possession"]

# Columns shared by both tables; pd.merge defaults to an inner join, so only
# rows whose key values match in both tables are kept.
merge_keys = ['Player', 'Season', 'League', 'Team', 'Nation', 'Pos', 'Age', '90s']
df = pd.merge(df1, df2, on=merge_keys)

# Identifying columns: kept for display, excluded from the feature vectors.
info_columns = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']
df_player_information = df[info_columns]

# Chroma ids are strings; the index labels are used so that query hits can be
# joined back onto `df` by index afterwards.
ids = df.index.astype(str).to_list()
# Every remaining column is one dimension of a player's vector; missing
# values are replaced by 0.
# NOTE(review): values are used unscaled, so large-magnitude columns presumably
# dominate the cosine distance - confirm this is intended.
attributes = df.drop(columns=info_columns).fillna(0)
embeddings = attributes.values.tolist()

chroma_client = chromadb.Client()
# get_or_create_collection + upsert keep this cell re-runnable in the same
# kernel: create_collection raises if "player-db-2" already exists.
collection = chroma_client.get_or_create_collection(
    name="player-db-2",
    metadata={"hnsw:space": "cosine"},  # rank neighbours by cosine distance
)
collection.upsert(
    embeddings=embeddings,
    ids=ids,
)

# Row position (used with iloc) of the reference player.
target_id = 633

target = attributes.iloc[target_id]
target_embedding = target.values.tolist()

display(df.iloc[target_id])

Season                      2022-2023
League                     Bundesliga
Team                       RB Leipzig
Player             Christopher Nkunku
Nation                            FRA
Pos                             FW,MF
Age                              24.0
90s                              21.1
Gls                              16.0
Sh                               70.0
SoT                              25.0
SoT%                             35.7
Sh/90                            3.32
SoT/90                           1.19
G/Sh                             0.19
G/SoT                            0.52
Dist                             15.9
PK                                3.0
PKatt                             4.0
FK                                7.0
xG                               14.4
npxG                             11.3
npxG/Sh                          0.16
G-xG                              1.6
np:G-xG                           1.7
Touches_Touches                 854.0
Touches_Def 

In [6]:
# Nearest-neighbour lookup for the target vector in the current collection.
query_player = target_embedding

results = collection.query(
    n_results=20,
    query_embeddings=query_player,
)

# Results are nested per query; take the entries of the first query.
query_idx = list(results.get('ids')[0])
distances = results['distances'][0]

# Express each distance as a percentage: (1 - distance) * 100.
similarity_percentage = [(1 - d) * 100 for d in distances]

result_dic = {
    'id': query_idx,
    'similarity percentage': similarity_percentage,
    'similarity': distances,  # raw distance values as returned by the query
}
result_df = pd.DataFrame(result_dic)
# String ids -> int so the frame can be joined on df's index.
result_df = result_df.astype({'id': int}).set_index('id')

# Keep only the hits (right join) together with their player rows, ordered
# from smallest to largest distance.
merged_data = df.merge(result_df, how='right', left_index=True, right_index=True)
merged_data.sort_values('similarity', ascending=True).head(20)

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,90s,Gls,Sh,...,Carries_PrgDist,Carries_PrgC,Carries_1/3,Carries_CPA,Carries_Mis,Carries_Dis,Receiving_Rec,Receiving_PrgR,similarity percentage,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
633,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,21.1,16.0,70.0,...,1225.0,43.0,23.0,23.0,45.0,38.0,672.0,176.0,100.0,0.0
634,2022-2023,Bundesliga,RB Leipzig,André Silva,POR,FW,26.0,19.3,4.0,60.0,...,984.0,35.0,21.0,23.0,65.0,43.0,556.0,142.0,99.968803,0.000312
15,2022-2023,EPL,Manchester City,Julián Álvarez,ARG,"FW,MF",22.0,16.3,9.0,46.0,...,908.0,38.0,18.0,17.0,28.0,24.0,506.0,99.0,99.964392,0.000356
2633,2022-2023,SeriaA,Sassuolo,Domenico Berardi,ITA,FW,27.0,20.7,12.0,75.0,...,1586.0,57.0,29.0,31.0,60.0,23.0,905.0,213.0,99.953943,0.000461
1380,2022-2023,LaLiga,Rayo Vallecano,Sergio Camello,ESP,FW,21.0,26.7,6.0,76.0,...,1053.0,40.0,36.0,17.0,71.0,35.0,561.0,122.0,99.945623,0.000544
186,2022-2023,EPL,Tottenham,Harry Kane,ENG,FW,29.0,37.8,30.0,124.0,...,1842.0,60.0,50.0,21.0,74.0,58.0,1031.0,193.0,99.938196,0.000618
34,2022-2023,EPL,Arsenal,Gabriel Jesus,BRA,FW,25.0,22.9,11.0,76.0,...,1517.0,44.0,31.0,28.0,88.0,65.0,799.0,194.0,99.936557,0.000634
3857,2022-2023,PrimeiraLiga,Portimonense,Welinton Júnior,BRA,FW,29.0,22.9,6.0,93.0,...,1022.0,40.0,30.0,13.0,93.0,40.0,558.0,153.0,99.932992,0.00067
491,2022-2023,EPL,Leicester City,Kelechi Iheanacho,NGA,"FW,MF",25.0,12.5,5.0,38.0,...,724.0,33.0,13.0,18.0,41.0,36.0,380.0,99.0,99.931538,0.000685
2321,2022-2023,SeriaA,Lazio,Ciro Immobile,ITA,FW,32.0,24.7,12.0,72.0,...,1134.0,40.0,21.0,27.0,63.0,22.0,695.0,174.0,99.92612,0.000739


In [7]:
# Combine the three stat tables and compare column counts.
shooting = dataframes["shooting"]
possession = dataframes["possession"]
g_s_creation = dataframes["goal_and_shot_creation"]

# Key columns present in every table; they appear once in the merged frame.
merge_keys = ['Player', 'Season', 'League', 'Team', 'Nation', 'Pos', 'Age', '90s']
s_p_merge = shooting.merge(possession, on=merge_keys)
df = s_p_merge.merge(g_s_creation, on=merge_keys)

# (merged, shooting, possession, goal_and_shot_creation) column counts.
tuple(len(frame.columns) for frame in (df, shooting, possession, g_s_creation))

(63, 25, 30, 24)

Query similar players by three characteristics

In [8]:
from sklearn.preprocessing import RobustScaler, StandardScaler
# Build an in-memory Chroma collection from the combined shooting, possession
# and goal/shot-creation tables and select the reference player.
shooting = dataframes["shooting"]
possession = dataframes["possession"]
g_s_creation = dataframes["goal_and_shot_creation"]

# Columns shared by all tables; pd.merge defaults to an inner join, so only
# rows whose key values match in every table are kept.
merge_keys = ['Player', 'Season', 'League', 'Team', 'Nation', 'Pos', 'Age', '90s']
s_p_merge = pd.merge(shooting, possession, on=merge_keys)
df = pd.merge(s_p_merge, g_s_creation, on=merge_keys)

# Identifying columns: kept for display, excluded from the feature vectors.
info_columns = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']
df_player_information = df[info_columns]

# Chroma ids are strings; the index labels are used so that query hits can be
# joined back onto `df` by index afterwards.
ids = df.index.astype(str).to_list()
# Every remaining column is one dimension of a player's vector; missing
# values are replaced by 0.
attributes = df.drop(columns=info_columns).fillna(0)

# Default: raw, unscaled feature values.
embeddings = attributes.values.tolist()

# Alternative: standardise the features first (uses the StandardScaler import
# at the top of this cell). Passing the original index keeps ids aligned.
#scaler = StandardScaler()
#scaled_data  = scaler.fit_transform(attributes)
#attributes = pd.DataFrame(scaled_data, columns=attributes.columns, index=attributes.index)
#embeddings = attributes.values.tolist()

chroma_client = chromadb.Client()
# get_or_create_collection + upsert keep this cell re-runnable in the same
# kernel: create_collection raises if "player-db-3" already exists.
collection = chroma_client.get_or_create_collection(
    name="player-db-3",
    metadata={"hnsw:space": "cosine"},  # rank neighbours by cosine distance
)
collection.upsert(
    embeddings=embeddings,
    ids=ids,
)

# Row position (used with iloc) of the reference player.
target_id = 633

target = attributes.iloc[target_id]
target_embedding = target.values.tolist()

display(df.iloc[target_id])

Season                         2022-2023
League                        Bundesliga
Team                          RB Leipzig
Player                Christopher Nkunku
Nation                               FRA
                             ...        
GCA Types_PassDead                   0.0
GCA Types_TO                         3.0
GCA Types_Sh                         1.0
GCA Types_Fld                        3.0
GCA Types_Def                        0.0
Name: 633, Length: 63, dtype: object

In [9]:
# Query the collection for the 20 vectors nearest to the target player.
query_player = target_embedding

results = collection.query(query_embeddings=query_player, n_results=20)

# Results are nested per query; use the lists belonging to the first query.
query_idx = list(results.get('ids')[0])
distances = results['distances'][0]

# Percentage form of each distance: (1 - distance) * 100.
similarity_percentage = [(1 - value) * 100 for value in distances]

result_dic = {
    'id': query_idx,
    'similarity percentage': similarity_percentage,
    'similarity': distances,  # raw distance values as returned by the query
}
# Cast the string ids to int and index by them to match df's index.
result_df = pd.DataFrame(result_dic).astype({'id': int}).set_index('id')

# Right join: one row per hit with the player's columns attached; rows are
# shown in ascending order of distance.
merged_data = df.merge(result_df, left_index=True, right_index=True, how='right')
merged_data.sort_values('similarity', ascending=True).head(20)

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,90s,Gls,Sh,...,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,similarity percentage,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
633,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,21.1,16.0,70.0,...,14.0,0.66,7.0,0.0,3.0,1.0,3.0,0.0,100.000024,-2.384186e-07
1851,2022-2023,Ligue1,Lyon,Alexandre Lacazette,FRA,FW,31.0,32.5,27.0,107.0,...,11.0,0.34,8.0,0.0,0.0,0.0,3.0,0.0,89.014244,0.1098576
186,2022-2023,EPL,Tottenham,Harry Kane,ENG,FW,29.0,37.8,30.0,124.0,...,18.0,0.48,10.0,0.0,2.0,3.0,3.0,0.0,88.333374,0.1166663
736,2022-2023,Bundesliga,Eint Frankfurt,Randal Kolo Muani,FRA,FW,23.0,29.2,15.0,75.0,...,22.0,0.75,13.0,0.0,4.0,2.0,3.0,0.0,87.986583,0.1201342
1738,2022-2023,Ligue1,Marseille,Alexis Sánchez,CHI,"FW,MF",33.0,29.8,14.0,63.0,...,11.0,0.37,5.0,0.0,1.0,3.0,2.0,0.0,87.763685,0.1223632
2402,2022-2023,SeriaA,Atalanta,Ademola Lookman,NGA,"FW,MF",24.0,19.2,13.0,49.0,...,12.0,0.62,8.0,0.0,1.0,2.0,1.0,0.0,86.769909,0.1323009
87,2022-2023,EPL,Newcastle Utd,Alexander Isak,SWE,FW,22.0,16.9,10.0,49.0,...,5.0,0.3,1.0,0.0,2.0,1.0,1.0,0.0,86.406487,0.1359351
15,2022-2023,EPL,Manchester City,Julián Álvarez,ARG,"FW,MF",22.0,16.3,9.0,46.0,...,5.0,0.31,2.0,0.0,1.0,1.0,1.0,0.0,86.292255,0.1370775
3477,2022-2023,PrimeiraLiga,Braga,Iuri Medeiros,POR,"MF,FW",28.0,22.9,10.0,80.0,...,18.0,0.79,11.0,0.0,3.0,3.0,1.0,0.0,86.231893,0.1376811
220,2022-2023,EPL,Brentford,Ivan Toney,ENG,FW,26.0,32.8,20.0,87.0,...,8.0,0.24,3.0,0.0,1.0,0.0,4.0,0.0,85.941011,0.1405899


In [10]:
# Candidate source tables for the cells that follow.
df1 = dataframes["shooting"]
df2 = dataframes["possession"]

# Active choice: the shooting table only. The commented lines switch to the
# possession table or to the merge of both tables instead.
df = df1
#df = df2
#df = pd.merge(df1, df2, on=['Player', 'Season', 'League', 'Team', 'Nation', 'Pos', 'Age', '90s'])

### VDB Test

In [11]:
# Identifying columns of the selected table (the same columns are dropped from
# the feature matrix in the next cell).
df_player_information = df[['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']]


In [12]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# String ids derived from the index labels of the selected table.
ids = df.index.astype(str).tolist()

# Feature matrix: everything except the identifying columns, with missing
# values replaced by 0.
info_columns = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']
attributes = df.drop(columns=info_columns).fillna(0)

# Variant 1 (active): use the raw, unscaled attributes as vectors.
embeddings = attributes.values.tolist()

# Variant 2 (disabled): standardise the attributes before building vectors.
#scaler = StandardScaler()
#scaled_data  = scaler.fit_transform(attributes)
#attributes = pd.DataFrame(scaled_data, columns=attributes.columns)
#embeddings = attributes.values.tolist()
#len(embeddings), attributes.shape

In [13]:
import chromadb
chroma_client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable in the same kernel:
# create_collection raises if "player-vector-database" already exists.
# NOTE(review): a collection reused after switching `df` to a table with a
# different number of feature columns will presumably reject the new vectors -
# restart the kernel (or delete the collection) in that case.
collection = chroma_client.get_or_create_collection(
    name="player-vector-database",
    metadata={"hnsw:space": "cosine"},  # rank neighbours by cosine distance
)

In [14]:
# upsert instead of add: inserts new ids and overwrites existing ones, so the
# cell can be executed again without tripping over already-stored ids.
collection.upsert(
    embeddings=embeddings,
    ids=ids,
)

In [15]:
### custom target
# Row position (used with iloc) of the reference player.
target_id = 633

# Feature row of that player, converted to a plain list for the query.
target = attributes.iloc[target_id]
target_embedding = target.to_numpy().tolist()

# Show the full record of the selected player.
display(df.iloc[target_id])


Season              2022-2023
League             Bundesliga
Team               RB Leipzig
Player     Christopher Nkunku
Nation                    FRA
Pos                     FW,MF
Age                      24.0
90s                      21.1
Gls                      16.0
Sh                       70.0
SoT                      25.0
SoT%                     35.7
Sh/90                    3.32
SoT/90                   1.19
G/Sh                     0.19
G/SoT                    0.52
Dist                     15.9
PK                        3.0
PKatt                     4.0
FK                        7.0
xG                       14.4
npxG                     11.3
npxG/Sh                  0.16
G-xG                      1.6
np:G-xG                   1.7
Name: 633, dtype: object

In [16]:
# Ask the collection for the 20 stored vectors closest to the target vector.
query_player = target_embedding

results = collection.query(query_embeddings=query_player, n_results=20)

# Results are nested per query; keep the ids returned for the first query.
query_idx = [*results.get('ids')[0]]

In [17]:
# Distances returned for the first (only) query of the previous cell.
distances = results['distances'][0]
# Percentage form of each distance: (1 - distance) * 100.
similarity_percentage = [(1 - distance) * 100 for distance in distances]

result_dic = {
    'id': query_idx,
    'similarity percentage': similarity_percentage,
    'similarity': distances,  # raw distance values as returned by the query
}
# Ids are strings; cast to int and use them as index to match df's index.
result_df = (
    pd.DataFrame(result_dic)
    .astype({'id': int})
    .set_index('id')
)

# Right join keeps exactly the query hits with their player rows attached;
# rows are listed from smallest to largest distance.
merged_data = df.merge(result_df, left_index=True, right_index=True, how='right')
merged_data.sort_values('similarity', ascending=True).head(20)

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,90s,Gls,Sh,...,PK,PKatt,FK,xG,npxG,npxG/Sh,G-xG,np:G-xG,similarity percentage,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
633,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,21.1,16.0,70.0,...,3.0,4.0,7.0,14.4,11.3,0.16,1.6,1.7,99.999994,5.960464e-08
2368,2022-2023,SeriaA,Milan,Olivier Giroud,FRA,FW,35.0,23.8,13.0,77.0,...,3.0,3.0,6.0,12.7,10.4,0.14,0.3,-0.4,99.63879,0.003612101
3673,2022-2023,PrimeiraLiga,Boavista,Yusupha Njie,GAM,FW,28.0,23.6,13.0,71.0,...,1.0,3.0,1.0,13.1,10.7,0.16,-0.1,1.3,99.589998,0.004100025
220,2022-2023,EPL,Brentford,Ivan Toney,ENG,FW,26.0,32.8,20.0,87.0,...,6.0,7.0,9.0,18.7,13.2,0.15,1.3,0.8,99.544567,0.004554331
2456,2022-2023,SeriaA,Juventus,Dušan Vlahović,SRB,FW,22.0,21.4,10.0,65.0,...,2.0,3.0,4.0,9.7,7.3,0.11,0.3,0.7,99.503601,0.004963994
3104,2022-2023,EreDivisie,Heerenveen,Sydney van Hooijdonk,NED,FW,22.0,28.9,16.0,90.0,...,1.0,1.0,7.0,14.0,13.1,0.15,2.0,1.9,99.388719,0.006112814
3448,2022-2023,PrimeiraLiga,Porto,Mehdi Taremi,IRN,FW,30.0,29.0,22.0,88.0,...,7.0,8.0,3.0,20.1,13.8,0.16,1.9,1.2,99.37945,0.006205499
2919,2022-2023,EreDivisie,PSV Eindhoven,Xavi Simons,NED,"FW,MF",19.0,31.1,19.0,91.0,...,2.0,2.0,0.0,14.9,13.3,0.15,4.1,3.7,99.2688,0.007312
2095,2022-2023,Ligue1,Strasbourg,Habib Diallo,SEN,FW,27.0,32.0,20.0,90.0,...,3.0,4.0,1.0,17.8,14.5,0.17,2.2,2.5,99.266559,0.007334411
736,2022-2023,Bundesliga,Eint Frankfurt,Randal Kolo Muani,FRA,FW,23.0,29.2,15.0,75.0,...,2.0,2.0,0.0,13.2,11.6,0.16,1.8,1.4,99.242407,0.007575929
