In [1]:
import chromadb
import pandas as pd
from sklearn.preprocessing import RobustScaler

import utils


In [2]:
# Relative path to the directory holding the cleaned fbref data files.
PATH_TO_FILES = "../data/fbref/cleaned/"

# NOTE(review): presumably returns a mapping of table name -> DataFrame
# (it is indexed by "shooting" below) — confirm against utils.get_all_attributes.
dataframes = utils.get_all_attributes(PATH_TO_FILES)
# The "shooting" table is the frame every later cell works on.
df = dataframes["shooting"]

In [3]:
# Inspect which columns the shooting table provides.
df.columns

Index(['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age', '90s',
       'Gls', 'Sh', 'SoT', 'SoT%', 'Sh/90', 'SoT/90', 'G/Sh', 'G/SoT', 'Dist',
       'PK', 'PKatt', 'FK', 'xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG'],
      dtype='object')

### Vector database (VDB) test

In [4]:
# Descriptive (non-statistical) columns that identify each row of the shooting table.
player_info_columns = ['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age']
df_player_information = df[player_info_columns]


In [5]:
# Build the embedding vectors for the vector database from the numeric shooting stats.

# Vector-store ids are the string form of df's index labels, so they must be
# unique; fail here rather than inside the database call.
assert df.index.is_unique, "df index must be unique to be used as vector ids"
ids = df.index.astype(str).to_list()

# Drop the descriptive columns and keep only the statistics; missing values become 0.
raw_attributes = (
    df
    .drop(columns=['Season', 'League', 'Team', 'Player', 'Nation', 'Pos', 'Age'])
    .fillna(0)
)

# Scale every statistic with RobustScaler so that columns on very different
# scales (e.g. 'Sh' vs 'G/Sh') contribute comparably to the distance.
# To index the unscaled values instead, use raw_attributes.values.tolist().
scaler = RobustScaler()
scaled_data = scaler.fit_transform(raw_attributes)

# Keep df's index on the scaled frame so its rows stay aligned with `ids`
# (the bare DataFrame constructor would silently reset it to 0..n-1).
attributes = pd.DataFrame(
    scaled_data,
    columns=raw_attributes.columns,
    index=raw_attributes.index,
)
embeddings = attributes.values.tolist()

assert len(embeddings) == len(ids), "every id needs exactly one embedding"

In [6]:
# Client for the vector database (created with default settings).
chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection raises
# once a collection with this name already exists on the client.
# "hnsw:space": "cosine" makes queries return cosine distances.
collection = chroma_client.get_or_create_collection(
    name="player-vector-database",
    metadata={"hnsw:space": "cosine"},
)

In [7]:
# Store one embedding per player row under its string id.
# upsert (rather than add) keeps the cell re-runnable: ids that are already in
# the collection are overwritten with the current embeddings instead of being
# rejected or ignored, so the index never holds stale vectors.
collection.upsert(
    ids=ids,
    embeddings=embeddings,
)

In [8]:
# Custom target: positional row number of the player used as the query.
target_id = 633

# Scaled attribute row of that player, flattened to a plain list of floats.
target = attributes.iloc[target_id]
target_embedding = target.to_numpy().tolist()

# Show the player's original record from df for reference.
display(df.iloc[target_id])


Season              2022-2023
League             Bundesliga
Team               RB Leipzig
Player     Christopher Nkunku
Nation                    FRA
Pos                     FW,MF
Age                      24.0
90s                      21.1
Gls                      16.0
Sh                       70.0
SoT                      25.0
SoT%                     35.7
Sh/90                    3.32
SoT/90                   1.19
G/Sh                     0.19
G/SoT                    0.52
Dist                     15.9
PK                        3.0
PKatt                     4.0
FK                        7.0
xG                       14.4
npxG                     11.3
npxG/Sh                  0.16
G-xG                      1.6
np:G-xG                   1.7
Name: 633, dtype: object

In [9]:
# Ask the collection for the 10 stored vectors closest to the target embedding.
query_player = target_embedding
results = collection.query(query_embeddings=query_player, n_results=10)

# Only one query embedding was sent, so the hits are in the first entry of 'ids'.
ids_per_query = results.get('ids')
query_idx = list(ids_per_query[0])

In [10]:
# Turn the raw query result into a readable table of the matched players.

# Distances of the hits for the single query embedding (cosine distance, as
# configured on the collection); smaller means more similar.
distances = results['distances'][0]
# Cosine similarity expressed as a percentage: (1 - distance) * 100.
similarity_percentage = [(1 - dist) * 100 for dist in distances]

# The ids come back as strings; cast them to int so they line up with df's index.
# The raw values are labelled 'distance' because they are distances, not similarities.
result_dic = {
    'id': [int(hit_id) for hit_id in query_idx],
    'similarity percentage': similarity_percentage,
    'distance': distances,
}
result_df = pd.DataFrame(result_dic).set_index('id')

# Right join keeps exactly the matched ids and attaches each player's full record.
merged_data = pd.merge(df, result_df, left_index=True, right_index=True, how='right')
# Closest match first; the first row is normally the query player itself.
merged_data.sort_values('distance', ascending=True)

Unnamed: 0_level_0,Season,League,Team,Player,Nation,Pos,Age,90s,Gls,Sh,...,PK,PKatt,FK,xG,npxG,npxG/Sh,G-xG,np:G-xG,similarity percentage,similarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
633,2022-2023,Bundesliga,RB Leipzig,Christopher Nkunku,FRA,"FW,MF",24.0,21.1,16.0,70.0,...,3.0,4.0,7.0,14.4,11.3,0.16,1.6,1.7,99.999988,1.192093e-07
1738,2022-2023,Ligue1,Marseille,Alexis Sánchez,CHI,"FW,MF",33.0,29.8,14.0,63.0,...,2.0,3.0,5.0,12.6,10.2,0.17,1.4,1.8,99.016607,0.009833932
1851,2022-2023,Ligue1,Lyon,Alexandre Lacazette,FRA,FW,31.0,32.5,27.0,107.0,...,6.0,8.0,8.0,24.2,17.9,0.17,2.8,3.1,98.495114,0.01504886
2456,2022-2023,SeriaA,Juventus,Dušan Vlahović,SRB,FW,22.0,21.4,10.0,65.0,...,2.0,3.0,4.0,9.7,7.3,0.11,0.3,0.7,98.426652,0.01573348
1437,2022-2023,LaLiga,Celta Vigo,Iago Aspas,ESP,FW,34.0,31.9,12.0,75.0,...,2.0,3.0,7.0,11.1,8.7,0.12,0.9,1.3,98.395449,0.01604551
220,2022-2023,EPL,Brentford,Ivan Toney,ENG,FW,26.0,32.8,20.0,87.0,...,6.0,7.0,9.0,18.7,13.2,0.15,1.3,0.8,97.897023,0.02102977
821,2022-2023,Bundesliga,M'Gladbach,Ramy Bensebaini,ALG,DF,27.0,27.6,6.0,39.0,...,2.0,2.0,3.0,5.3,3.7,0.1,0.7,0.3,96.908873,0.03091127
3104,2022-2023,EreDivisie,Heerenveen,Sydney van Hooijdonk,NED,FW,22.0,28.9,16.0,90.0,...,1.0,1.0,7.0,14.0,13.1,0.15,2.0,1.9,96.803904,0.03196096
961,2022-2023,Bundesliga,Augsburg,Mergim Berisha,GER,FW,24.0,18.7,9.0,51.0,...,3.0,4.0,6.0,8.1,5.0,0.1,0.9,1.0,96.101236,0.03898764
2368,2022-2023,SeriaA,Milan,Olivier Giroud,FRA,FW,35.0,23.8,13.0,77.0,...,3.0,3.0,6.0,12.7,10.4,0.14,0.3,-0.4,96.095943,0.03904057


In [11]:
# Scratch check: 1 - distance for a distance of 0.007324.
# NOTE(review): this value does not appear in the results table shown in this
# notebook; presumably left over from an earlier run — consider removing the cell.
1 - 0.007324

0.992676