In [1]:
import pandas as pd
import chromadb

#### Load
Preprocessed Data

In [2]:
df = pd.read_csv("../data/2024/player_attributes.csv", sep=';')
df_player_information = pd.read_csv("../data/2024/player_information.csv", sep=';')

display(df_player_information)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
0,231747,K. Mbappé,Paris Saint Germain,24,91,"ST, LW",Right,France,181.5 mio,230.0 k,5
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185.0 mio,340.0 k,5
2,192985,K. De Bruyne,Manchester City,32,91,"CM, CAM",Right,Belgium,103.0 mio,350.0 k,5
3,158023,L. Messi,Inter Miami,36,90,"CF, CAM",Left,Argentina,41.0 mio,23.0 k,5
4,165153,K. Benzema,Al Ittihad,35,90,"CF, ST",Right,France,51.0 mio,95.0 k,5
...,...,...,...,...,...,...,...,...,...,...,...
18258,273759,S. Telem,Jamshedpur,20,47,CB,Left,India,100.0 k,1499,1
18259,269013,Jin Liangkuan,Meizhou Hakka,21,47,CB,Right,China PR,100.0 k,1999,1
18260,272748,M. Mewlan,Shandong Taishan,19,47,ST,Right,China PR,100.0 k,2999,1
18261,269534,Yin Jie,Zhejiang,21,47,CM,Right,China PR,90.0 k,1999,1


#### Prepare Data and get embeddings

In [3]:
ids = df['player_id'].astype(str).tolist()
df_cleaned = df.drop(columns=['player_id', 'short_name'])
embeddings = df_cleaned.values.tolist()

len(embeddings), df_cleaned.shape

(18263, (18263, 75))

#### Vector Database

In [4]:
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name = "player-vector-database",
    metadata={"hnsw:space": "cosine"}
)

#### Add items

In [5]:
collection.add(
    embeddings= embeddings,
    #metadatas=[{"source": "source a"}, {"source": "source b"}],
    ids=ids
)

#### Query

In [6]:
### custom target
target = df[df['player_id'] == 241852]

# prepare target
target_id = target['player_id'].iloc[0]
target_cleaned = target.drop(columns=['player_id', 'short_name'])
target_embedding = target_cleaned.values.tolist()

# display query object
print("Query Player")
display(df_player_information[df_player_information['player_id'] == target_id])


Query Player


Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
94,241852,M. Diaby,Aston Villa,23,84,"RM, LM",Left,France,60.5 mio,105.0 k,3


fix n_results error

In [7]:
query_player = target_embedding

results = collection.query(
    query_embeddings=query_player,
    n_results=18262
)

query_idx = list(results.get('ids')[0])

# output player names
"""
print("Similar players")

for idx in query_idx:
    id_int = int(idx)
    display(df_player_information[df_player_information['player_id'] == id_int])
"""


'\nprint("Similar players")\n\nfor idx in query_idx:\n    id_int = int(idx)\n    display(df_player_information[df_player_information[\'player_id\'] == id_int])\n'

In [8]:
# calculate percentage of similarity
distances = results['distances'][0] 
similarity_percentage = []
for dist in distances:
    similarity_percentage.append((1 - dist) * 100)
    #print(f"Similarity percentage with base query item: {((1 - dist) * 100):.2f}%")

result_dic = {
    'id' : query_idx,
    'similarity percentage' : similarity_percentage,
    'similarity' : distances
}


result_df = pd.DataFrame.from_dict(result_dic)
result_df

result_df['id'] = result_df['id'].astype(int)
merged_data = pd.merge(df_player_information, result_df, left_on='player_id', right_on='id', how='left').drop(columns=['id'])


In [9]:
merged_data.isna().sum()

player_id                   0
short_name                  0
club_name                   0
age                         0
overall                     0
player_positions            0
preferred_foot              0
nationality_name            0
value_eur                   0
wage_eur                    0
international_reputation    0
similarity percentage       1
similarity                  1
dtype: int64

In [10]:
top_players = merged_data.sort_values('similarity',ascending=True)

display(top_players)
#display(top_players.loc[(top_players['value_eur']<8000000) & (top_players['age'] < 23)][:10])

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
94,241852,M. Diaby,Aston Villa,23,84,"RM, LM",Left,France,60.5 mio,105.0 k,3,100.000024,-2.384186e-07
1731,227055,Gelson Martins,Monaco,28,75,"RM, LM",Right,Portugal,5.5 mio,43.0 k,3,98.513985,1.486015e-02
2298,248231,H. López Muñoz,Godoy Cruz,22,73,"CAM, RM, LM",Left,Argentina,7.0 mio,8999,1,98.508936,1.491064e-02
706,232639,R. Doan,SC Freiburg,25,78,"RM, CAM",Left,Japan,18.5 mio,29.0 k,2,98.472679,1.527321e-02
923,234612,J. Ikoné,Fiorentina,25,77,"RW, RM",Left,France,14.0 mio,51.0 k,3,98.467147,1.532853e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18249,275474,Liu Jun,Qingdao Hainiu,33,48,GK,Right,China PR,15.0 k,1999,1,46.517223,5.348278e-01
18156,241317,Xue Qinghao,Nantong Zhiyun,22,50,GK,Right,China PR,60.0 k,1999,1,46.438962,5.356104e-01
18225,267946,Lim Jun Sub,Jeju United,19,48,GK,Right,Korea Republic,100.0 k,1499,1,46.349812,5.365019e-01
13696,274621,J. Salvá,Patronato,36,62,GK,Right,Argentina,45.0 k,1999,1,46.243656,5.375634e-01


#### TODO
- Similarity Comparison to FC Bayern IV

#### Learnings
- Position kann manchmal Ärger machen
- Alter sollte ein Filter sein
- Empfehlungen sind nicht immer absehbar