In [4]:
import pandas as pd
import chromadb

#### Load
Preprocessed Data

In [5]:
df = pd.read_csv("../data/2024/cleaned_data.csv", sep=';')


Player Information

In [6]:
data = pd.read_csv('../data/2024/male_players.csv')
data = data[data['fifa_version'] == 24.0]
player_information = ['player_id', 'short_name','club_name' ,'age', 'overall', 'player_positions','preferred_foot' , 'nationality_name','value_eur','wage_eur', ]
df_player_information = data[player_information]
display(df_player_information)

  data = pd.read_csv('../data/2024/male_players.csv')


Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur
0,231747,K. Mbappé,Paris Saint Germain,24,91,"ST, LW",Right,France,181500000.0,230000.0
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0
2,192985,K. De Bruyne,Manchester City,32,91,"CM, CAM",Right,Belgium,103000000.0,350000.0
3,158023,L. Messi,Inter Miami,36,90,"CF, CAM",Left,Argentina,41000000.0,23000.0
4,165153,K. Benzema,Al Ittihad,35,90,"CF, ST",Right,France,51000000.0,95000.0
...,...,...,...,...,...,...,...,...,...,...
18345,273759,S. Telem,Jamshedpur,20,47,CB,Left,India,100000.0,500.0
18346,269013,Jin Liangkuan,Meizhou Hakka,21,47,CB,Right,China PR,100000.0,1000.0
18347,272748,M. Mewlan,Shandong Taishan,19,47,ST,Right,China PR,100000.0,2000.0
18348,269534,Yin Jie,Zhejiang,21,47,CM,Right,China PR,90000.0,1000.0


#### Prepare Data and get embeddings

In [7]:
ids = df['player_id'].astype(str).tolist()
df_cleaned = df.drop(columns=['player_id', 'short_name'])
embeddings = df_cleaned.values.tolist()

#### Vector Database

In [8]:
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name = "player_database",
    metadata={"hnsw:space": "cosine"}
)

#### Add items

In [9]:
collection.add(
    embeddings= embeddings,
    #metadatas=[{"source": "source a"}, {"source": "source b"}],
    ids=ids
)

#### Query

In [10]:
### custom target
target = df[df['player_id'] == 239085]

# prepare target
target_id = target['player_id'].iloc[0]
target_cleaned = target.drop(columns=['player_id', 'short_name'])
target_embedding = target_cleaned.values.tolist()

In [11]:
query_player = target_embedding

results = collection.query(
    query_embeddings=query_player,
    n_results=len(df_player_information)
)
#

# display query object
print("Queried Player")
display(df_player_information[df_player_information['player_id'] == target_id])

query_idx = list(results.get('ids')[0])

# output player names
"""
print("Similar players")

for idx in query_idx:
    id_int = int(idx)
    display(df_player_information[df_player_information['player_id'] == id_int])
"""


Number of requested results 18350 is greater than number of elements in index 18263, updating n_results = 18263


Queried Player


Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0


'\nprint("Similar players")\n\nfor idx in query_idx:\n    id_int = int(idx)\n    display(df_player_information[df_player_information[\'player_id\'] == id_int])\n'

In [12]:
# calculate percentage of similarity
distances = results['distances'][0] 
similarity_percentage = []
for dist in distances:
    similarity_percentage.append((1 - dist) * 100)
    #print(f"Similarity percentage with base query item: {((1 - dist) * 100):.2f}%")

result_dic = {
    'id' : query_idx,
    'similarity percentage' : similarity_percentage,
    'similarity' : distances
}


In [13]:
result_df = pd.DataFrame.from_dict(result_dic)
result_df


Unnamed: 0,id,similarity percentage,similarity
0,239085,99.999982,1.788139e-07
1,192387,98.930234,1.069766e-02
2,192505,98.686141,1.313859e-02
3,232293,98.369312,1.630688e-02
4,188545,98.358369,1.641631e-02
...,...,...,...
18258,277587,40.942222,5.905778e-01
18259,275661,40.362579,5.963742e-01
18260,247670,40.026224,5.997378e-01
18261,259731,39.847600,6.015240e-01


In [14]:
result_df['id'] = result_df['id'].astype(int)
merged_data = pd.merge(df_player_information, result_df, left_on='player_id', right_on='id', how='left').drop(columns=['id'])


In [15]:
merged_data.isna().sum()

player_id                  0
short_name                 0
club_name                 87
age                        0
overall                    0
player_positions           0
preferred_foot             0
nationality_name           0
value_eur                100
wage_eur                  87
similarity percentage     87
similarity                87
dtype: int64

In [16]:
top_players = merged_data.sort_values('similarity percentage',ascending=False)
display(top_players[top_players['wage_eur']<60000][:10])

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,similarity percentage,similarity
1061,226537,V. Janssen,Antwerp,29,77,ST,Left,Netherlands,10500000.0,35000.0,98.107916,0.018921
925,231628,Rafa Mir,Sevilla,26,77,ST,Right,Spain,14000000.0,29000.0,98.092079,0.019079
721,222357,B. Embolo,Monaco,26,78,ST,Right,Switzerland,17500000.0,54000.0,97.964162,0.020358
551,216549,A. Sørloth,Villarreal,27,79,ST,Left,Norway,20500000.0,40000.0,97.848874,0.021511
196,224179,Borja Iglesias,Real Betis,30,83,ST,Right,Spain,35500000.0,41000.0,97.822863,0.021771
236,235410,Y. En-Nesyri,Sevilla,26,82,ST,Left,Morocco,36500000.0,42000.0,97.777992,0.02222
223,223197,E. Ünal,Getafe,26,82,ST,Right,Turkey,38000000.0,44000.0,97.648412,0.023516
1365,229071,K. Onisiwo,FSV Mainz 05,31,76,"ST, CF",Right,Austria,6500000.0,29000.0,97.612137,0.023879
432,184200,M. Arnautović,Inter,34,80,ST,Right,Austria,10500000.0,44000.0,97.603804,0.023962
695,237321,M. Berisha,TSG Hoffenheim,25,78,"ST, CF",Right,Germany,20000000.0,38000.0,97.56881,0.024312


#### TODO
- Similarity Comparison to FC Bayern IV

#### Learnings
- Position kann manchmal Ärger machen
- Alter sollte ein Filter sein
- Empfehlungen sind nicht immer absehbar