In [1]:
import pandas as pd
import chromadb
import utils

#### Load
Preprocessed Data

In [2]:
df = pd.read_csv("../data/2024/player_attributes.csv", sep=';')
df_player_information = pd.read_csv("../data/2024/player_information.csv", sep=';')

display(df_player_information)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
0,231747,K. Mbappé,Paris Saint Germain,24,91,"ST, LW",Right,France,181500000.0,230000.0,5
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0,5
2,192985,K. De Bruyne,Manchester City,32,91,"CM, CAM",Right,Belgium,103000000.0,350000.0,5
3,158023,L. Messi,Inter Miami,36,90,"CF, CAM",Left,Argentina,41000000.0,23000.0,5
4,165153,K. Benzema,Al Ittihad,35,90,"CF, ST",Right,France,51000000.0,95000.0,5
...,...,...,...,...,...,...,...,...,...,...,...
18258,273759,S. Telem,Jamshedpur,20,47,CB,Left,India,100000.0,500.0,1
18259,269013,Jin Liangkuan,Meizhou Hakka,21,47,CB,Right,China PR,100000.0,1000.0,1
18260,272748,M. Mewlan,Shandong Taishan,19,47,ST,Right,China PR,100000.0,2000.0,1
18261,269534,Yin Jie,Zhejiang,21,47,CM,Right,China PR,90000.0,1000.0,1


#### Prepare Data and get embeddings

In [3]:
ids = df['player_id'].astype(str).tolist()
df_cleaned = df.drop(columns=['player_id', 'short_name'])
embeddings = df_cleaned.values.tolist()

len(embeddings), df_cleaned.shape

(18263, (18263, 75))

#### Vector Database

In [4]:
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name = "player-vector-database",
    metadata={"hnsw:space": "cosine"}
)

#### Add items

In [5]:
collection.add(
    embeddings= embeddings,
    #metadatas=[{"source": "source a"}, {"source": "source b"}],
    ids=ids
)

#### Query

In [6]:
### custom target
target = df[df['player_id'] == 241852]

# prepare target
target_id = target['player_id'].iloc[0]
target_cleaned = target.drop(columns=['player_id', 'short_name'])
target_embedding = target_cleaned.values.tolist()

# display query object
print("Query Player")
display(df_player_information[df_player_information['player_id'] == target_id])


Query Player


Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
94,241852,M. Diaby,Aston Villa,23,84,"RM, LM",Left,France,60500000.0,105000.0,3


In [7]:
query_player = target_embedding

results = collection.query(
    query_embeddings=query_player,
    n_results=18263
)

query_idx = list(results.get('ids')[0])

Evaluate Query Results

In [8]:
# calculate percentage of similarity
distances = results['distances'][0] 
similarity_percentage = []
for dist in distances:
    similarity_percentage.append((1 - dist) * 100)
    #print(f"Similarity percentage with base query item: {((1 - dist) * 100):.2f}%")

# store the results and create result dataframe
result_dic = {
    'id' : query_idx,
    'similarity percentage' : similarity_percentage,
    'similarity' : distances
}
result_df = pd.DataFrame.from_dict(result_dic)
result_df
result_df['id'] = result_df['id'].astype(int)

# merge cosine similarites with player information
merged_data = pd.merge(df_player_information, result_df, left_on='player_id', right_on='id', how='left').drop(columns=['id'])
print(merged_data.columns)


Index(['player_id', 'short_name', 'club_name', 'age', 'overall',
       'player_positions', 'preferred_foot', 'nationality_name', 'value_eur',
       'wage_eur', 'international_reputation', 'similarity percentage',
       'similarity'],
      dtype='object')


In [10]:
# Sort players 
top_players = merged_data.sort_values('similarity',ascending=True)
#top_players['value_eur'] = top_players['value_eur'].apply(utils.adjust_money_appearance)
#top_players['wage_eur'] = top_players['wage_eur'].apply(utils.adjust_money_appearance)



# output the results
output_df = top_players.loc[(top_players['value_eur']<8000000) & (top_players['age'] < 23)][:10]
output_df['value_eur'] = output_df['value_eur'].apply(utils.adjust_money_appearance)
output_df['wage_eur'] = output_df['wage_eur'].apply(utils.adjust_money_appearance)
display(output_df)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
2298,248231,H. López Muñoz,Godoy Cruz,22,73,"CAM, RM, LM",Left,Argentina,€7.0 Mio,€8000,1,98.508936,0.014911
4434,258284,T. Pozzo,Independiente,22,70,"RM, LM, ST",Left,Argentina,€3.2 Mio,€7000,1,97.859746,0.021403
2306,260779,S. Adingra,Brighton & Hove Albion,21,73,"LM, ST",Right,Côte d'Ivoire,€7.0 Mio,€40.0k,1,97.580665,0.024193
3584,253036,N. Mbuku,FC Augsburg,21,71,"CF, RM, CAM",Left,France,€4.2 Mio,€14.0k,1,97.539109,0.024609
3640,252929,Borja Sainz,Norwich City,22,71,"LM, RM",Right,Spain,€3.8 Mio,€19.0k,1,97.418362,0.025816
6196,259011,Zito Luvumbo,Cagliari,21,68,"CF, ST, LM",Left,Angola,€3.0 Mio,€8000,1,97.383815,0.026162
2281,260601,E. Zeballos,Boca Juniors,21,73,"RM, RW, LW",Right,Argentina,€7.0 Mio,€12.0k,1,97.352576,0.026474
2309,243134,L. Ramazani,Almería,22,73,"RM, LM, ST",Right,Belgium,€7.0 Mio,€18.0k,2,97.294736,0.027053
1869,259139,David Costa,Lens,22,74,LW,Left,Portugal,€7.0 Mio,€28.0k,1,97.202176,0.027978
2300,251954,C. Summerville,Leeds United,21,73,"RM, LM",Right,Netherlands,€7.0 Mio,€25.0k,1,97.192585,0.028074


In [44]:
# filter params: df, age_younger, age_older, age, wage_less, wage_more, value_less, value_more
def filter(df,top_n=10, age=None, age_younger=None, age_older=None, value_less=None,value_more=None, wage=None):
    
    if(age_younger):
        df = df.loc[(df['age'] <= age_younger)][:top_n]

    if(age_younger and value_less):
        df = df.loc[(df['value_eur']<=value_less) & (df['age'] <= age_younger)][:top_n]

    if(age_younger and value_more):
        df = df.loc[(df['value_eur']>=value_more) & (df['age'] <= age_younger)][:top_n]

    if(age_older):
        df = df.loc[(df['age'] >= age_older)][:top_n]

    if(age_older and value_less):
        df = df.loc[(df['value_eur']<value_less) & (df['age'] >= age_older)][:top_n]

    if(age_older and value_more):
        df = df.loc[(df['value_eur']>=value_more) & (df['age'] >= age_older)][:top_n]

        
    # prepare return df
    df['value_eur'] = df['value_eur'].apply(utils.adjust_money_appearance)
    df['wage_eur'] = df['wage_eur'].apply(utils.adjust_money_appearance)
    return df


filter(top_players, age_older=24, value_less= 10000000)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
1731,227055,Gelson Martins,Monaco,28,75,"RM, LM",Right,Portugal,€5.5 Mio,€43.0k,3,98.513985,0.01486
3498,239048,H. Akbunar,Pendikspor,29,72,"RM, LM",Left,Turkey,€2.3 Mio,€12.0k,1,98.353511,0.016465


#### TODO
- Similarity Comparison to FC Bayern IV

#### Learnings
- Position kann manchmal Ärger machen
- Alter sollte ein Filter sein
- Empfehlungen sind nicht immer absehbar