In [1]:
import pandas as pd
import chromadb
import utils

#### Load Preprocessed Data

In [2]:
# Both files are semicolon-separated. player_attributes feeds the embeddings
# built further down; player_information is used to display and filter results.
df = pd.read_csv(
    "../data/2024/player_attributes.csv",
    sep=';',
)
df_player_information = pd.read_csv(
    "../data/2024/player_information.csv",
    sep=';',
)

display(df_player_information)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
0,231747,K. Mbappé,Paris Saint Germain,24,91,"ST, LW",Right,France,181500000.0,230000.0,5
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0,5
2,192985,K. De Bruyne,Manchester City,32,91,"CM, CAM",Right,Belgium,103000000.0,350000.0,5
3,158023,L. Messi,Inter Miami,36,90,"CF, CAM",Left,Argentina,41000000.0,23000.0,5
4,165153,K. Benzema,Al Ittihad,35,90,"CF, ST",Right,France,51000000.0,95000.0,5
...,...,...,...,...,...,...,...,...,...,...,...
18258,273759,S. Telem,Jamshedpur,20,47,CB,Left,India,100000.0,500.0,1
18259,269013,Jin Liangkuan,Meizhou Hakka,21,47,CB,Right,China PR,100000.0,1000.0,1
18260,272748,M. Mewlan,Shandong Taishan,19,47,ST,Right,China PR,100000.0,2000.0,1
18261,269534,Yin Jie,Zhejiang,21,47,CM,Right,China PR,90000.0,1000.0,1


#### Prepare Data and get embeddings

In [3]:
# One string id per row, taken from player_id.
ids = [str(player_id) for player_id in df['player_id']]

# The identifier columns are dropped; every remaining column becomes one
# component of the player's vector.
df_cleaned = df.drop(['player_id', 'short_name'], axis='columns')
embeddings = df_cleaned.values.tolist()

# Sanity check: number of vectors vs. shape of the feature frame.
len(embeddings), df_cleaned.shape

(18263, (18263, 75))

#### Vector Database

In [4]:
chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once a collection with this name already exists in the client.
# "hnsw:space": "cosine" selects cosine distance for similarity queries.
collection = chroma_client.get_or_create_collection(
    name="player-vector-database",
    metadata={"hnsw:space": "cosine"}
)

#### Add items

In [5]:
# Store one vector per player, keyed by its string id.
# No metadata is attached to the entries.
collection.add(ids=ids, embeddings=embeddings)

#### Query

In [6]:
### custom target
# Change this id to query for a different player.
target_player_id = 239085
target = df[df['player_id'] == target_player_id]
if target.empty:
    # Without this guard the .iloc[0] below fails with an opaque IndexError.
    raise ValueError(f"player_id {target_player_id} not found in the player attributes")

# prepare target: same column drop as for the stored embeddings, so the query
# vector has the same layout as the vectors in the collection
target_id = target['player_id'].iloc[0]
target_cleaned = target.drop(columns=['player_id', 'short_name'])
target_embedding = target_cleaned.values.tolist()

# display query object
print("Query Player")
display(df_player_information[df_player_information['player_id'] == target_id])


Query Player


Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0,5


In [7]:
query_player = target_embedding

# Rank every stored player against the query vector. Asking the collection
# for its own size replaces a hard-coded row count that goes stale as soon
# as the dataset changes.
results = collection.query(
    query_embeddings=query_player,
    n_results=collection.count()
)

# ids returned for the first query vector, in the order the query returned them
query_idx = list(results.get('ids')[0])

#### Evaluate Query Results

In [8]:
# Distances reported for the first query vector, aligned with query_idx.
distances = results['distances'][0]

# Express each distance as a percentage: a distance of 0 maps to 100 %.
similarity_percentage = [(1 - dist) * 100 for dist in distances]

# Collect the per-player scores.
# NOTE: the 'similarity' column holds the raw distances, so smaller values
# mean a closer match.
result_dic = {
    'id': query_idx,
    'similarity percentage': similarity_percentage,
    'similarity': distances,
}
result_df = pd.DataFrame(result_dic)
result_df['id'] = result_df['id'].astype(int)

# Attach the scores to the player information (left join on the player id).
merged_data = (
    df_player_information
    .merge(result_df, left_on='player_id', right_on='id', how='left')
    .drop(columns=['id'])
)
print(merged_data.columns)


Index(['player_id', 'short_name', 'club_name', 'age', 'overall',
       'player_positions', 'preferred_foot', 'nationality_name', 'value_eur',
       'wage_eur', 'international_reputation', 'similarity percentage',
       'similarity'],
      dtype='object')


### Query Top 10 Results

In [15]:
# Ten closest players: ascending sort on the 'similarity' column, first ten rows.
merged_data.sort_values(by='similarity').head(10)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,185000000.0,340000.0,5,99.999982,1.788139e-07
75,192387,C. Immobile,Lazio,33,85,ST,Right,Italy,34500000.0,90000.0,4,98.930234,0.01069766
124,192505,R. Lukaku,Roma,30,84,ST,Left,Belgium,41500000.0,165000.0,4,98.686141,0.01313859
18,232293,V. Osimhen,Napoli,24,88,ST,Right,Nigeria,126500000.0,120000.0,3,98.369312,0.01630688
5,188545,R. Lewandowski,FC Barcelona,34,90,ST,Right,Poland,58000000.0,340000.0,5,98.358369,0.01641631
165,234236,P. Schick,Bayer 04 Leverkusen,27,83,ST,Left,Czech Republic,42500000.0,80000.0,3,98.356229,0.01643771
202,253072,D. Núñez,Liverpool,24,82,"ST, LW",Right,Uruguay,48500000.0,120000.0,3,98.321491,0.01678509
179,201153,Morata,Atlético Madrid,30,83,ST,Right,Spain,35500000.0,80000.0,3,98.197126,0.01802874
416,228941,André Silva,Real Sociedad,27,80,ST,Right,Portugal,24500000.0,63000.0,3,98.14291,0.0185709
1057,226537,V. Janssen,Antwerp,29,77,ST,Left,Netherlands,10500000.0,35000.0,3,98.107916,0.01892084


### Top 10 + Filtering

In [13]:
# Sort players so the closest matches (smallest 'similarity' value) come first
top_players = merged_data.sort_values('similarity',ascending=True)

# Keep the ten closest players valued under 10 million and younger than 25.
# .copy() makes output_df an independent frame, so overwriting its columns
# below does not trigger pandas' SettingWithCopyWarning.
output_df = top_players.loc[(top_players['value_eur']<10000000) & (top_players['age'] < 25)][:10].copy()

# Replace the numeric money columns with their formatted display values.
output_df['value_eur'] = output_df['value_eur'].apply(utils.adjust_money_appearance)
output_df['wage_eur'] = output_df['wage_eur'].apply(utils.adjust_money_appearance)
display(output_df)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
1837,247679,V. Boniface,Bayer 04 Leverkusen,22,74,ST,Right,Nigeria,€8.5 Mio,€37.0k,1,97.106445,0.028936
2377,234671,J. Odgaard,AZ Alkmaar,24,73,"RW, ST",Left,Denmark,€4.6 Mio,€9000,1,96.935862,0.030641
1850,233795,B. Brereton Díaz,Villarreal,24,74,"LM, ST",Right,Chile,€6.5 Mio,€23.0k,1,96.579587,0.034204
1486,246919,T. Čvančara,Borussia Mönchengladbach,22,75,"ST, RW",Right,Czech Republic,€8.5 Mio,€24.0k,1,96.537393,0.034626
1806,244257,J. Burkardt,FSV Mainz 05,22,74,"ST, CF",Right,Germany,€9.5 Mio,€23.0k,1,96.407002,0.03593
2296,238756,J. Larsen,Celta de Vigo,23,73,ST,Right,Norway,€7.0 Mio,€18.0k,1,96.376771,0.036232
3645,263296,L. Gondou,Argentinos Juniors,22,71,ST,Left,Argentina,€3.8 Mio,€9000,1,96.079272,0.039207
5253,277832,C. Ikwuemesi,Salernitana,21,69,ST,Right,Nigeria,€3.2 Mio,€12.0k,1,96.04876,0.039512
1465,265800,Jutglà,Club Brugge,24,75,ST,Right,Spain,€9.0 Mio,€21.0k,1,95.98673,0.040133
2418,242075,J. Sargent,Norwich City,23,73,"ST, RM",Right,United States,€4.4 Mio,€27.0k,1,95.88452,0.041155


In [11]:
def filter(df,top_n=10, age=None, age_younger=None, age_older=None, value_less=None,value_more=None, wage=None):
    """Return the first ``top_n`` rows of ``df`` that satisfy all given filters.

    Rows keep the order they have in ``df``, so pass a frame that is already
    sorted by relevance. Every filter is applied to the full frame first and
    the result is truncated afterwards; truncating earlier would discard
    matching rows that sit further down the ranking.

    NOTE: this function shadows the built-in ``filter`` in the notebook
    namespace; the name is kept so existing calls continue to work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``age``, ``value_eur`` and ``wage_eur``.
        The frame is not modified.
    top_n : int or None
        Maximum number of rows to return; ``None`` returns every match.
    age_younger : number, optional
        Keep rows with ``age <= age_younger``.
    age_older : number, optional
        Keep rows with ``age >= age_older``.
    value_less : number, optional
        Keep rows with ``value_eur <= value_less``.
    value_more : number, optional
        Keep rows with ``value_eur >= value_more``.
    age, wage : optional
        Accepted for signature compatibility; currently not applied.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``value_eur`` and ``wage_eur`` columns have been
        passed through ``utils.adjust_money_appearance``.
    """
    matches = df

    # `is not None` (instead of truthiness) keeps a threshold of 0 meaningful.
    if age_younger is not None:
        matches = matches.loc[matches['age'] <= age_younger]
    if age_older is not None:
        matches = matches.loc[matches['age'] >= age_older]
    if value_less is not None:
        matches = matches.loc[matches['value_eur'] <= value_less]
    if value_more is not None:
        matches = matches.loc[matches['value_eur'] >= value_more]

    # Truncate only after all filters ran, then copy so that the formatting
    # below never writes into the caller's frame.
    result = matches.iloc[:top_n].copy()

    # prepare return df
    result['value_eur'] = result['value_eur'].apply(utils.adjust_money_appearance)
    result['wage_eur'] = result['wage_eur'].apply(utils.adjust_money_appearance)
    return result


# Example call: players aged 17 or older, with a value cap of 10 billion.
filter(
    top_players,
    age_older=17,
    value_less=10_000_000_000,
)

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
1,239085,E. Haaland,Manchester City,22,91,ST,Left,Norway,€185.0 Mio,€340.0k,5,99.999982,1.788139e-07
75,192387,C. Immobile,Lazio,33,85,ST,Right,Italy,€34.5 Mio,€90.0k,4,98.930234,0.01069766
124,192505,R. Lukaku,Roma,30,84,ST,Left,Belgium,€41.5 Mio,€165.0k,4,98.686141,0.01313859
18,232293,V. Osimhen,Napoli,24,88,ST,Right,Nigeria,€126.5 Mio,€120.0k,3,98.369312,0.01630688
5,188545,R. Lewandowski,FC Barcelona,34,90,ST,Right,Poland,€58.0 Mio,€340.0k,5,98.358369,0.01641631
165,234236,P. Schick,Bayer 04 Leverkusen,27,83,ST,Left,Czech Republic,€42.5 Mio,€80.0k,3,98.356229,0.01643771
202,253072,D. Núñez,Liverpool,24,82,"ST, LW",Right,Uruguay,€48.5 Mio,€120.0k,3,98.321491,0.01678509
179,201153,Morata,Atlético Madrid,30,83,ST,Right,Spain,€35.5 Mio,€80.0k,3,98.197126,0.01802874
416,228941,André Silva,Real Sociedad,27,80,ST,Right,Portugal,€24.5 Mio,€63.0k,3,98.14291,0.0185709
1057,226537,V. Janssen,Antwerp,29,77,ST,Left,Netherlands,€10.5 Mio,€35.0k,3,98.107916,0.01892084


#### Filtering with `DataFrame.query()`

In [12]:
# Ad-hoc filter via DataFrame.query: value_eur below 5.5 million and
# wage_eur of exactly 43,000; rows keep the order of top_players.
top_players.query('value_eur < 5500000 & wage_eur == 43000')

Unnamed: 0,player_id,short_name,club_name,age,overall,player_positions,preferred_foot,nationality_name,value_eur,wage_eur,international_reputation,similarity percentage,similarity
2865,254243,E. Anderson,Newcastle United,20,72,"CM, LW",Right,Scotland,5000000.0,43000.0,1,93.588722,0.064113
2644,205360,K. Roofe,Rangers,30,73,"ST, CF",Right,Jamaica,3000000.0,43000.0,1,93.576646,0.064234
989,198946,D. D'Ambrosio,Monza,34,77,CB,Right,Italy,3400000.0,43000.0,2,92.932636,0.070674
2044,189218,R. Jack,Rangers,31,74,"CM, CDM",Right,Scotland,3500000.0,43000.0,1,92.496425,0.075036
3571,270390,Marquinhos,Nantes,20,71,"RW, RM",Left,Brazil,4100000.0,43000.0,1,91.70841,0.082916
3545,267991,M. Perrone,Las Palmas,20,71,"CM, CDM",Left,Argentina,4500000.0,43000.0,1,88.994533,0.110055
3100,210635,K. Hause,Aston Villa,27,72,CB,Left,England,2500000.0,43000.0,1,88.336527,0.116635
3289,203775,L. Karius,Newcastle United,30,72,GK,Right,Germany,1600000.0,43000.0,2,64.240032,0.3576
1973,206652,Sergio Rico,Paris Saint Germain,29,74,GK,Right,Spain,3200000.0,43000.0,2,64.171427,0.358286


#### TODO
- Similarity Comparison to FC Bayern IV

#### Learnings
- Position kann manchmal Ärger machen
- Alter sollte ein Filter sein
- Empfehlungen sind nicht immer absehbar