In [1]:
import pandas as pd
import numpy as np
import chromadb

# Motivation
The goal is to create a vector database that is filled with data about football players, which makes it possible to find similar players based on their attributes and characteristics. This notebook builds the basic functionality needed to achieve this goal. These are the steps covered in this notebook:
* Build a Vector Database
* Load Players into the Database
* Perform testing queries
* Draw conclusion

The first step is to load the preprocessed data.

In case you want to use Pinecone vector database:

```python
import os
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
api_key = os.getenv("PINECONE_API_KEY")
print(api_key)
```

#### Load player statistics and player information

In [2]:
# Read the preprocessed feature table and the player master data.
# Both CSV files contain an extra 'Unnamed: 0' column (presumably an index
# that was written out on export); it is dropped right after reading.
df = pd.read_csv('../data/preprocessed_data.csv', sep=',').drop(columns=['Unnamed: 0'])
df_player = pd.read_csv('../data/player_data.csv', sep=',').drop(columns=['Unnamed: 0'])

# Show the player master data as the cell output.
df_player

Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
0,212198,Bruno Fernandes,26,88,CAM,Portugal,Manchester United,€250K,€107.5M
1,209658,L. Goretzka,26,87,LDM,Germany,FC Bayern München,€140K,€93M
2,176580,L. Suárez,34,88,RS,Uruguay,Atlético de Madrid,€135K,€44.5M
3,192985,K. De Bruyne,30,91,RCM,Belgium,Manchester City,€350K,€125.5M
4,224334,M. Acuña,29,84,LB,Argentina,Sevilla FC,€45K,€37M
...,...,...,...,...,...,...,...,...,...
16705,240558,18 L. Clayton,17,53,RES,England,Cheltenham Town,€1K,€100K
16706,262846,�. Dobre,20,53,RES,Romania,FC Academica Clinceni,€550,€180K
16707,241317,21 Xue Qinghao,19,47,RES,China PR,Shanghai Shenhua FC,€700,€100K
16708,259646,A. Shaikh,18,47,SUB,India,ATK Mohun Bagan FC,€500,€110K


Drop unnecessary columns.

### Create Vector Database

In [3]:
# Create the Chroma client and the collection that stores one embedding per player.
chroma_client = chromadb.Client()

# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: an already existing "player_database" collection is reused
# rather than raising because the name is taken.
# "hnsw:space": "cosine" makes the index compare embeddings by cosine distance.
collection = chroma_client.get_or_create_collection(
    name="player_database",
    metadata={"hnsw:space": "cosine"},
)


#### Prepare Data
* The `ID` column is taken from the `df` dataframe and stored separately as a one-dimensional list of strings.
* Each row/player is converted into a one-dimensional array (its embedding) and added to the collection.

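Note: the code below adds all players from `df` to the collection. To test with a small sample first (e.g. 50 players), slice `ids` and `embeddings` before adding them.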

In [4]:
# Record ids for the collection: the player IDs, cast to strings.
ids = df['ID'].astype(str).to_list()

# Everything except the ID column forms the embedding of a player;
# each dataframe row becomes one plain Python list.
df_cleaned = df.drop(columns=['ID'])
embeddings = df_cleaned.to_numpy().tolist()


### Add items

In [5]:
# Add every player embedding to the vector database (all rows, not a sample).
# Chroma limits how many records a single add() call may carry, so the
# upload is split into fixed-size batches that stay below that limit.
batch_size = 5000
for start in range(0, len(ids), batch_size):
    stop = start + batch_size
    collection.add(
        embeddings=embeddings[start:stop],
        # metadatas could be passed here as well, one dict per record, e.g.
        # metadatas=[{"source": "source a"}, {"source": "source b"}],
        ids=ids[start:stop],
    )

#### Query item
Create custom query

In [6]:
### random player
#target = df.sample(n=1, random_state=42)

### custom target
# ID of the player to use as the query.
target_player_id = 243042
target = df[df['ID'] == target_player_id]

# Fail with a clear message instead of an opaque IndexError from .iloc[0]
# when the selection above matched no row.
if target.empty:
    raise ValueError("No target player selected: the lookup in df returned no rows")

# prepare target
# target_id is used to look the player up in df_player later on.
target_id = target['ID'].iloc[0]
# The query vector is the target row without its ID column. It stays a
# nested list (one inner list per selected row).
target_cleaned = target.drop(columns='ID')
target_embedding = target_cleaned.values.tolist()

In [10]:
# The embedding of the selected target player is used as the query vector.
query_player = target_embedding

# Ask the collection for the ten closest entries to the query vector.
results = collection.query(query_embeddings=query_player, n_results=10)

# Show the master-data row of the player that was queried.
print("Queried Player")
display(df_player.loc[df_player['ID'] == target_id])

# Record ids of the matches for the (single) query vector.
query_idx = list(results['ids'][0])

# The ids were stored as strings, so each one is converted back to an
# integer before it is matched against the ID column of df_player.
print("Similar players")
for idx in query_idx:
    id_int = int(idx)
    display(df_player.loc[df_player['ID'] == id_int])

Queried Player


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14900,243042,N. Kastenhofer,22,59,LCB,Germany,Hallescher FC,€700,€475K


Similar players


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14900,243042,N. Kastenhofer,22,59,LCB,Germany,Hallescher FC,€700,€475K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
15383,263022,M. Rosenfelder,18,57,LCB,Germany,SC Freiburg II,€500,€350K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14863,253025,21 J. Hamilton,18,62,RCB,Scotland,Hamilton Academical FC,€500,€950K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14126,204219,A. Jackson,27,65,LCB,England,Lincoln City,€3K,€650K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
15115,262917,M. Dietz,19,57,SUB,United States,SC Freiburg II,€500,€375K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14821,262731,C. Taylor,19,58,RCB,England,Bristol Rovers,€2K,€450K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14764,236677,I. Tapia,22,62,LCB,Chile,CD Huachipato,€1K,€925K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14296,261422,A. Nagalo,18,60,LCB,Côte d'Ivoire,FC Nordsjælland,€550,€525K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
15134,262668,J. Makengo,19,56,RES,France,SC Freiburg II,€500,€325K


Unnamed: 0,ID,Name,Age,Overall,Position,Nationality,Club,Wage,Value
14813,259187,E. Oubella,20,58,RES,Germany,Heracles Almelo,€950,€450K


In [11]:
# Distances of the returned entries for the (single) query vector.
distances = results['distances'][0]

# Smaller distance means higher similarity. The distances are min-max scaled
# to a 0-100 range, so the percentages are RELATIVE to this result set:
# the closest entry always gets 100 % and the farthest always gets 0 %.
# default=0.0 keeps max()/min() from raising on an empty result list.
max_distance = max(distances, default=0.0)
min_distance = min(distances, default=0.0)
distance_range = max_distance - min_distance

if distance_range == 0:
    # All distances are equal (e.g. a single result): there is no spread to
    # scale by, so every entry counts as equally (fully) similar and the
    # division by zero is avoided.
    percentages = [100.0 for _ in distances]
else:
    percentages = [((max_distance - dist) / distance_range) * 100 for dist in distances]

# Display the percentage similarities, numbered from 1 in result order.
for i, percentage in enumerate(percentages, 1):
    print(f"Item {i}: Similarity Percentage = {percentage:.2f}%")


Item 1: Similarity Percentage = 100.00%
Item 2: Similarity Percentage = 48.83%
Item 3: Similarity Percentage = 23.84%
Item 4: Similarity Percentage = 22.83%
Item 5: Similarity Percentage = 12.30%
Item 6: Similarity Percentage = 9.05%
Item 7: Similarity Percentage = 8.65%
Item 8: Similarity Percentage = 3.19%
Item 9: Similarity Percentage = 2.92%
Item 10: Similarity Percentage = 0.00%
