In [1]:
import pandas as pd
import numpy as np

# Motivation
The Goal is to create vector data bases that gets filled with data about the football players which allows to determine similar players based on their attributes and characteristics. This notebook shall help to build the basic functionality to achieve this goal. These are the steps which are covered in this notebook.
* Build a Vector Database
* Load Players into the Database
* Perform testing queries
* Draw conclusion

The first step is to load the preprocessed data.

In case you want to use Pinecone vector database:

`load_dotenv(find_dotenv())
api_key = os.getenv("PINECONE_API_KEY")
print(api_key)`

In [2]:
df = pd.read_csv('../data/preprocessed_data.csv', sep=',')
df = df.drop(columns='Unnamed: 0')
df.columns

Index(['ID', 'Overall', 'Wage', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Height', 'Weight', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'DefensiveAwareness',
       'Preferred Foot_Left', 'Preferred Foot_Right', 'Work Rate Offense',
       'Work Rate Defense', 'Body Type_Lean', 'Body Type_Normal',
       'Body Type_Stocky', 'Body Type_Unique', 'Global Position_defense',
       'Global Position_goalkeeper', 'Global Position_midfield',
       'Global Position_offense', 'Global Position_outlier', 'Position_CAM',
       

Drop unnesseracy columns.

### Create Vector Database

In [3]:
import chromadb
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name = "player_database")

#### Prepare Data
* The `id` needs is denoted from the `df` dataframe and stored seperately into one dimensional array/list. 
* Each row/player needs to be converted into one dimensional array and gets added to the collection

For testing purposes only 50 players are entered in the first place.

In [4]:
df = df.sample(n=50, random_state=42) 
id_array = df['ID'].values
row_array_1 = df.iloc[0].values
row_array_2 = df.iloc[1].values
id1, id2 = str(id_array[0]), str(id_array[1])
id1

'212198'

### Add items

In [5]:
arr = row_array_1.tolist()
arr2 = row_array_2.tolist()

embeddings = [arr, arr2]
collection.add(
    embeddings= embeddings,
    metadatas=[{"source": "player 1"}, {"source": "player 2"}],
    ids=[id1, id2]
)

In [None]:
"""
collection.add(
    embeddings=[[1.2, 2.3, 4.5], [6.7, 8.2, 9.2]],
    metadatas=[{"source": "source a"}, {"source": "source b"}],
    ids=["id3", "id4"]
)

"""

### Query item

In [7]:
query_player = (temp_df.iloc[2].values).tolist()

results = collection.query(
    query_embeddings=query_player,
    n_results=1
)

In [8]:
print(results)

{'ids': [['id2']], 'distances': [[151339232.0]], 'metadatas': [[{'source': 'player 2'}]], 'embeddings': None, 'documents': [[None]], 'uris': None, 'data': None}


In [None]:
"""
# Iterate through the DataFrame to insert data into ChromaDB
for index, row in df.iterrows():
    document = {
        'id': row['ID'],
        'features': row.drop('ID').to_dict()  # Assuming 'id' is the identifier and other columns are features
    }
    collection.add(document)

# Optionally, you can create an index on the 'id' field for faster retrieval
#chroma_client.create_index("id")

# Close the connection to ChromaDB
#chroma_client.close()
"""

'\n# Iterate through the DataFrame to insert data into ChromaDB\nfor index, row in df.iterrows():\n    document = {\n        \'id\': row[\'ID\'],\n        \'features\': row.drop(\'ID\').to_dict()  # Assuming \'id\' is the identifier and other columns are features\n    }\n    collection.add(document)\n\n# Optionally, you can create an index on the \'id\' field for faster retrieval\n#chroma_client.create_index("id")\n\n# Close the connection to ChromaDB\n#chroma_client.close()\n'