# In [None]:
import json, ollama
import pandas as pd
import numpy as np
from modules import userinput
from modules import filehandler
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

# Show full cell contents instead of truncating wide text columns
pd.set_option('display.max_colwidth', None)

# Interactive file selection is currently disabled; the input file is fixed:
# user_choice = userinput.get_user_input("Select file by index (0, 1, 2, ...): ", "0")
# selected_file = file_list[int(user_choice)]
selected_file = 'input/20251120-mfi-products.csv'

# Read the CSV file into a DataFrame.
# Per the original notes, the reader defaults to 'utf_8' encoding and ';' as
# field separator; pass other values (e.g. 'cp1252', ',') if the file needs them.
df = filehandler.read_csv_file(selected_file)

# Columns that are not wanted in the working frame
drop_columns = ['shelflife', 'CN_code', 'country_of_origin']

# Build the working frame from a copy so `df` itself stays untouched:
# remove the unwanted columns and give the text columns their short names
df_temp = (
    df.copy()
    .drop(columns=drop_columns)
    .rename(columns={'product_name_alias': 'title', 'product_description': 'description'})
)


# Function to keep only string values and True boolean values
def keep_string_or_true(row):
    """Serialize the non-blank strings and True flags of a row to JSON.

    Args:
        row: Mapping-like object exposing ``items()``, for example the
            pandas Series handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        A JSON object string holding only the entries whose value is a
        non-blank ``str`` or a boolean True. Missing values (None, NaN),
        False, numbers and values of any other type are left out.
    """
    kept = {}
    for col, val in row.items():
        # No explicit missing-value test is needed: None/NaN/NA are neither
        # str nor bool, so they fall through both branches. This also avoids
        # the ambiguous-truth error pd.isna() causes on list-like cells.
        if isinstance(val, str):
            # Whitespace-only strings carry no information
            if val.strip():
                kept[col] = val
        elif isinstance(val, (bool, np.bool_)) and val:
            # numpy booleans fail an ``is True`` identity check and are not
            # JSON serializable, so store the built-in True instead
            kept[col] = True
    return json.dumps(kept)

# Add a column holding, per row, the JSON string of the kept values
df_temp['json_data'] = df_temp.apply(keep_string_or_true, axis=1)


# Option A: work with the JSON strings directly (one cell per row)
# json_series = df_temp['json_data']

# Option B: decode the JSON strings and expand them back to columns
# (sparse; missing entries become NaN)
# expanded_df = pd.json_normalize(df_temp['json_data'].map(json.loads).tolist())

# If you want df_temp to be the expanded result uncomment:
# df_temp = expanded_df


# Keep only the columns used for the embeddings and the Qdrant payload.
# The explicit .copy() makes df_temp an independent frame, so adding the
# 'embedding' column afterwards does not trigger pandas' SettingWithCopyWarning.
keep_columns = ['title', 'description', 'json_data']
df_temp = df_temp[keep_columns].copy()


# Function to generate embeddings
def generate_embeddings(o, m):
    """Return the embedding Ollama produces for a single input.

    Args:
        o: Text to embed (passed as the prompt).
        m: Name of the Ollama embedding model, e.g. "mxbai-embed-large".

    Returns:
        The value stored under the "embedding" key of the Ollama response.
    """
    # Original author's note: with "mxbai-embed-large" the byte size of a
    # vector is 8248
    response = ollama.embeddings(m, o)
    return response["embedding"]


# Embed every row's JSON summary with the mxbai-embed-large model
df_temp['embedding'] = df_temp['json_data'].apply(
    lambda x: generate_embeddings(x, "mxbai-embed-large")
)
# df_temp.head(-3)

measurer = np.vectorize(len)

# Length of each row's embedding; the largest one sizes the collection's vectors
embedding_lengths = measurer(df_temp['embedding'])
# Cast the numpy integer to a built-in int so a plain Python integer is what
# gets handed to Qdrant's VectorParams(size=...) later on
vector_size = int(embedding_lengths.max(axis=0))
# print(f"Embedding vector size: {vector_size}")
# print(f"Embedding vector sizes: min={embedding_lengths.min()}, max={embedding_lengths.max()}, mean={embedding_lengths.mean()}")


# Collection name and the DataFrame columns stored as payload (title, description)
collection_name = "products_collection"
payload = ["title", "description"]


def prepare_data(df, payload):
    """Build one Qdrant point per DataFrame row.

    Args:
        df: DataFrame with an 'embedding' column plus the columns named
            in ``payload``; the row index value becomes the point id.
        payload: Sequence whose first item names the column stored as
            "title" and whose second item names the column stored as
            "description".

    Returns:
        A list of PointStruct objects, in row order.
    """
    return [
        PointStruct(
            id=row_id,
            vector=record['embedding'],
            payload={
                "title": record[payload[0]],
                "description": record[payload[1]],
            },
        )
        for row_id, record in df.iterrows()
    ]


# Define collection
def define_collection(c, n, s):
    """Create collection ``n`` on client ``c`` unless it already exists.

    Args:
        c: Qdrant client exposing ``collection_exists`` and
            ``create_collection``.
        n: Name of the collection.
        s: Vector size used when the collection is created.
    """
    if c.collection_exists(n):
        # Nothing to do: an existing collection is left as it is
        return

    vector_settings = VectorParams(size=s, distance=Distance.COSINE)
    c.create_collection(collection_name=n, vectors_config=vector_settings)


# Insert data into the collection
def insert_data(c, n, p):
    """Upsert the points ``p`` into collection ``n`` through client ``c``.

    Args:
        c: Client object exposing an ``upsert`` method.
        n: Name of the target collection.
        p: Points to upsert.
    """
    request = {"collection_name": n, "points": p}
    c.upsert(**request)


def db_init(collection_name, df, payload, vector_size):
    """Create an in-memory Qdrant client and load the DataFrame into it.

    Args:
        collection_name: Name of the collection to create and fill.
        df: DataFrame passed on to ``prepare_data``.
        payload: Payload column names passed on to ``prepare_data``.
        vector_size: Vector size used when the collection is created.

    Returns:
        The QdrantClient holding the populated collection.
    """
    # In-memory instance; for a running server use instead:
    # QdrantClient(host="localhost", port=6333)
    client = QdrantClient(":memory:")

    define_collection(client, collection_name, vector_size)
    insert_data(client, collection_name, prepare_data(df, payload))

    return client