# PART-3 Create Milvus Database

In [None]:
import ast
import os

import pandas as pd
from dotenv import load_dotenv
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)
from tqdm import tqdm

load_dotenv()

In [None]:
# The CSV location comes from the EMBEDDINGS_PATH environment variable.
# Fail early with a clear message instead of handing None to read_csv.
embeddings_path = os.getenv("EMBEDDINGS_PATH")
if not embeddings_path:
    raise ValueError(
        "EMBEDDINGS_PATH is not set; define it in the environment or .env file"
    )
df = pd.read_csv(embeddings_path)

In [None]:
# Each cell of the "embeddings" column holds a vector serialized as a Python
# list literal. Parse the first row to learn the vector dimension.
# ast.literal_eval only accepts literals, so (unlike eval) content read from
# the CSV can never be executed as code.
num_features = len(ast.literal_eval(df["embeddings"].iloc[0]))

In [None]:
# Register the "default" connection alias against the Milvus server
# at localhost:19530.
connections.connect(
    alias="default",
    host="localhost",
    port="19530",
)

In [None]:
# Collection layout: an explicit INT64 primary key (not auto-generated),
# the job identifier as a string of up to 128 characters, and the embedding
# vector whose dimension was measured from the data.
pk_field = FieldSchema(
    name="pk",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)
job_number_field = FieldSchema(
    name="job_number",
    dtype=DataType.VARCHAR,
    max_length=128,
)
embeddings_field = FieldSchema(
    name="embeddings",
    dtype=DataType.FLOAT_VECTOR,
    dim=num_features,
)
fields = [pk_field, job_number_field, embeddings_field]

In [None]:
# Wrap the field definitions in a schema and bind it to the "jobs" collection.
schema = CollectionSchema(fields=fields, description="Job Description Schema")
job_collection = Collection(name="jobs", schema=schema)

In [None]:
# Work on a small sample of the data.
# NOTE: .loc slicing is label-based and inclusive of the end label, so with the
# default integer index produced by read_csv this selects labels 0..100
# (101 rows when the file is that long).
sample = df.loc[:100]

# Embeddings are stored as stringified list literals; ast.literal_eval parses
# them without executing arbitrary code the way eval would.
features_2d_list = sample["embeddings"].apply(ast.literal_eval).tolist()
job_number_list = sample["Job Id"].tolist()
# The DataFrame index labels are used as the Milvus primary keys.
pk_list = sample.index.tolist()

In [None]:
# Insert the sample into Milvus in fixed-size batches.
batch_size = 100
for start in range(0, len(features_2d_list), batch_size):
    stop = start + batch_size
    # Columns are passed in the same order as the schema fields:
    # pk, job_number, embeddings.
    insert_result = job_collection.insert([
        pk_list[start:stop],
        job_number_list[start:stop],
        features_2d_list[start:stop],
    ])
    print(insert_result)

# Flush once after the last batch so the inserted rows are sealed and
# persisted before an index is built on them.
job_collection.flush()

# Build the index and search for near-duplicates

In [None]:
# Index configuration for the embeddings field: an IVF_FLAT index compared
# with L2 distance, using nlist=32.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 32},
}

In [None]:
# Build the vector index on the "embeddings" field with the settings above.
job_collection.create_index("embeddings", index_params)

In [None]:
# Load the collection so that it can be searched in the cells below.
job_collection.load()

In [None]:
# Query-time settings: same L2 metric as the index, with nprobe=16.
search_params = {
    "metric_type": "L2",
    "params": {
        "nprobe": 16,
    },
}

In [None]:
# Greedy grouping of near-duplicates among the first 100 sample vectors.
# Each vector not yet assigned to a group starts a new one; its 10 nearest
# neighbours closer than 0.2 (and not already grouped) join that group.
# The position in the list is compared directly with the ids returned by
# the search, so this relies on the primary keys matching list positions.
query_vectors = features_2d_list[:100]
duplicates = []
seen = set()
for position, vector in tqdm(enumerate(query_vectors), total=len(query_vectors)):
    if position in seen:
        continue
    seen.add(position)
    members = [position]
    hits = job_collection.search(
        data=[vector],
        anns_field="embeddings",
        param=search_params,
        limit=10,
    )[0]
    for hit in hits:
        if hit.distance < 0.2 and hit.id not in seen:
            seen.add(hit.id)
            members.append(hit.id)
    duplicates.append(members)

# Report only the groups that actually contain more than one entry.
for members in duplicates:
    if len(members) > 1:
        print(members)


In [None]:
# For every group with more than one member, print the first 200 characters
# of each member's job description, then a separator line.
for group in duplicates:
    if len(group) <= 1:
        continue
    # Named row_id rather than id so the builtin id() is not shadowed for the
    # rest of the session.
    for row_id in group:
        print(df.loc[row_id, "Job Description"][:200])
    print("*" * 30)