In [1]:
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import os

In [2]:
from qdrant_client import QdrantClient

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Fail fast when the URL is missing: os.getenv returns None in that case, the client
# would then not be pointed at the intended cluster, and later cells delete and
# re-create collections on whatever server it reaches.
if not QDRANT_URL:
    raise RuntimeError(
        "QDRANT_URL is not set; define it in the environment or in a .env file."
    )

qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

# Connectivity check: list the collections visible with these credentials.
print(qdrant_client.get_collections())

collections=[CollectionDescription(name='flowers_policy'), CollectionDescription(name='flowers'), CollectionDescription(name='mlops_document')]


In [3]:
# Drop the "transaction_description" collection, i.e. the collection name the predictor
# cells below ingest into.
# NOTE(review): presumably a reset so ingestion starts from an empty collection - confirm;
# this call is destructive.
qdrant_client.delete_collection(collection_name="transaction_description")

True

# 1. Load dataset

In [4]:
# Read the reference dataset from a Feather file and preview its first rows.
df = pd.read_feather("../dataset.feather")
df.head()

Unnamed: 0,transaction_description,sensitive_type,sensitive_label,transaction_type
0,High-performance computer processor for use in...,dual_use_items,sensitive,
1,Advanced encryption software for secure commun...,dual_use_items,sensitive,
2,High-sensitivity microphones for use in survei...,dual_use_items,sensitive,
3,High-resolution cameras for use in surveillanc...,dual_use_items,sensitive,
4,Advanced navigation system for use in aviation...,dual_use_items,sensitive,


In [5]:
# Number of rows in the reference dataset.
len(df)

9925

# 2. Ingest into Vector DB

In [6]:
from vector_shield.vector import QdrantVectorStorePredictor

In [7]:
# Candidate embedding models compared later in the notebook.
# Jina v3 is loaded through HuggingFaceEmbeddings on the CUDA device with
# trust_remote_code enabled; embeddings are not normalised at encode time.
model_name = "jinaai/jina-embeddings-v3"
model_kwargs = dict(device='cuda', trust_remote_code=True)
encode_kwargs = dict(normalize_embeddings=False)
jina_embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# The two remaining candidates go through the Ollama wrapper and differ only in model tag.
nomic_embeddings = OllamaEmbeddings(model="nomic-embed-text:v1.5")
mxbai_embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

In [7]:
# Predictor that stores and queries documents in the "transaction_description" collection.
# NOTE(review): presumably retrieval_top_k candidates are fetched and then reranked down to
# reranker_top_k - confirm against QdrantVectorStorePredictor.
qdrant_db_predictor = QdrantVectorStorePredictor(
    embeddings=jina_embeddings,
    retrieval_top_k=20,
    reranker_top_k=5,
    # Use the QDRANT_URL / QDRANT_API_KEY values read from the environment in the
    # client-setup cell; no `config` object is imported or defined in this notebook,
    # so `config.QDRANT_URL` raised NameError on a fresh kernel.
    db_url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name="transaction_description"
)

In [8]:
# Ingest the reference dataset into the vector database.
# Each keyword argument after `df` is passed a column name of `df`
# (presumably telling the predictor which column holds that field - confirm).
qdrant_db_predictor.fit(
    df=df,
    transaction_description="transaction_description",
    sensitive_type="sensitive_type",
    sensitive_label="sensitive_label",
    transaction_type="transaction_type"
)

Preparing chunks: 100%|██████████| 9925/9925 [00:00<00:00, 14190.06it/s]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.


# 2. Retrieve from Vector DB

In [92]:
# Ad-hoc query text, reused by the two predict cells that follow.
transaction_description = """
AK-47 with 7.62mm bullet
"""

# Fetch the stored documents the predictor returns for the query and display them.
relevant_docs = qdrant_db_predictor.retrieve(transaction_description)
relevant_docs

[Document(metadata={'id': 16, 'relevance_score': 0.05921118, 'sensitive_type': 'weaponry', 'transaction_type': None, 'sensitive_label': 'sensitive', '_id': 'e9504535-e00e-4fcb-8ec5-3e02194f6375', '_collection_name': 'transaction_description'}, page_content='Advanced machine gun with high-rate of fire and armor-piercing ammunition.'),
 Document(metadata={'id': 15, 'relevance_score': 0.03589097, 'sensitive_type': 'weaponry', 'transaction_type': None, 'sensitive_label': 'sensitive', '_id': 'fa47ee1f-6516-446b-a957-4a64eae93d93', '_collection_name': 'transaction_description'}, page_content='Advanced main battle rifle with high-velocity ammunition and ballistic computer.'),
 Document(metadata={'id': 17, 'relevance_score': 0.034532577, 'sensitive_type': 'weaponry', 'transaction_type': None, 'sensitive_label': 'sensitive', '_id': '36c678c4-a1d2-47b2-b966-632d9e05e765', '_collection_name': 'transaction_description'}, page_content='High-explosive and armor-piercing ammunition for use in small a

In [93]:
# Classify the query without requesting the textual explanation and display the result.
predict_data = qdrant_db_predictor.predict(transaction_description, explain=False)
predict_data

{'prediction': 'sensitive', 'probability_score': 1.0}

In [94]:
# Same query with explain=True; the result's 'explain' entry is printed as text.
predict_data = qdrant_db_predictor.predict(transaction_description, explain=True)
print(predict_data['explain'])

The transaction description seems having similar meaning with the **sensitive** descriptions below: 

1.**Transaction description:** Advanced machine gun with high-rate of fire and armor-piercing ammunition.
  **Sensitive Type:** weaponry
  **Transaction Type:** None

2.**Transaction description:** Advanced main battle rifle with high-velocity ammunition and ballistic computer.
  **Sensitive Type:** weaponry
  **Transaction Type:** None

3.**Transaction description:** High-explosive and armor-piercing ammunition for use in small arms and machine guns.
  **Sensitive Type:** weaponry
  **Transaction Type:** None

4.**Transaction description:** High-velocity armor-piercing bullet for anti-tank and anti-vehicle applications
  **Sensitive Type:** weaponry
  **Transaction Type:** None

5.**Transaction description:** High-velocity sniper bullet for anti-personnel and anti-vehicle applications
  **Sensitive Type:** weaponry
  **Transaction Type:** None




# 3. Load test data

In [9]:
# Load the evaluation set from an Excel workbook in the notebook's working directory.
test_df = pd.read_excel('test_data.xlsx')

In [10]:
# Preview the first rows of the evaluation set.
test_df.head()

Unnamed: 0,Description,Sensitive_Label,Is_Sensitive
0,Shirt in airy cotton muslin with a resort coll...,Not_Sensitive,0
1,T-shirt in lightweight cotton jersey with a ro...,Not_Sensitive,0
2,Trousers in an airy cotton and linen weave wit...,Not_Sensitive,0
3,Short running shorts in DryMove™ functional fa...,Not_Sensitive,0
4,"Lightweight, slim-fit running vest in breathab...",Not_Sensitive,0


In [98]:
# Binary predictions (1 = 'sensitive', 0 = anything else) for every test description,
# in test_df row order so they line up with test_df['Is_Sensitive'].
# Iterating the column directly avoids the per-row Series that iterrows() builds, and the
# comprehension no longer overwrites the `transaction_description` query variable that
# the retrieval cells above define.
predict_array = [
    int(qdrant_db_predictor.predict(description, explain=False)['prediction'] == 'sensitive')
    for description in tqdm(test_df['Description'], desc="Predicting...", total=len(test_df))
]

Predicting...: 100%|██████████| 50/50 [00:40<00:00,  1.25it/s]


In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_classification_metrics(y_true, y_pred, average='binary'):
    """
    Evaluate classification metrics: Precision, Recall, and F1 Score.

    Parameters:
        y_true (list or array): Ground truth (true labels).
        y_pred (list or array): Predicted labels.
        average (str): Averaging mode forwarded unchanged to the three scikit-learn
            scorers. Defaults to 'binary' (the previous hard-coded behaviour); pass
            'macro' or 'weighted' for multiclass labels.

    Returns:
        dict: Dictionary with the keys 'precision', 'recall' and 'f1_score'.
    """
    # The same averaging mode is used for all three scores so they stay comparable.
    return {
        'precision': precision_score(y_true, y_pred, average=average),
        'recall': recall_score(y_true, y_pred, average=average),
        'f1_score': f1_score(y_true, y_pred, average=average)
    }



In [13]:
# Example Usage
y_true = test_df['Is_Sensitive']  # Ground truth labels
y_pred = predict_array # Predicted labels

metrics = evaluate_classification_metrics(y_true, y_pred)
print(metrics)


{'precision': 0.8275862068965517, 'recall': 0.96, 'f1_score': 0.8888888888888888}


In [None]:
# Remove this predictor's collection; the tuning section below ingests again under the
# same collection name.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved
# session - confirm whether it is still required.
qdrant_db_predictor.delete_collection()

## 4. Hyperparameter Tuning

In [14]:
import itertools

# Search space: each key is a tunable setting, each list its candidate values.
params_list = {
    'embedding_model': ['jina_embeddings', 'nomic_embeddings', 'mxbai_embeddings'],
    'reranker_top_k': [3, 5, 7]
}

# Cartesian product of the candidate lists, one dict per grid point
# (the last key varies fastest).
keys = tuple(params_list.keys())
values = tuple(params_list.values())
combinations = [
    {key: choice for key, choice in zip(keys, choices)}
    for choices in itertools.product(*values)
]

# Show every grid point, one per line.
for combo in combinations:
    print(combo)


{'embedding_model': 'jina_embeddings', 'reranker_top_k': 3}
{'embedding_model': 'jina_embeddings', 'reranker_top_k': 5}
{'embedding_model': 'jina_embeddings', 'reranker_top_k': 7}
{'embedding_model': 'nomic_embeddings', 'reranker_top_k': 3}
{'embedding_model': 'nomic_embeddings', 'reranker_top_k': 5}
{'embedding_model': 'nomic_embeddings', 'reranker_top_k': 7}
{'embedding_model': 'mxbai_embeddings', 'reranker_top_k': 3}
{'embedding_model': 'mxbai_embeddings', 'reranker_top_k': 5}
{'embedding_model': 'mxbai_embeddings', 'reranker_top_k': 7}


In [15]:
# Grid search over (embedding model, reranker_top_k). Every combination re-ingests the
# dataset into the shared collection, scores the test set and keeps the best F1.
best_f1_score = 0
best_combination = {
    'embedding_model':'',
    'reranker_top_k':0
}

# Name -> embedding object. A lookup raises KeyError for an unknown name instead of
# silently reusing the `embeddings` left over from the previous iteration, which is
# what the former if/elif chain (no else branch) did.
embedding_models = {
    'jina_embeddings': jina_embeddings,
    'nomic_embeddings': nomic_embeddings,
    'mxbai_embeddings': mxbai_embeddings,
}

for combo in tqdm(combinations, desc="Finding best combinations", total=len(combinations)):
    embeddings = embedding_models[combo['embedding_model']]
    reranker_top_k = combo['reranker_top_k']

    qdrant_db_predictor = QdrantVectorStorePredictor(
        embeddings=embeddings,
        retrieval_top_k=20,
        reranker_top_k=reranker_top_k,
        # Use the QDRANT_URL / QDRANT_API_KEY values read from the environment in the
        # client-setup cell; `config` is never imported or defined in this notebook.
        db_url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
        collection_name="transaction_description"
    )

    try:
        qdrant_db_predictor.fit(
            df=df,
            transaction_description="transaction_description",
            sensitive_type="sensitive_type",
            sensitive_label="sensitive_label",
            transaction_type="transaction_type"
        )

        # 1 = predicted 'sensitive', 0 = anything else, in test_df row order.
        predict_array = [
            int(qdrant_db_predictor.predict(description, explain=False)['prediction'] == 'sensitive')
            for description in tqdm(test_df['Description'], desc="Predicting...", total=len(test_df))
        ]

        y_true = test_df['Is_Sensitive']  # Ground truth labels
        y_pred = predict_array  # Predicted labels

        metrics = evaluate_classification_metrics(y_true, y_pred)
        f1 = metrics['f1_score']

        # Strict '>' keeps the first combination reached when several tie on F1.
        if f1 > best_f1_score:
            best_f1_score = f1
            best_combination = {
                'embedding_model': combo['embedding_model'],
                'reranker_top_k': combo['reranker_top_k']
            }
    finally:
        # All combinations share one collection name, so always drop it - even when
        # ingestion or scoring fails - to keep leftovers out of the next run.
        qdrant_db_predictor.delete_collection()


Preparing chunks: 100%|██████████| 9925/9925 [00:00<00:00, 13971.57it/s]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.


Predicting...: 100%|██████████| 50/50 [00:46<00:00,  1.07it/s]
INFO:httpx:HTTP Request: DELETE https://8a65de03-3705-4e08-8274-f1b13158eb2c.europe-west3-0.gcp.cloud.qdrant.io:6333/collections/transaction_description "HTTP/1.1 200 OK"
Preparing chunks: 100%|██████████| 9925/9925 [00:00<00:00, 16786.10it/s]]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.


Predicting...: 100%|██████████| 50/50 [00:47<00:00,  1.05it/s]
INFO:httpx:HTTP Request: DELETE https://8a65de03-3705-4e08-8274-f1b13158eb2c.europe-west3-0.gcp.cloud.qdrant.io:6333/collections/transaction_description "HTTP/1.1 200 OK"
Preparing chunks: 100%|██████████| 9925/9925 [00:01<00:00, 7737.77it/s]t]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.


Predicting...: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s]
INFO:httpx:HTTP Request: DELETE https://8a65de03-3705-4e08-8274-f1b13158eb2c.europe-west3-0.gcp.cloud.qdrant.io:6333/collections/transaction_description "HTTP/1.1 200 OK"
Preparing chunks: 100%|██████████| 9925/9925 [00:01<00:00, 9399.97it/s]t]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

Ingest to Vector Database End.


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POS

In [18]:
# Best F1 found by the grid search.
# NOTE(review): the recorded value (0.8727, for jina_embeddings / reranker_top_k=5) is lower
# than the 0.8889 printed by the single-run evaluation with the same settings - the pipeline
# may not be deterministic, or the cells come from different sessions; confirm before
# relying on the ranking.
best_f1_score

0.8727272727272727

In [19]:
# Settings that produced the best F1 in the grid search.
best_combination

{'embedding_model': 'jina_embeddings', 'reranker_top_k': 5}

## 5. Ingest the data using Jina Embeddings

In [8]:
# NOTE(review): the first ingestion section imports this class from `vector_shield.vector`;
# confirm which module path is correct and use the same one in both places.
from vector import QdrantVectorStorePredictor

In [9]:
# Jina v3 embedding model for the final ingestion: loaded through HuggingFaceEmbeddings on
# the CUDA device with trust_remote_code enabled and no normalisation at encode time.
model_name = "jinaai/jina-embeddings-v3"
model_kwargs = dict(device='cuda', trust_remote_code=True)
encode_kwargs = dict(normalize_embeddings=False)
jina_embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)


flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

In [10]:
# Final predictor, built with the settings the grid search reported as best
# (jina_embeddings, reranker_top_k=5) and the credentials read from the environment.
qdrant_db_predictor = QdrantVectorStorePredictor(
    embeddings=jina_embeddings,
    retrieval_top_k=20,
    reranker_top_k=5,
    db_url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name="transaction_description"
)

In [11]:
# Ingest the reference dataset into the "transaction_description" collection.
# Each keyword argument after `df` is passed a column name of `df`
# (presumably telling the predictor which column holds that field - confirm).
qdrant_db_predictor.fit(
    df=df,
    transaction_description="transaction_description",
    sensitive_type="sensitive_type",
    sensitive_label="sensitive_label",
    transaction_type="transaction_type"
)

Preparing chunks: 100%|██████████| 9925/9925 [00:00<00:00, 15897.37it/s]


Ingest to Vector Database Start.


Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

Ingest to Vector Database End.
