In [5]:
# Connect to Milvus
from pymilvus import MilvusClient

# Milvus server location; edit these two values to target another deployment.
host = "localhost"
port = "19530"

# MilvusClient's documented address parameter is a single `uri`
# (default "http://localhost:19530"). Build it from host/port so that
# changing either value above actually changes the server being contacted.
milvus_client = MilvusClient(
    uri=f"http://{host}:{port}"
)


In [6]:
# Define schema for the collection
from pymilvus import FieldSchema, CollectionSchema, DataType

# Embedding width of Silver Retriever Base (v1.1)
VECTOR_LENGTH = 768

# Field definitions in schema order: primary key, raw text, embedding vector
fields = [
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        description="Primary id",
        is_primary=True,
    ),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        description="Original text content",
        max_length=4096,
    ),
    FieldSchema(
        name="embedding",
        dtype=DataType.FLOAT_VECTOR,
        description="Text embeddings",
        dim=VECTOR_LENGTH,
    ),
]

# Keep each field object reachable under its own name as well
id_field, text_field, embedding_field = fields

# Assemble the collection schema from the field list
schema = CollectionSchema(
    fields=fields,
    description="RAG Texts Collection",
    auto_id=True,               # IDs are generated by Milvus
    enable_dynamic_field=True,  # extra fields can be added later
)


In [7]:
# Create the collection and index it
# Name under which the collection is created on the Milvus server
COLLECTION_NAME = "rag_texts_and_embeddings"

# Create the collection from the schema object defined in the schema cell
milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema
)

# Prepare the index specification: an HNSW index on the "embedding" field
# using the L2 metric
index_params = milvus_client.prepare_index_params()

index_params.add_index(
    field_name="embedding",
    index_type="HNSW",
    metric_type="L2",
    params={"M": 4, "efConstruction": 64}  # HNSW build parameters
)

# Create the index described above on the collection
milvus_client.create_index(
    collection_name=COLLECTION_NAME,
    index_params=index_params
)

# Check collection: print the names of the collections the client can see
print(milvus_client.list_collections())

# Describe collection: print the definition reported for the new collection
print(milvus_client.describe_collection(COLLECTION_NAME))


['rag_texts_and_embeddings']
{'collection_name': 'rag_texts_and_embeddings', 'auto_id': True, 'num_shards': 1, 'description': 'RAG Texts Collection', 'fields': [{'field_id': 100, 'name': 'id', 'description': 'Primary id', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'text', 'description': 'Original text content', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': 'Text embeddings', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'functions': [], 'aliases': [], 'collection_id': 457622759785627888, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True, 'created_timestamp': 457622772028276739}


In [8]:
# Define data source and destination paths
import os

## URL of the document to be downloaded
pdf_url = "https://www.iab.org.pl/wp-content/uploads/2024/04/Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.pdf"

## Local destination for the PDF document
file_name = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.pdf"

## Local destination for the processed JSON version of the document
file_json = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.json"

## Local destination for the embedded pages (after vectorization)
embeddings_json = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska-Embeddings.json"

## Folder to store all the above local files
data_dir = "./data"

# Create the folder if it is missing. exist_ok=True makes the cell safe to
# re-run and avoids the race between a separate exists() check and makedirs().
os.makedirs(data_dir, exist_ok=True)


In [9]:
# Import required libraries
import os
import requests

# Function to download PDF data from the URL and save it locally
def download_pdf_data(pdf_url: str, file_name: str, timeout: float = 30.0) -> None:
    """Download a document over HTTP and store it inside ``data_dir``.

    Args:
        pdf_url: Address of the document to fetch.
        file_name: Name of the target file; it is joined onto the
            module-level ``data_dir``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The context manager releases the streamed connection even when an error occurs.
    with requests.get(pdf_url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an HTTP error page is never saved as the PDF.
        response.raise_for_status()
        # Ensure the data_dir exists before saving the file
        os.makedirs(data_dir, exist_ok=True)
        # Save the PDF to the specified path in data_dir
        with open(os.path.join(data_dir, file_name), "wb") as file:
            for block in response.iter_content(chunk_size=1024):
                if block:  # skip empty chunks
                    file.write(block)

# Call the function to download the document.
# Each run of this cell fetches the file again and overwrites the local copy.
download_pdf_data(pdf_url, file_name)


In [33]:
# Extract the text of each PDF page and save it as JSON
import pdfplumber
import json

def extract_pdf_text(file_name, file_json):
    """Extract the text of every PDF page and save it as a JSON list.

    Args:
        file_name: Path of the PDF to read.
        file_json: Path of the JSON file to write.

    The output is a list of ``{"page_num": <0-based index>, "text": <str>}``
    records, one per page, in document order.
    """
    with pdfplumber.open(file_name) as pdf:
        pages = [
            # extract_text() may yield None for a page with no extractable text;
            # store "" instead so that every record holds a string.
            {"page_num": page_num, "text": page.extract_text() or ""}
            for page_num, page in enumerate(pdf.pages)
        ]

    # UTF-8 plus ensure_ascii=False keeps non-ASCII characters readable in the file.
    with open(file_json, "w", encoding="utf-8") as f:
        json.dump(pages, f, indent=4, ensure_ascii=False)

# Paths of the downloaded PDF and of the JSON file to produce.
# NOTE(review): these assignments rebind file_name and file_json, which an
# earlier cell set to bare file names. download_pdf_data joins its file_name
# argument onto data_dir, so re-running the download cell after this one would
# target a doubled "./data/./data/..." path. Consider separate names here.
file_name = "./data/Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.pdf"
file_json = "./data/Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.json"
extract_pdf_text(file_name, file_json)


In [37]:
import os
import json
import torch
import numpy as np
from sentence_transformers import SentenceTransformer

# Directory that generate_embeddings reads from and writes to.
# This repeats the value assigned to data_dir earlier in the notebook.
data_dir = "./data"

def generate_embeddings(file_json, embeddings_json, model):
    """Embed the text of every page and save the vectors as JSON.

    Args:
        file_json: Name of the page-text JSON file inside ``data_dir``; a list
            of records with a ``"text"`` entry and, optionally, ``"page_num"``.
        embeddings_json: Name of the output JSON file, written to ``data_dir``.
        model: Object whose ``encode(list_of_str)`` returns one vector per
            input text, each vector offering ``.tolist()``.

    Raises:
        FileNotFoundError: If ``file_json`` does not exist in ``data_dir``.

    The output file holds a list of
    ``{"page_num": <int>, "embedding": <list of float>}`` records.
    """
    # Both file names are resolved relative to the module-level data_dir
    file_path = os.path.join(data_dir, file_json)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_json} was not found in the directory {data_dir}")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # A page with no extractable text may be stored as null (or lack the key);
    # embed it as "" instead of handing None to the model.
    pages = [page.get("text") or "" for page in data]

    # Generate one embedding per page
    embeddings = model.encode(pages)

    # Pair each vector with the page number recorded in the input, falling back
    # to the list position when the record carries none.
    embeddings_paginated = [
        {
            "page_num": page.get("page_num", position),
            "embedding": vector.tolist(),  # plain list, so it is JSON serializable
        }
        for position, (page, vector) in enumerate(zip(data, embeddings))
    ]

    embeddings_file_path = os.path.join(data_dir, embeddings_json)

    with open(embeddings_file_path, "w", encoding="utf-8") as file:
        json.dump(embeddings_paginated, file, indent=4, ensure_ascii=False)

# Model checkpoint and the device to run it on (GPU when one is available)
model_name = "ipipan/silver-retriever-base-v1.1"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer(model_name, device=device)

# Embed the extracted pages and store the vectors in data_dir
generate_embeddings(
    file_json="Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.json",
    embeddings_json="Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska-Embeddings.json",
    model=model,
)
