In [1]:
import os
import json
import torch
import faiss
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
import torch

In [3]:
# Pick the compute device once: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Load the sentence-embedding model and move it to the compute device.
# `device` is the torch.device selected in the device-selection cell above;
# it is reused here rather than recomputed so there is a single source of truth.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
model.to(device)
print(f"✅ Using model: {model_name} on device: {device}")

✅ Using model: sentence-transformers/all-MiniLM-L6-v2 on device: cpu


In [5]:
# Folder containing the source PDFs (a relative path, resolved against the
# notebook's working directory).
data_dir = "data"

In [6]:
# Sanity check: show what is in the data folder before parsing anything.
if not os.path.exists(data_dir):
    print(f"Directory '{data_dir}' does not exist.")
else:
    for entry_name in os.listdir(data_dir):
        print(entry_name)

guideline-170-en.pdf
Current Essentials of Medicine(1)(1).pdf
LN_Pediatrics_final.pdf
essentials-of-human-nutrition1.pdf
disease-handbook-complete.pdf
Gerontological Nursing.pdf
FirstAid-manual.pdf
Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive.com ).pdf


In [7]:
def read_pdfs(directory):
    """Extract the per-page text of every PDF found directly in ``directory``.

    Files are visited in sorted name order so that repeated runs (and runs on
    different machines) yield the same document order, and therefore the same
    chunk order downstream. The extension check is case-insensitive, so
    ``report.PDF`` is picked up as well as ``report.pdf``.

    Args:
        directory: Path of the folder to scan. Sub-folders are not traversed.

    Returns:
        A list with one dict per PDF, shaped as
        ``{"source": <file name>, "text": [<page 0 text>, <page 1 text>, ...]}``.

    Raises:
        Whatever ``os.listdir`` or ``fitz`` raise (missing directory,
        unreadable or corrupt PDF); errors are not swallowed.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        path = os.path.join(directory, filename)
        # The context manager closes the document even when text extraction
        # raises part-way through, so no file handle is leaked.
        with fitz.open(path) as doc:
            text = [doc[i].get_text() for i in range(doc.page_count)]
        documents.append({"source": filename, "text": text})
    return documents

In [8]:
# Parse every PDF in data_dir. Each entry of `documents` is a dict with the
# file name under "source" and a list of per-page text strings under "text".
documents = read_pdfs(data_dir)

In [9]:
# Splitter used to cut each page's text into chunks for embedding.
# Sizes are measured with len(), i.e. in characters: chunks of up to 1000
# with a 100 overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False  # separators are treated as literal strings, not regexes
)

In [10]:
# Parallel accumulators: all_chunks holds the chunk strings, metadata holds
# one provenance record per chunk at the same list position.
all_chunks, metadata = [], []

In [11]:
# Start from empty lists inside this cell so that re-running it rebuilds the
# chunks instead of appending a second copy of each one. Duplicates would
# break the position-for-position pairing between all_chunks and metadata.
all_chunks = []
metadata = []

for doc in documents:
    source = doc["source"]
    # page_num is the 0-based position of the page within the PDF.
    for page_num, page_text in enumerate(doc["text"]):
        chunks = text_splitter.split_text(page_text)
        # chunk_index restarts at 0 on every page.
        for chunk_index, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            metadata.append({
                "pdf_name": source,
                "pdf_page": page_num,
                "chunk_index": chunk_index
            })

# all_chunks[i] and metadata[i] must always describe the same chunk.
assert len(all_chunks) == len(metadata), "chunks and metadata are out of sync"

print(f"📊 Total chunks: {len(all_chunks)}")

📊 Total chunks: 12717


In [12]:
# Ask the model for the length of the vectors it produces; the FAISS index
# created below is sized with this value.
embedding_dim = model.get_sentence_embedding_dimension()
print(f"📐 Embedding dimension: {embedding_dim}")

📐 Embedding dimension: 384


In [13]:
# Flat FAISS index using L2 (Euclidean) distance, sized to the embedding dimension.
index = faiss.IndexFlatL2(embedding_dim)

In [14]:
# Embed the chunks in small batches and add the vectors to the FAISS index.
# Vectors are added in all_chunks order, so FAISS row id i corresponds to
# all_chunks[i] (and to metadata[i]).
batch_size = 32

# Empty the index first so that re-running this cell does not add every
# vector a second time, which would shift row ids away from the metadata.
index.reset()

for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i+batch_size]
    embeddings = model.encode(batch, convert_to_numpy=True, device=device)
    index.add(embeddings)
    # "\r" keeps the progress counter on a single, continuously updated line.
    print(f"✅ Processed {i + len(batch)} / {len(all_chunks)} chunks", end="\r")

# Guard the invariant the saved artefacts rely on: one vector per chunk.
assert index.ntotal == len(all_chunks), (
    f"index holds {index.ntotal} vectors but there are {len(all_chunks)} chunks"
)

✅ Processed 12717 / 12717 chunks

In [15]:
# Persist the populated index to "vector_index.faiss" (relative to the working directory).
faiss.write_index(index, "vector_index.faiss")

In [16]:
# Save the per-chunk provenance records as JSON alongside the index file.
# The list order is the order in which the records were appended.
with open("metadata.json", "w") as meta_file:
    json.dump(metadata, fp=meta_file)

# The leading newline moves past the "\r"-terminated progress line printed earlier.
print("\nVector database creation complete.")


Vector database creation complete.
