### Import dependencies

In [None]:
import os
import faiss
import numpy as np
import pickle
from PyPDF2 import PdfReader
import fitz
import re
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity


### Download Natural Language Toolkit (tokeniser)

In [None]:
# Make sure the Punkt sentence-tokenizer data used by sent_tokenize is present.
# Older NLTK releases load the 'punkt' resource, while NLTK 3.9+ loads
# 'punkt_tab' instead, so check for both and download whichever is missing.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)

### Extract text and create chunks

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract body text from a PDF file and filter unwanted text.

    Text blocks are read page by page. A block is skipped when its first 15
    characters contain "figure" or "table" (case-insensitive), which drops
    captions. Extraction stops for the whole document at the first block in
    which "references", "conclusion" or "acknowledgments" begins within the
    first 10 characters; that block and everything after it are discarded.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The kept blocks, each followed by a newline. An empty string is
        returned (and the error printed) if the file cannot be opened or
        parsed.
    """
    caption_markers = ('figure', 'table')
    stop_markers = ('references', 'conclusion', 'acknowledgments')
    paragraphs = []
    try:
        # The context manager closes the document even if parsing fails,
        # so file handles are not leaked across a large batch of PDFs.
        with fitz.open(pdf_path) as doc:
            reached_end_matter = False
            for page in doc:
                for block in page.get_text("blocks"):
                    # Item 6 is the block type: 0 indicates a text block
                    if block[6] != 0:
                        continue

                    # Item 4 is the block text; line breaks inside the
                    # block are removed so it becomes a single paragraph.
                    # NOTE(review): joining without a space may glue words
                    # together across line breaks - confirm this is intended.
                    para = block[4].replace('\n', '')

                    # Skip figure/table captions
                    lead = para[:15].lower()
                    if any(marker in lead for marker in caption_markers):
                        continue

                    # Stop at the first end-matter heading
                    lowered = para.lower()
                    if any(0 <= lowered.find(marker) < 10 for marker in stop_markers):
                        reached_end_matter = True
                        break

                    paragraphs.append(para)
                if reached_end_matter:
                    break
    except Exception as e:
        # Best effort: a broken PDF must not abort the whole batch
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

    return "".join(f"{para}\n" for para in paragraphs)

def semantic_chunking(text, model, max_chunk_size=1000, min_chunk_size=200, similarity_threshold=0.7):
    """
    Chunk text using semantic similarity between sentences to form coherent paragraphs.

    Sentences are visited in order. A sentence is appended to the current
    chunk unless doing so would push the joined chunk past max_chunk_size, or
    its cosine similarity to the chunk's mean embedding is below
    similarity_threshold while the chunk already holds at least
    min_chunk_size characters. Otherwise the current chunk is closed and the
    sentence starts a new one.

    Args:
        text: Text to chunk
        model: SentenceTransformer model for encoding sentences
        max_chunk_size: Maximum size of a chunk in characters, counting the
            single spaces used to join sentences. A single sentence longer
            than this limit is kept whole as its own chunk.
        min_chunk_size: Minimum size of a chunk in characters before a low
            similarity is allowed to split it
        similarity_threshold: Threshold for semantic similarity to group sentences

    Returns:
        List of semantically coherent text chunks
    """
    separator = " "

    # Split into sentences
    sentences = sent_tokenize(text)
    if len(sentences) <= 1:
        return [text] if text.strip() else []

    # Get embeddings for each sentence
    sentence_embeddings = model.encode(sentences)

    # Normalize embeddings for cosine similarity. A zero-length embedding is
    # left as zeros instead of becoming NaN, which would make every later
    # similarity comparison evaluate to False.
    norms = np.linalg.norm(sentence_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    sentence_embeddings = sentence_embeddings / norms

    chunks = []
    current_chunk = [sentences[0]]
    current_chunk_size = len(sentences[0])
    current_avg_embedding = sentence_embeddings[0].reshape(1, -1)

    for sentence, embedding in zip(sentences[1:], sentence_embeddings[1:]):
        sentence_embedding = embedding.reshape(1, -1)

        # Length the joined chunk would have with this sentence appended;
        # the separator is counted so the limit applies to the final string.
        grown_size = current_chunk_size + len(separator) + len(sentence)

        if grown_size > max_chunk_size:
            # Size limit forces a split; similarity is irrelevant here
            start_new_chunk = True
        else:
            similarity = cosine_similarity(current_avg_embedding, sentence_embedding)[0][0]
            # Split on a topic change only once the chunk is big enough
            start_new_chunk = (
                similarity < similarity_threshold
                and current_chunk_size >= min_chunk_size
            )

        if start_new_chunk:
            # Complete current chunk and start a new one
            chunks.append(separator.join(current_chunk))
            current_chunk = [sentence]
            current_chunk_size = len(sentence)
            current_avg_embedding = sentence_embedding
        else:
            # Add sentence to current chunk
            current_chunk.append(sentence)
            current_chunk_size = grown_size

            # Update the running mean of the chunk's sentence embeddings
            n = len(current_chunk)
            current_avg_embedding = ((n - 1) * current_avg_embedding + sentence_embedding) / n

    # Add the final chunk (current_chunk always holds at least one sentence)
    chunks.append(separator.join(current_chunk))

    return chunks

def process_pdfs_semantic(data_dir, model, max_chunk_size=1000, min_chunk_size=200, similarity_threshold=0.7):
    """
    Build semantic chunks, with per-chunk metadata, for every PDF in a directory.

    Files whose name ends in ".pdf" (case-insensitive) are read with
    extract_text_from_pdf and split with semantic_chunking. PDFs that yield
    no text, and chunks that are empty or whitespace-only, are left out.

    Args:
        data_dir: Directory containing PDF files
        model: SentenceTransformer model
        max_chunk_size: Maximum size of a chunk in characters
        min_chunk_size: Minimum size of a chunk in characters
        similarity_threshold: Threshold for semantic similarity to group sentences

    Returns:
        chunks: List of text chunks
        chunk_metadata: List of dictionaries with metadata for each chunk,
            in the same order as chunks
    """
    pdf_files = []
    for entry in os.listdir(data_dir):
        if entry.lower().endswith('.pdf'):
            pdf_files.append(entry)

    print(f"Found {len(pdf_files)} PDF files in {data_dir}")

    # The same chunking settings apply to every document
    chunking_options = {
        "max_chunk_size": max_chunk_size,
        "min_chunk_size": min_chunk_size,
        "similarity_threshold": similarity_threshold,
    }

    chunks, chunk_metadata = [], []
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(data_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)

        # Nothing usable was extracted from this file
        if not text.strip():
            continue

        document_chunks = semantic_chunking(text, model, **chunking_options)
        total = len(document_chunks)

        for position, piece in enumerate(document_chunks):
            if not piece.strip():
                continue
            chunks.append(piece)
            # chunk_index is the position within this document's chunk list
            chunk_metadata.append({
                "file_path": pdf_path,
                "file_name": pdf_file,
                "chunk_index": position,
                "total_chunks": total,
                "chunk_length": len(piece)
            })

    print(f"Created {len(chunks)} semantic chunks from {len(pdf_files)} PDF files")
    return chunks, chunk_metadata

# Input and output locations
data_dir = "data"  # folder that is scanned for PDF files
output_dir = "vector_db"  # folder that receives the vector database
os.makedirs(output_dir, exist_ok=True)  # no-op when the folder already exists

# Chunking parameters, passed unchanged to process_pdfs_semantic
max_chunk_size = 5000  # upper bound on characters per chunk
min_chunk_size = 800   # chunk size required before a low similarity may split it
similarity_threshold = 0.7  # similarity below this value counts as a topic change

# A single model instance serves both the chunking step and the later embedding step
model_name = "all-MiniLM-L6-v2"
print(f"Loading SentenceTransformer model: {model_name}")
model = SentenceTransformer(model_name)

# Turn every PDF in data_dir into semantic chunks plus per-chunk metadata
chunks, chunk_metadata = process_pdfs_semantic(
    data_dir,
    model,
    max_chunk_size=max_chunk_size,
    min_chunk_size=min_chunk_size,
    similarity_threshold=similarity_threshold,
)

# Stop here when nothing was extracted; there would be nothing to index
if not chunks:
    raise ValueError('No valid text chunks found.')



### Create FAISS vector database

In [None]:

def create_faiss_index(chunks, model):
    """Embed the text chunks and load them into a flat inner-product FAISS index.

    Args:
        chunks: List of text chunks to embed
        model: Model whose encode() method produces the chunk embeddings

    Returns:
        A faiss.IndexFlatIP holding one L2-normalized vector per chunk, in
        the same order as chunks.
    """
    print("Creating embeddings for chunks...")
    vectors = model.encode(chunks, show_progress_bar=True)

    # normalize_L2 works in place; on unit-length vectors the inner product
    # used by IndexFlatIP equals cosine similarity
    faiss.normalize_L2(vectors)

    flat_index = faiss.IndexFlatIP(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

# Report how many chunks are about to be embedded and indexed
print(f"Creating vector database from {len(chunks)} semantic chunks...")

# Create FAISS index: embeds every chunk with the already-loaded model and
# adds the vectors to a new index
index = create_faiss_index(chunks, model)


### Save Vector Database and metadata

In [None]:

# Save the FAISS index to disk
index_path = os.path.join(output_dir, "faiss_index.bin")
faiss.write_index(index, index_path)

embedding_dimension = model.get_sentence_embedding_dimension()

# Save metadata next to the index. Everything needed to answer a query later
# is stored here: the chunk texts, their per-chunk metadata, and the settings
# that produced them.
metadata = {
    "chunk_metadata": chunk_metadata,
    # Name of the embedding model, so query code can load the same model.
    # (Previously this key wrongly held the embedding dimension.)
    "model_name": model_name,
    "embedding_dimension": embedding_dimension,
    "max_chunk_size": max_chunk_size,
    "min_chunk_size": min_chunk_size,
    "similarity_threshold": similarity_threshold,
    "chunks": chunks  # Store the actual text chunks for retrieval
}

# NOTE: pickle files must only be loaded from a trusted location, because
# unpickling can execute arbitrary code.
with open(os.path.join(output_dir, "metadata.pkl"), "wb") as f:
    pickle.dump(metadata, f)

print(f"Vector database created and saved to {output_dir}")
print(f"Number of semantic chunks indexed: {len(chunks)}")
print(f"Vector dimension: {embedding_dimension}")
print(f"Chunk parameters: max_size={max_chunk_size}, min_size={min_chunk_size}, similarity_threshold={similarity_threshold}")
