In [6]:
from azure.storage.blob import BlobServiceClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
)
from azure.search.documents.models import VectorizedQuery
from openai import AzureOpenAI
import os
import numpy as np
from dotenv import load_dotenv
import json
from langchain.text_splitter import MarkdownHeaderTextSplitter
load_dotenv()  # Load variables from the .env file so the later os.getenv() lookups (DOC_INT_ENDPOINT, DOC_INT_KEY) can resolve

True

In [2]:
def get_document_intelligence_client():
    """
    Build a Document Intelligence client from environment configuration.

    The endpoint is read from DOC_INT_ENDPOINT and the API key from
    DOC_INT_KEY.

    Returns:
    DocumentAnalysisClient: Client created with an AzureKeyCredential
    wrapping the configured key.

    Raises:
    ValueError: If either environment variable is unset or empty.
    """
    endpoint = os.getenv("DOC_INT_ENDPOINT")
    key = os.getenv("DOC_INT_KEY")

    # os.getenv returns None for an unset variable, and str(None) is the
    # literal string "None" -- that would be handed to the client as if it
    # were real configuration. Fail fast with the names that are missing.
    missing = [
        name
        for name, value in (("DOC_INT_ENDPOINT", endpoint), ("DOC_INT_KEY", key))
        if not value
    ]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    return DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )


In [3]:
def extract_document_structure(local_path):
    """
    Analyze a local file with the "prebuilt-layout" Document Intelligence model.

    Parameters:
    local_path: Path of the file to open in binary mode and submit.

    Returns:
    The object returned by the analysis poller's result() call. Callers in
    this notebook read its `pages` attribute.
    """
    client = get_document_intelligence_client()

    # The stream only needs to stay open while the request is submitted.
    with open(local_path, "rb") as document_stream:
        analysis_poller = client.begin_analyze_document(
            "prebuilt-layout", document=document_stream
        )

    # Fetch the final analysis result from the poller.
    return analysis_poller.result()

In [7]:
# Analyze the sample PDF once; a later cell reads `doc.pages` from this result.
doc = extract_document_structure(local_path='docs/test_ima.pdf')

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
# from sklearn.metrics.pairwise import cosine_similarity


# Pretrained BERT tokenizer and encoder used by get_bert_embedding.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Flatten every recognized line of the analyzed document into one string,
# with a single space appended after each line's content.
extracted_text = "".join(
    line.content + " "
    for page in doc.pages
    for line in page.lines
)

def cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors.

    Parameters:
    vec1 (numpy array): First vector
    vec2 (numpy array): Second vector

    Returns:
    float: Cosine similarity between vec1 and vec2, or 0.0 when either
    vector has zero norm (the ratio is undefined in that case).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # A zero vector has no direction; dividing by a zero norm would yield
    # NaN and a runtime warning, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0

    return np.dot(vec1, vec2) / norm_product

def get_bert_embedding(text):
    """
    Embed text by averaging the model's last hidden state over dim 1.

    Parameters:
    text: Input handed to the tokenizer; tokenization is truncated to a
        maximum length of 512.

    Returns:
    numpy array: The averaged last hidden state, squeezed and converted
    with .numpy().
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )

    # Inference only: run the forward pass without gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

def semantic_chunking(
    text,
    max_chunk_size=1000,
    similarity_threshold=0.7,
    segment_size=50,
    embed_fn=None,
    similarity_fn=None,
):
    """
    Split text into chunks of related content, each at most max_chunk_size words.

    The text is cut into consecutive segments of segment_size words. Each
    segment is embedded exactly once, and a new chunk starts whenever the
    next segment is dissimilar to the segment before it or appending it
    would push the chunk past max_chunk_size.

    Parameters:
    text (str): Text to split; words are delimited by whitespace.
    max_chunk_size (int): Maximum number of words per chunk.
    similarity_threshold (float): A similarity below this value between
        adjacent segments starts a new chunk.
    segment_size (int): Number of words embedded and compared at a time.
    embed_fn (callable): Maps a string to an embedding; defaults to
        get_bert_embedding.
    similarity_fn (callable): Scores two embeddings; defaults to
        cosine_similarity.

    Returns:
    list of str: Chunks in document order, words joined by single spaces.
    An empty list is returned for empty or whitespace-only text.

    Raises:
    ValueError: If max_chunk_size or segment_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")
    if segment_size < 1:
        raise ValueError("segment_size must be at least 1")

    # Resolve the defaults lazily so callers can substitute their own
    # embedding / similarity functions.
    embed = get_bert_embedding if embed_fn is None else embed_fn
    similarity = cosine_similarity if similarity_fn is None else similarity_fn

    words = text.split()
    if not words:
        return []

    # A segment may never be larger than a whole chunk is allowed to be.
    step = min(segment_size, max_chunk_size)
    segments = [words[start:start + step] for start in range(0, len(words), step)]

    chunks = []
    current_words = list(segments[0])
    previous_embedding = embed(" ".join(segments[0]))

    for segment in segments[1:]:
        segment_embedding = embed(" ".join(segment))

        would_overflow = len(current_words) + len(segment) > max_chunk_size
        topic_shift = (
            similarity(previous_embedding, segment_embedding) < similarity_threshold
        )

        if would_overflow or topic_shift:
            chunks.append(" ".join(current_words))
            current_words = list(segment)
        else:
            current_words.extend(segment)

        # Always compare against the immediately preceding segment so that a
        # boundary reflects a local change in content.
        previous_embedding = segment_embedding

    chunks.append(" ".join(current_words))
    return chunks

def extract_and_chunk_pdf_semantically(pdf_file_path, analysis_result=None):
    """
    Extract the text of a PDF and split it into semantic chunks.

    Parameters:
    pdf_file_path: Path of the PDF to analyze with
        extract_document_structure. Only used when analysis_result is None.
    analysis_result: Optional result previously returned by
        extract_document_structure for this file; pass it to avoid
        analyzing the same document again.

    Returns:
    list of str: The chunks produced by semantic_chunking for the
    document's text.
    """
    # Analyze the file that was actually requested instead of relying on a
    # module-level string built from whatever document another cell loaded.
    if analysis_result is None:
        analysis_result = extract_document_structure(pdf_file_path)

    # Same page -> line -> content traversal the notebook uses elsewhere.
    document_text = " ".join(
        line.content
        for page in analysis_result.pages
        for line in page.lines
    )

    return semantic_chunking(document_text)

# Example usage: chunk the sample document.
pdf_path = "docs/test_ima.pdf"
chunks = extract_and_chunk_pdf_semantically(pdf_path)

# Show each chunk under a numbered heading, followed by a separator line.
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:", chunk, "---", sep="\n")

Chunk 1:
---


In [1]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from PyPDF2 import PdfReader, PdfWriter

# Set up the Azure Document Intelligence client.
# Prefer the DOC_INT_* environment variables used elsewhere in this notebook so
# real secrets never have to be pasted into a cell; the original placeholders
# remain as the fallback when the variables are not set.
endpoint = os.getenv("DOC_INT_ENDPOINT", "YOUR_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("DOC_INT_KEY", "YOUR_DOCUMENT_INTELLIGENCE_KEY")

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

# Path to your local PDF file
pdf_path = "path/to/your/local/file.pdf"

# Read the PDF
pdf_reader = PdfReader(pdf_path)

# Process each page separately
for page_num, pdf_page in enumerate(pdf_reader.pages):
    # Create a new PDF with just this page
    pdf_writer = PdfWriter()
    pdf_writer.add_page(pdf_page)

    temp_file_path = f"temp_page_{page_num + 1}.pdf"
    try:
        # Save the single page as a temporary file
        with open(temp_file_path, "wb") as temp_file:
            pdf_writer.write(temp_file)

        # Analyze the single page with Document Intelligence
        with open(temp_file_path, "rb") as temp_file:
            poller = document_analysis_client.begin_analyze_document("prebuilt-layout", document=temp_file)
            result = poller.result()
    finally:
        # Clean up the temporary file even when writing or analysis fails,
        # so an error does not leave stray PDFs in the working directory.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    # Extract and print the text from this page. Iterating over result.pages
    # (rather than indexing [0]) also copes with a result that has no pages.
    print(f"--- Page {page_num + 1} ---")
    page_text = " ".join(
        line.content for page in result.pages for line in page.lines
    )
    print(page_text)
    print("\n")

ModuleNotFoundError: No module named 'PyPDF2'