In [38]:
import os
import fitz

import re
import json
from PIL import Image
from io import BytesIO

def extract_and_chunk_with_context(pdf_path, output_folder, target_size=(224, 224)):
    """Split a PDF into heading-delimited text chunks and resized images.

    Text is accumulated line by line. A line that matches the all-caps
    heading pattern closes the chunk being built and becomes the context of
    the next one. Chunks are not closed at page boundaries, so a single chunk
    may span several pages. Every embedded JPEG/PNG image is converted to
    RGB, resized to ``target_size`` and saved into ``output_folder``.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Directory for the resized images (created if missing).
        target_size: ``(width, height)`` every saved image is resized to.

    Returns:
        A list of dicts, each with the keys ``type`` (``"text"`` or
        ``"image"``), ``content`` (chunk text, or the saved image path),
        ``page_no`` (1-based; for text, the page on which the chunk started)
        and ``context`` (heading label; ``""`` for text that precedes the
        first heading).
    """
    os.makedirs(output_folder, exist_ok=True)
    heading_pattern = re.compile(r"^[A-Z][A-Z0-9\s\-:]+$")
    supported_formats = ("jpeg", "jpg", "png")
    extracted_data = []
    current_heading = None
    # Text chunks carry page_no as well, so consumers can read the same keys
    # from text and image records.
    current_chunk = {"type": "text", "content": "", "page_no": 1, "context": ""}

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]
            page_no = page_num + 1

            for line in page.get_text("text").split("\n"):
                stripped = line.strip()
                if heading_pattern.match(stripped):
                    # Close the running chunk only if it holds real text;
                    # otherwise the empty chunk is reused for the new heading.
                    if current_chunk["content"].strip():
                        extracted_data.append(current_chunk)
                        current_chunk = {"type": "text", "content": "", "page_no": page_no, "context": ""}
                    current_heading = stripped
                    current_chunk["context"] = f"{current_heading} (Page {page_no})"
                    current_chunk["page_no"] = page_no
                else:
                    # A heading-less preamble chunk starts on the page where
                    # its first real text appears, not necessarily page 1.
                    if not current_chunk["context"] and not current_chunk["content"].strip():
                        current_chunk["page_no"] = page_no
                    current_chunk["content"] += line + " "

            for img_idx, img in enumerate(page.get_images(full=True)):
                try:
                    # Extraction is inside the try so one unreadable xref
                    # skips that image instead of aborting the whole PDF.
                    base_image = doc.extract_image(img[0])
                    image_bytes = base_image.get("image")
                    image_ext = base_image.get("ext", "unknown").lower()

                    if not image_bytes or image_ext not in supported_formats:
                        print(f"Skipping unsupported or invalid image on Page {page_num + 1}, Image {img_idx + 1} (Format: {image_ext})")
                        continue

                    image = Image.open(BytesIO(image_bytes)).convert("RGB")
                    image_resized = image.resize(target_size, Image.Resampling.LANCZOS)

                    image_filename = f"page_{page_num + 1}_img_{img_idx + 1}.{image_ext}"
                    image_path = os.path.join(output_folder, image_filename)
                    image_resized.save(image_path)

                    image_context = f"{current_heading or 'Page'} {page_num + 1}: Image {img_idx + 1}"
                    extracted_data.append({
                        "type": "image",
                        "content": image_path,
                        "page_no": page_no,
                        "context": image_context
                    })
                except Exception as e:
                    print(f"Error processing image on Page {page_num + 1}, Image {img_idx + 1}: {e}")
                    continue
    finally:
        # Release the PDF file handle even if a page fails to parse.
        doc.close()

    # Flush whatever text followed the last heading.
    if current_chunk["content"].strip():
        extracted_data.append(current_chunk)

    return extracted_data



# Inputs for this run: the source PDF, the folder that receives the resized
# images and the chunk manifest, and the size every image is resized to.
pdf_path = r"D:\Projects\test-generator-llm-rag\jee-content\pyq\dokumen.pub_43-years-jee-advanced-1978-2020-jee-main-chapterwise-amp-topicwise-solved-papers-chemistry-16nbsped-8194767733-9788194767732.pdf"
output_folder = r"D:/Projects/test-generator-llm-rag/output_folder/PYQ16"
target_size = (224, 224)

chunked_data = extract_and_chunk_with_context(pdf_path, output_folder, target_size)

# Persist the chunk list next to the extracted images.
chunks_json_path = os.path.join(output_folder, "chunked_data.json")
with open(chunks_json_path, mode="w", encoding="utf-8") as chunks_file:
    chunks_file.write(json.dumps(chunked_data, indent=4))

print("Extraction, preprocessing, and context-aware chunking complete!")


Extraction, preprocessing, and context-aware chunking complete!


In [70]:
from sentence_transformers import SentenceTransformer
import json
import os
# Folder that holds chunked_data.json (read below) and receives embedded_data.json.
output_folder = r"D:\Projects\test-generator-llm-rag\output_folder\PYQ16"
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed the text chunks with a sentence-transformers model.

    Non-text chunks (e.g. images) are ignored. All texts are passed to the
    model in a single ``encode`` call so the library can batch them, rather
    than paying the per-call overhead once per chunk.

    Args:
        chunks: Iterable of chunk dicts with at least the keys ``type``,
            ``content`` and ``context``.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        One dict per text chunk, in input order, with the keys ``embedding``
        (list of floats), ``content`` and ``context``.
    """
    text_chunks = [chunk for chunk in chunks if chunk["type"] == "text"]
    if not text_chunks:
        # Nothing to embed: skip loading the model altogether.
        return []

    model = SentenceTransformer(model_name)
    # A list input yields one vector per text, in the same order.
    vectors = model.encode([chunk["content"] for chunk in text_chunks])

    return [
        {
            "embedding": vector.tolist(),
            "content": chunk["content"],
            "context": chunk["context"]
        }
        for chunk, vector in zip(text_chunks, vectors)
    ]

# Load the chunk manifest written by the extraction step.
chunks_json_path = os.path.join(output_folder, "chunked_data.json")
with open(chunks_json_path, mode="r", encoding="utf-8") as chunks_file:
    chunked_data = json.loads(chunks_file.read())

embedded_data = embed_chunks(chunked_data)

# Store the embeddings alongside the manifest they were derived from.
embedded_json_path = os.path.join(output_folder, "embedded_data.json")
with open(embedded_json_path, mode="w", encoding="utf-8") as embedded_file:
    embedded_file.write(json.dumps(embedded_data, indent=4))

print("Embedding complete!")


Embedding complete!


In [None]:
import os

import pinecone
from pinecone import Pinecone, ServerlessSpec

# This cell previously used `index_name` without defining it and called the
# legacy module-level pinecone.list_indexes / pinecone.Index helpers, while the
# CLIP cell further down uses the `Pinecone` client class. It now follows the
# same client style so the cell runs on a fresh kernel.
# NOTE(review): the index name was never defined in this cell; confirm the
# intended name. It must not be the 512-dimension CLIP index, because the
# vectors uploaded from here are created for a 384-dimension index.
index_name = "test-generator-text"

# The API key is read from the environment so it never lands in the notebook.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

def upload_to_pinecone(embedded_data, batch_size=100):
    """Upsert embedded text chunks into the module-level Pinecone ``index``.

    Vectors are sent in batches instead of one request holding the whole
    document, because a single oversized upsert is rejected by the service.
    Nothing is sent when ``embedded_data`` is empty.

    Args:
        embedded_data: Sequence of dicts with the keys ``embedding``,
            ``content`` and ``context``.
        batch_size: Maximum number of vectors per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # NOTE(review): the full chunk text is stored as metadata; very long
    # chunks may exceed the per-record metadata limit — confirm.
    vectors = [
        {
            "id": f"chunk-{i}",
            "values": item["embedding"],
            "metadata": {
                "content": item["content"],
                "context": item["context"]
            }
        }
        for i, item in enumerate(embedded_data)
    ]

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

# Upload the vectors built by the embedding cell; `embedded_data` must already
# exist in the kernel when this cell runs.
upload_to_pinecone(embedded_data)

print("Data successfully uploaded to Pinecone!")


In [None]:
import os
import json
from pinecone import Pinecone, ServerlessSpec
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import torch

# Report the PyTorch / CUDA setup the kernel is actually using.
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU detected:", torch.cuda.get_device_name(torch.cuda.current_device()))


# SECURITY: the Pinecone API key used to be hard-coded on this line. It is now
# read from the environment; the key that was committed here should be treated
# as leaked and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError("Set the PINECONE_API_KEY environment variable before running this cell.")
environment = "us-east-1"
pc = Pinecone(api_key=api_key)


# Create the serverless index on first use, then open a handle to it.
index_name = "test-generator-1"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Presumably the embedding width of the CLIP checkpoint loaded below;
        # it must equal the length of the vectors upserted later — confirm.
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=environment,
        )
    )
index = pc.Index(index_name)

# Model and processor are loaded from the same checkpoint.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def generate_and_store_embeddings(json_path, index):
    """Embed every chunk listed in ``json_path`` with CLIP and upsert it.

    Text chunks go through the CLIP text encoder, image chunks through the
    image encoder; each resulting vector is upserted into ``index`` on its
    own. A failure on one item is printed and the remaining items are still
    processed.

    Vector ids embed the name of the folder containing ``json_path`` and, for
    text, the chunk's position in the file. Previously every text vector used
    the constant id ``"text"`` (so each upsert overwrote the last one) and
    image ids from different PDF folders could collide.

    Args:
        json_path: Path of a ``chunked_data.json`` file holding a list of
            dicts with the keys ``type``, ``content`` and ``context``, and
            optionally ``page_no``.
        index: Index handle exposing ``upsert``.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Name of the folder holding the manifest; keeps ids unique per PDF.
    source = os.path.basename(os.path.dirname(os.path.abspath(json_path)))

    for item_no, item in enumerate(data):
        content_type = item.get("type")

        try:
            context = item["context"]
            # Text chunks written by older extraction runs have no page_no.
            page_no = item.get("page_no")

            metadata = {"type": content_type, "context": context}
            if page_no is not None:
                # Only stored when known, so no null value ends up in metadata.
                metadata["page_no"] = page_no

            if content_type == "text":
                text_content = item["content"]
                inputs = clip_processor(text=[text_content], return_tensors="pt", truncation=True)
                text_embedding = clip_model.get_text_features(**inputs).detach().numpy().flatten()

                index.upsert([
                    (f"text-{source}-{item_no}", text_embedding.tolist(), metadata)
                ])

            elif content_type == "image":
                image_path = item["content"]
                # The context manager closes the file once the pixels are copied.
                with Image.open(image_path) as image_file:
                    image = image_file.convert("RGB")
                inputs = clip_processor(images=image, return_tensors="pt")
                image_embedding = clip_model.get_image_features(**inputs).detach().numpy().flatten()

                index.upsert([
                    (f"image-{source}-{page_no}-{os.path.basename(image_path)}", image_embedding.tolist(), metadata)
                ])
        except Exception as e:
            print(f"Error processing {content_type}: {e}")

# Root folder; each entry directly inside it is checked for a chunked_data.json.
output_folder = r"D:\Projects\test-generator-llm-rag\output_folder"

def process_all_folders(output_folder, index):
    """Embed every ``chunked_data.json`` found one level below ``output_folder``.

    Each entry of ``output_folder`` is treated as a candidate folder; entries
    without a ``chunked_data.json`` inside them are skipped. Every manifest
    found is handed to ``generate_and_store_embeddings`` together with
    ``index``.
    """
    for entry in os.listdir(output_folder):
        manifest = os.path.join(output_folder, entry, "chunked_data.json")
        if not os.path.exists(manifest):
            continue
        print(f"Processing and storing embeddings for: {manifest}")
        generate_and_store_embeddings(manifest, index)

# Embed and upsert every chunked_data.json found under output_folder into the
# index opened earlier in this cell.
process_all_folders(output_folder, index)

print("All embeddings stored in Pinecone!")


AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)