In [None]:
# Install the Python packages used by the cells below.
# NOTE(review): datasets, peft and accelerate appear on both of the next two
# lines; the second line repeats them with -U (upgrade) — confirm whether the
# first line is still needed for anything other than tensorboard.
%pip install -q accelerate datasets peft tensorboard
%pip install  -U -q transformers trl datasets peft accelerate
# --no-build-isolation makes pip build flash-attn against the packages already
# installed in this environment instead of a fresh isolated build environment.
%pip install -q flash-attn --no-build-isolation
# Pillow provides the PIL.Image module imported by the PDF-to-image cell.
%pip install pillow
# NOTE(review): scikit-learn is not imported in the visible cells — confirm it is still required.
%pip install scikit-learn

In [None]:
# NOTE(review): this cell runs after the flash-attn install in the previous cell;
# presumably torch is already present by then as a dependency of the packages
# installed above — confirm, since flash-attn is built without build isolation.
%pip install torch

In [None]:
import torch
import tqdm

from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration

# --- Configuration ---------------------------------------------------------
model_id = "HuggingFaceTB/SmolVLM-500M-Instruct" # Load the model here
adapter_save_path = ""   # Load the adapter here from the trained model

# Fail fast on the unfilled placeholder: otherwise the base model is loaded
# first and the adapter load then fails with a much less obvious error.
if not adapter_save_path:
    raise ValueError(
        "adapter_save_path is empty: set it to the location of the trained "
        "adapter before running this cell."
    )

# --- Load processor, base model and adapter --------------------------------
# Produces three notebook-level names used by later cells:
#   processor     - builds chat prompts and model inputs
#   base_model    - the bfloat16 SmolVLM checkpoint
#   trained_model - base_model wrapped with the fine-tuned PEFT adapter
try:
    processor = AutoProcessor.from_pretrained(model_id)

    # Load the base model (disable flash_attn if ROCm has issues).
    # `attn_implementation` is the documented from_pretrained argument; None
    # lets the library choose its default attention backend when no CUDA
    # device is available.
    base_model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,  # comment this if ROCm
        device_map="auto"
    )

    # Attach the fine-tuned adapter weights on top of the base model.
    trained_model = PeftModel.from_pretrained(
        base_model,
        adapter_save_path,
        device_map="auto"
    )

    print("Model & adapter loaded successfully!")

except Exception as e:
    # Surface a short message in the cell output, then re-raise so the
    # notebook stops here instead of continuing with a missing model.
    print(f"Error loading model: {e}")
    raise

In [None]:
# PyMuPDF supplies the `fitz` module that the next cell imports to read the PDF.
%pip install pymupdf

In [None]:
import os
import fitz
from PIL import Image

pdf_path = "/home/test.pdf"  # Replace with your PDF path
output_dir = "/home/output_images/"  # Directory to save images
output_text_dir = "/home/Folder/"  # Directory to save markdown files

# Create output directories if they don't exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_text_dir, exist_ok=True)


# Convert PDF pages to images: one PNG per page, named page_<N>.png with N
# starting at 1. `image_paths` keeps the files in page order for later cells.
image_paths = []
# Open the document in a `with` block so the PDF file handle is released as
# soon as rendering is finished (the original never closed it).
with fitz.open(pdf_path) as doc:
    for i, page in enumerate(doc):
        # get_pixmap() is called with its defaults; the "RGB" mode below
        # relies on the pixmap holding 3 bytes per pixel with no alpha channel.
        pix = page.get_pixmap()
        img_path = os.path.join(output_dir, f"page_{i+1}.png")
        image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        image.save(img_path)
        image_paths.append(img_path)

# Send inputs to the device the model was actually placed on (device_map="auto"
# decides that at load time) instead of assuming a CUDA device exists.
DEVICE = next(trained_model.parameters()).device

# The conversation is the same for every page, so the prompt is built once.
# NOTE(review): the system turn passes a plain string while the user turn passes
# a list of typed parts — confirm the chat template renders string content as
# intended (left unchanged here so the prompt matches what was used before).
messages = [
    {"role": "system", "content": "Extract all the text from the image"
},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Extract all the text from the image"}
        ]
    },
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

# Process each page image with the adapter-augmented model and write one
# markdown file per page (page_<N>.md) into output_text_dir.
for page_num, img_path in enumerate(image_paths, start=1):
    # convert() returns an in-memory copy, so the file can be closed right away.
    with Image.open(img_path) as page_file:
        image = page_file.convert("RGB")

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = trained_model.generate(**inputs, max_new_tokens=2000)

    # generate() returns the prompt tokens followed by the new tokens. Decode
    # only the new ones so the prompt never leaks into the saved text. The
    # previous approach searched for a marker sentence that is not part of the
    # prompt above, so it never stripped anything.
    generated_ids = output_ids[:, prompt_length:]
    extracted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    extracted_text_only = extracted_text.strip()

    md_file_path = os.path.join(output_text_dir, f"page_{page_num}.md")
    # Write as UTF-8 explicitly: the PDF-assembly cell reads these files back
    # with encoding='utf-8', and the platform default may differ.
    with open(md_file_path, "w", encoding="utf-8") as md_file:
        md_file.write(f"# Page {page_num}\n\n")
        md_file.write(extracted_text_only)

    print(f"Saved extracted text from page {page_num} to {md_file_path}")

print(f"Text extraction completed. Markdown files saved in {output_text_dir}")

In [None]:
# pypandoc is the package the next cell imports for the Markdown -> PDF conversion.
# NOTE(review): that cell selects wkhtmltopdf as the PDF engine; presumably the
# pandoc and wkhtmltopdf executables have to be installed separately from these
# pip packages — confirm for this environment.
# NOTE(review): markdown and pdfkit are not imported in the visible cells — confirm they are still needed.
%pip install markdown pdfkit
%pip install pypandoc

In [None]:
import os
import re
import pypandoc

# Replace with your folder
md_folder = r'/home/Folder/'
output_pdf = '/home/New.pdf'

# Collect the per-page markdown files (page_<N>.md) in numeric page order.
# The pattern is matched against the bare file name, so digits elsewhere in the
# folder path cannot affect the ordering, and unrelated .md files in the folder
# are skipped instead of crashing the sort.
page_file_pattern = re.compile(r'page_(\d+)\.md')
numbered_pages = []
for file_name in os.listdir(md_folder):
    page_match = page_file_pattern.fullmatch(file_name)
    if page_match:
        numbered_pages.append((int(page_match.group(1)), os.path.join(md_folder, file_name)))

md_files = [file_path for _, file_path in sorted(numbered_pages)]

if not md_files:
    raise FileNotFoundError(f"No page_<N>.md files found in {md_folder}")

# Page break between files. The conversion below goes through HTML
# (wkhtmltopdf engine), where a raw LaTeX \newpage has no effect, so an
# HTML/CSS page break is used instead. Joining (rather than appending after
# every file) avoids a trailing break after the last page.
page_break = '\n\n<div style="page-break-after: always;"></div>\n\n'

# Combine all markdown files into one string
page_texts = []
for file_path in md_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        page_texts.append(f.read())
combined_md = page_break.join(page_texts)

# Convert to PDF using Pandoc
pypandoc.convert_text(
    combined_md,
    'pdf',
    format='md',
    outputfile=output_pdf,
    extra_args=['--pdf-engine=wkhtmltopdf']
)

print(" PDF created successfully:", output_pdf)