# Installation


In [None]:
# Install the packages listed in requirements.txt; %pip targets the running kernel's environment.
%pip install -r requirements.txt

In [10]:
# Install Poppler (the command-line tools pdf2image calls) from conda-forge.
# %conda, unlike a "!conda" shell escape, always installs into the environment
# of the running kernel; -y skips the confirmation prompt.
%conda install -c conda-forge poppler -y


Retrieving notices: ...working... done
Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\a1hmm\miniconda3\envs\torchENV

  added / updated specs:
    - poppler


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2026.1.4   |       h4c7d964_0         144 KB  conda-forge
    cairo-1.18.4               |       h477c42c_1         1.5 MB  conda-forge
    font-ttf-dejavu-sans-mono-2.37|       hab24e00_0         388 KB  conda-forge
    font-ttf-inconsolata-3.000 |       h77eed37_0          94 KB  conda-forge
    font-ttf-source-code-pro-2.038|       h77eed37_0         684 KB  conda-forge
    font-ttf-ubuntu-0.83       |       h77eed37_3         1.5 MB  conda-forge
    fontconfig-2.15.0          |       h765892d_1         

In [11]:
# Print the installed pdfinfo version to confirm the Poppler command-line tools are reachable.
!pdfinfo -v


pdfinfo version 26.02.0
Copyright 2005-2026 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011, 2022 Glyph & Cog, LLC


# Imports

In [1]:
import os
import random
import sys
from glob import glob
from os.path import join

from pdf2image import convert_from_path
from PIL import Image , ImageEnhance
from tqdm import tqdm


# Paths

In [21]:
# Root folder for input PDFs and generated images, resolved against the
# directory the notebook is launched from.
data_dir = join(os.getcwd(), "Data")

# PDF to images

In [22]:
def preprocess_image(image, max_width=600):
    """Prepare a page image: grayscale, optional downscale, contrast boost.

    Args:
        image: PIL image to process. It is converted to a new grayscale
            image rather than edited in place.
        max_width: Maximum output width in pixels (must be >= 1). Wider
            images are scaled down to exactly this width with the aspect
            ratio preserved; images that already fit keep their size.

    Returns:
        The grayscale (mode "L") image with its contrast enhanced by 1.5x.

    Raises:
        ValueError: If ``max_width`` is smaller than 1.
    """
    if max_width < 1:
        raise ValueError(f"max_width must be >= 1, got {max_width}")

    gray_image = image.convert('L')

    if gray_image.width > max_width:
        scale = max_width / gray_image.width
        # Clamp to 1 px: for very wide, short images the truncated height
        # would otherwise be 0, which resize() rejects.
        new_height = max(1, int(gray_image.height * scale))
        gray_image = gray_image.resize((max_width, new_height), Image.LANCZOS)

    return ImageEnhance.Contrast(gray_image).enhance(1.5)


def _find_poppler_bin():
    """Return ``<sys.prefix>/Library/bin`` if it holds pdfinfo.exe, else None.

    That folder is where a conda environment on Windows places the Poppler
    binaries. ``None`` tells pdf2image to look the tools up on PATH instead.
    """
    conda_bin = join(sys.prefix, "Library", "bin")
    if os.path.isfile(join(conda_bin, "pdfinfo.exe")):
        return conda_bin
    return None


def convert_pdf_to_images(pdf_path, output_dir, max_width=600, dpi=200, poppler_path=None):
    """Render every page of a PDF to a preprocessed JPEG file.

    Pages are written to ``<output_dir>/<pdf name without extension>/`` as
    ``page_001.jpg``, ``page_002.jpg``, ... (numbering starts at 1).

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory under which the per-PDF folder is created.
        max_width: Maximum page width in pixels, forwarded to
            ``preprocess_image``.
        dpi: Rendering resolution passed to ``convert_from_path``.
        poppler_path: Directory containing the Poppler binaries. When
            ``None``, the running interpreter's ``Library/bin`` folder is
            used if it contains pdfinfo.exe; otherwise Poppler is resolved
            from PATH.

    Returns:
        List of the saved image paths, in page order.
    """
    # splitext keeps dots inside the name ("law.v2.pdf" -> "law.v2"), so two
    # PDFs that differ only after the first dot do not share a folder.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    page_dir = join(output_dir, pdf_name)
    os.makedirs(page_dir, exist_ok=True)
    print(f"Processing {pdf_name}...")

    if poppler_path is None:
        poppler_path = _find_poppler_bin()
    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

    saved_paths = []
    for page_number, page in enumerate(pages, start=1):
        processed = preprocess_image(page, max_width)
        image_path = join(page_dir, f"page_{page_number:03d}.jpg")
        processed.save(image_path, format='JPEG', quality=85)
        print(f"Saved {image_path}")
        saved_paths.append(image_path)

    return saved_paths

In [23]:
# Convert the sample PDF; the page images are written to a per-PDF folder
# inside output_dir and their paths are kept in generated_images.
output_dir = join(data_dir, "results")
os.makedirs(output_dir, exist_ok=True)

sample_pdf = join(data_dir, "law_1.pdf")
generated_images = convert_pdf_to_images(sample_pdf, output_dir)

Processing law_1...
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_001.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_002.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_003.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_004.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_005.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_006.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_007.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_008.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_009.jpg
Saved c:\Users\a1hmm\Desktop\Finetune-VLMs-for-Complex-Arabic-OCR\Data\results\law_1\page_010.jpg
