# Synthetic Multimodal Question Generation

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv
# Load environment variables from a .env file. The cell's displayed value is
# load_dotenv()'s return value; the recorded `False` presumably means no
# variables were loaded (e.g. no .env file was found) — TODO confirm that the
# .env file is where this notebook expects it.
load_dotenv()

False

## 1. Read & Preprocess PDF file

---

### Split the PDFs into individual pages

In [2]:
import shutil, random
import openai
from unstructured.cleaners.core import clean_bullets, clean_extra_whitespace, remove_punctuation
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader, UnstructuredAPIFileLoader
from langchain_community.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from langchain_ollama import ChatOllama
from util.common_utils import get_language_code

# Directory that holds the source documents, relative to this notebook.
raw_data_dir = "../raw_data"

# NOTE(review): not referenced in the visible cells; presumably the output
# directory for per-page splits further down — confirm.
splitted_raw_data_dir = "splitted_raw_data"
# Full source PDF. A later cell rebinds `file_path` to a trimmed excerpt.
file_path = f"{raw_data_dir}/pdf/vi-vneid.pdf"

# Subject area of the source document and the language it is written in.
DOMAIN = "VNeID Application Usage Guide in Vietnam"
LANGUAGE = "Vietnamese" # Change the language here, e.g. "Korean", "Japanese", "Chinese"
# Language code derived from LANGUAGE by the project helper ("vi" in the recorded output).
LANGUAGE_CODE = get_language_code(LANGUAGE)
print(f"Domain: {DOMAIN}, Language: {LANGUAGE}, Language Code: {LANGUAGE_CODE}")

Domain: VNeID Application Usage Guide in Vietnam, Language: Vietnamese, Language Code: vi


In [3]:
# (Optional) Only use a portion of the PDF document for testing. If the PDF has
# many pages, or only part of it needs processing, cut out the page ranges of
# interest and save each range as its own file.
import fitz

# (from_page, to_page) pairs passed straight to insert_pdf. PyMuPDF page numbers
# are 0-based and both ends of the range are copied.
split_pages = [(7, 15)]

# Context managers close both documents (and release their file handles) even
# if inserting or saving raises.
with fitz.open(file_path) as doc1:
    for idx, (first_page, last_page) in enumerate(split_pages):
        # Start from a new empty PDF and copy the selected page range into it.
        with fitz.open() as doc2:
            doc2.insert_pdf(doc1, from_page=first_page, to_page=last_page)

            # Each range is written to "<raw_data_dir>/part<idx>.pdf".
            doc2.save(f"{raw_data_dir}/part{idx}.pdf")

In [14]:
from util.common_utils import delete_folder_and_make_folder
from util.preprocess import remove_short_sentences, remove_small_images, split_pdf
from collections import defaultdict

# Point the remaining steps at the trimmed excerpt saved as part0.pdf by the
# split cell. This rebinds `file_path`, which previously referred to the full
# source PDF, so the split cell must have been run first.
file_path = f"{raw_data_dir}/part0.pdf"
file_path

'../raw_data/part0.pdf'

In [15]:
def analyze_pdf_page_content(pdf_path, text_length_thres=600):
    """Group the pages of a PDF by the kind of content they hold.

    Every page is labelled from the length of its extracted text and the
    number of images it references:

    * ``'Text'``  - more than ``text_length_thres`` characters and no images.
    * ``'Image'`` - at most ``text_length_thres`` characters and at least one
      image.
    * ``'Mixed'`` - every other combination, i.e. long text together with
      images, and also short text with no images at all.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        text_length_thres: Character count a page must exceed to be treated
            as text-heavy. Defaults to 600.

    Returns:
        dict: Maps each label that occurred to the list of 0-based page
        numbers carrying it, in document order. Labels with no pages are
        omitted, so an empty document yields ``{}``.
    """
    page_analysis = defaultdict(list)

    # The context manager closes the document (releasing its file handle)
    # even when reading a page raises.
    with fitz.open(pdf_path) as document:
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            text_length = len(page.get_text("text"))
            num_images = len(page.get_images(full=True))

            if text_length > text_length_thres and num_images == 0:
                content_type = 'Text'
            elif text_length <= text_length_thres and num_images > 0:
                content_type = 'Image'
            else:
                content_type = 'Mixed'

            page_analysis[content_type].append(page_num)

    # Return a plain dict so lookups of absent labels do not create entries.
    return dict(page_analysis)


In [16]:
# Classify the pages of the excerpt. The threshold is lowered from the
# function's default of 600 to 100 characters, so a page with no images only
# needs more than 100 characters of text to be labelled 'Text'.
analyzed_pdf_result = analyze_pdf_page_content(file_path, text_length_thres=100)
analyzed_pdf_result

{'Text': [0], 'Mixed': [1, 2, 3, 5, 6, 7], 'Image': [4, 8]}