<a href="https://colab.research.google.com/github/akash1629/Automate-Document-Summarization-with-Multi-modal-Data/blob/main/Multimodal_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================
# 1) ENVIRONMENT SETUP (INSTALLATIONS)
# =====================================
!pip install torch torchvision torchaudio
!pip install transformers sentencepiece
!pip install sentence-transformers
!pip install PyMuPDF
!pip install opencv-python
!pip install scikit-learn
!pip install evaluate  # for ROUGE, BLEU, etc. (Hugging Face's "evaluate" library)
!pip install rouge_score
!pip install fpdf2

# =====================================
# 2) IMPORTS
# =====================================
import os
import re
import fitz  # PyMuPDF
import cv2
import numpy as np
import torch
import evaluate
from PIL import Image, ImageDraw, ImageFont
from transformers import CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from fpdf import FPDF

# (Optional) For file upload in Colab:
try:
    from google.colab import files
except ImportError:
    files = None  # Not running in Colab

# =====================================
# 3) GLOBAL SETUP (DEVICE, MODELS, ETC.)
# =====================================
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# The models below are moved to this device, and the helper functions read
# `device` as a global when preparing their inputs.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ---------- CLIP Model (Image Embeddings) ----------
# Model and processor are loaded from the same checkpoint name; only the
# model is moved to `device`.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# ---------- Sentence Transformer (Text Embeddings) ----------
# Used by get_text_embedding() to embed the cleaned document text.
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# ---------- T5 Summarization Model ----------
# Tokenizer and model are both loaded from the "t5-small" checkpoint.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# ---------- Evaluation Metrics (ROUGE, BLEU) ----------
# Loaded once here and reused by evaluate_summary().
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")


# =====================================
# 4) UTILITY FUNCTIONS
# =====================================
def extract_text_and_images_from_pdf(pdf_path, output_image_dir="images_output"):
    """
    Extract the text and the embedded images from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        output_image_dir: Directory the extracted images are written to.
            It is created when it does not exist yet.

    Returns:
        A ``(combined_text, image_paths)`` tuple: the per-page texts joined
        with a single space, and the paths of the image files written.
        Only images whose extracted format is jpg, jpeg or png are saved;
        other formats are skipped.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_image_dir, exist_ok=True)

    full_text = []
    image_paths = []

    # The context manager closes the document even when a page or an image
    # fails to extract, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):
            full_text.append(page.get_text())

            for image_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first item of each entry is the image xref
                base_image = doc.extract_image(xref)
                ext = base_image["ext"].lower()
                if ext not in ("jpg", "jpeg", "png"):
                    continue

                img_filename = f"page_{page_index}_img_{image_index}.{ext}"
                img_path = os.path.join(output_image_dir, img_filename)
                with open(img_path, "wb") as f:
                    f.write(base_image["image"])

                image_paths.append(img_path)

    return " ".join(full_text), image_paths


def clean_text(text):
    """
    Normalise whitespace ahead of summarization.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading/trailing whitespace is removed.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()


def create_prompt_for_summarization(text, style="concise"):
    """
    Wrap `text` in an instruction that steers the summary style.

    'concise' and 'detailed' have dedicated wordings; any other value of
    `style` yields a neutral "Summarize the following text" prompt.
    """
    if style == "concise":
        return (
            "Summarize the following text in a very concise manner:\n\n"
            + f"{text}\n\nSummary:\n"
        )
    if style == "detailed":
        return (
            "Provide a detailed summary of the following text:\n\n"
            + f"{text}\n\nDetailed Summary:\n"
        )
    # Unknown styles fall back to the generic instruction.
    return f"Summarize the following text:\n\n{text}\n\nSummary:\n"


def get_image_embedding(image_paths):
    """
    Embed each image file with CLIP.

    Returns a list with one L2-normalised embedding (NumPy array) per entry
    of `image_paths`, in the same order.
    """
    def embed_one(path):
        # Force RGB before handing the image to the CLIP processor.
        rgb_image = Image.open(path).convert("RGB")
        model_inputs = clip_processor(images=rgb_image, return_tensors="pt").to(device)
        with torch.no_grad():
            features = clip_model.get_image_features(**model_inputs)
        unit_features = features / features.norm(p=2, dim=-1, keepdim=True)
        return unit_features.squeeze().cpu().numpy()

    return [embed_one(path) for path in image_paths]


def get_text_embedding(text):
    """
    Embed `text` with the Sentence Transformers model.

    The whole text is encoded in one call, so very large inputs may need
    to be split or chunked by the caller. Returns a NumPy array.
    """
    encode_options = {"convert_to_tensor": True, "device": device}
    with torch.no_grad():
        encoded = text_model.encode(text, **encode_options)
    return encoded.cpu().numpy()


def fuse_embeddings(text_embedding, image_embeddings):
    """
    Average the text embedding with the mean image embedding.

        fused = (text_embedding + mean_of_image_embeddings) / 2

    With no image embeddings the text embedding is returned unchanged.
    When the two vectors differ in shape, both are cut down to the shorter
    length before averaging.
    """
    if len(image_embeddings) == 0:
        return text_embedding

    text_vec = text_embedding
    image_vec = np.mean(image_embeddings, axis=0)

    if text_vec.shape != image_vec.shape:
        # Keep only the leading components both vectors have in common.
        common = min(text_vec.shape[0], image_vec.shape[0])
        text_vec, image_vec = text_vec[:common], image_vec[:common]

    return (text_vec + image_vec) / 2.0


def summarize_text_with_t5(text, max_length=100, style_prompt=None):
    """
    Summarize `text` with the T5 model.

    When `style_prompt` is given, the text is first wrapped by
    create_prompt_for_summarization(). The input is then prefixed with
    "summarize: " and truncated to 512 tokens before beam-search
    generation; `max_length` caps the generated summary length.
    """
    source = text
    if style_prompt is not None:
        source = create_prompt_for_summarization(text, style=style_prompt)

    encoded_input = t5_tokenizer.encode(
        "summarize: " + source,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    generation_options = {
        "num_beams": 4,
        "no_repeat_ngram_size": 2,
        "min_length": 30,
        "max_length": max_length,
        "early_stopping": True,
    }
    generated_ids = t5_model.generate(encoded_input, **generation_options)

    # Only the first (best) sequence is decoded.
    return t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True)


def multi_modal_summarization_pipeline(pdf_path, style_prompt="concise"):
    """
    Run the whole pipeline on one PDF.

    Steps: extract text and images, clean the text, embed the text, embed
    the images, fuse both embeddings, then summarize the cleaned text with
    T5 (summary capped at 150 tokens).

    Returns: (generated_summary, fused_embedding)
    """
    extracted_text, extracted_images = extract_text_and_images_from_pdf(pdf_path)
    document_text = clean_text(extracted_text)

    text_vector = get_text_embedding(document_text)
    image_vectors = get_image_embedding(extracted_images)
    fused_vector = fuse_embeddings(text_vector, image_vectors)

    # Raise max_length for a longer summary.
    generated_summary = summarize_text_with_t5(
        document_text, max_length=150, style_prompt=style_prompt
    )
    return generated_summary, fused_vector


def evaluate_summary(reference, generated):
    """
    Score a generated summary against one reference summary.

    Returns a dict with the ROUGE result under "rouge" and the BLEU result
    under "bleu".
    """
    # ROUGE takes one reference per prediction; BLEU takes a list of
    # references per prediction, hence the extra nesting.
    return {
        "rouge": rouge_metric.compute(references=[reference], predictions=[generated]),
        "bleu": bleu_metric.compute(references=[[reference]], predictions=[generated]),
    }


# =====================================
# 5) OPTIONAL: CREATE A SAMPLE PDF
# =====================================
def create_sample_pdf(pdf_output_path="sample_document.pdf", image_dir="generated_images"):
    """
    Create a two-page sample PDF with text and images for demos and tests.

    Args:
        pdf_output_path: Where the PDF is written.
        image_dir: Directory for the two generated PNG images; created when
            it does not exist yet.

    Each page gets a bold heading, a paragraph of body text and one image.
    A confirmation line is printed once the file has been written.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(image_dir, exist_ok=True)

    def create_sample_image(text, filename):
        """Draw `text` centred on a 200x100 solid-colour image and save it."""
        img = Image.new('RGB', (200, 100), color=(73, 109, 137))
        d = ImageDraw.Draw(img)
        try:
            font = ImageFont.truetype("arial.ttf", 15)
        except OSError:
            # arial.ttf is not installed everywhere; use Pillow's built-in font.
            font = ImageFont.load_default()
        text_bbox = d.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        position = ((200 - text_width) / 2, (100 - text_height) / 2)
        d.text(position, text, fill=(255, 255, 0), font=font)
        img.save(filename)

    def add_content_page(pdf, heading, heading_size, heading_align, body, image_path):
        """Append one page holding a heading, a body paragraph and an image."""
        pdf.add_page()
        # "Helvetica" is the fpdf2 core font; "Arial" only works through a
        # deprecated substitution to that same font.
        pdf.set_font("Helvetica", 'B', heading_size)
        # new_x/new_y replace the deprecated ln=True (move to the start of
        # the next line after the cell).
        pdf.cell(0, 10, heading, new_x="LMARGIN", new_y="NEXT", align=heading_align)
        pdf.ln(10)
        pdf.set_font("Helvetica", size=12)
        pdf.multi_cell(0, 10, body)
        pdf.ln(10)
        pdf.image(image_path, x=10, y=None, w=100)

    image1_path = os.path.join(image_dir, "image1.png")
    image2_path = os.path.join(image_dir, "image2.png")

    create_sample_image("Sample Image 1", image1_path)
    create_sample_image("Sample Image 2", image2_path)

    sample_text = (
        "This is a sample PDF document created for testing purposes.\n\n"
        "It contains multiple pages with text and images to demonstrate "
        "the capabilities of the multi-modal summarization pipeline."
    )
    more_text = (
        "This second page includes another image and additional text to enrich the document.\n\n"
        "The purpose is to provide enough content for the summarization pipeline to process."
    )

    pdf = FPDF()
    add_content_page(pdf, "Sample PDF Document", 16, 'C', sample_text, image1_path)
    add_content_page(pdf, "Second Page", 14, 'L', more_text, image2_path)

    pdf.output(pdf_output_path)
    print(f"Sample PDF '{pdf_output_path}' created successfully.")


# =====================================
# 6) DEMO / MAIN EXECUTION
# =====================================
if __name__ == "__main__":
    # If you want to create a sample PDF, uncomment the next line:
    # create_sample_pdf()

    # The reference summary used for evaluation below describes this
    # document only, so evaluation is tied to it.
    sample_pdf_path = "sample_document.pdf"

    # -----------------------------------
    # A) CHOOSE YOUR PDF
    # -----------------------------------
    # 1) In COLAB, let the user upload a PDF
    pdf_file_path = None
    if files is not None:
        print("\nIf you want to upload your own PDF, please choose it now...")
        uploaded_files = files.upload()
        # If a file was uploaded, take the first file name
        if uploaded_files:
            pdf_file_path = next(iter(uploaded_files))
            print("Uploaded PDF:", pdf_file_path)

    # 2) If no file was uploaded or not in Colab, just use the sample PDF
    if not pdf_file_path:
        pdf_file_path = sample_pdf_path

    # Make sure the file actually exists
    if not os.path.isfile(pdf_file_path):
        print(f"\nError: '{pdf_file_path}' does not exist. Please check the file path.")
    else:
        print(f"\nUsing PDF file: {pdf_file_path}")

        # -----------------------------------
        # B) RUN THE MULTI-MODAL PIPELINE
        # -----------------------------------
        generated_summary, fused_embedding = multi_modal_summarization_pipeline(
            pdf_file_path,
            style_prompt="concise"  # can be "detailed", etc.
        )

        # Print the summary
        print("\n===== GENERATED SUMMARY =====\n", generated_summary)

        # Print fused embedding info
        print("\n===== FUSED EMBEDDING SHAPE =====\n", fused_embedding.shape)

        # -----------------------------------
        # C) (Optional) EVALUATE SUMMARY
        # -----------------------------------
        # Scoring an arbitrary upload against the sample's reference text
        # produces meaningless ROUGE/BLEU numbers, so only the sample
        # document is evaluated.
        if pdf_file_path == sample_pdf_path:
            reference_summary = (
                "This sample PDF contains multiple pages with text and images "
                "to demonstrate the multi-modal summarization pipeline."
            )
            scores = evaluate_summary(reference_summary, generated_summary)
            print("\n===== EVALUATION SCORES =====\n", scores)
        else:
            print(
                "\n===== EVALUATION SKIPPED =====\n",
                "No reference summary is available for this PDF."
            )

        # At this point, you can repeat the pipeline with a different PDF
        # without changing the code—just upload or specify your new PDF path!


Using device: cpu

If you want to upload your own PDF, please choose it now...


Saving s41598-024-74668-y.pdf to s41598-024-74668-y (1).pdf
Uploaded PDF: s41598-024-74668-y (1).pdf

Using PDF file: s41598-024-74668-y (1).pdf

===== GENERATED SUMMARY =====
 dual syntax aware graph attention networks with prompt for aspect-based sentiment analysis (ABSA) is a challenging task due to the presence of multiple aspect words with different sentiment polarities. Graph neural networks have also been employed to extract syntactic and semantic information from sentence parsing trees, resulting in superior results. however, dependency trees may establish irrelevant dependencies for sentences with irregular syntax and complex structures.

===== FUSED EMBEDDING SHAPE =====
 (384,)

===== EVALUATION SCORES =====
 {'rouge': {'rouge1': 0.12048192771084337, 'rouge2': 0.0, 'rougeL': 0.07228915662650603, 'rougeLsum': 0.07228915662650603}, 'bleu': {'bleu': 0.0, 'precisions': [0.08333333333333333, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 4.235294117647059, 'translation_l

In [None]:
# 1. Install Dependencies
!pip install torch torchvision torchaudio --quiet
!pip install transformers sentencepiece sentence-transformers --quiet
!pip install PyMuPDF --quiet
!pip install opencv-python --quiet
!pip install scikit-learn --quiet
!pip install evaluate --quiet
!pip install rouge_score --quiet
!pip install fpdf2 --quiet
!pip install gradio --quiet

# 2. Import Libraries
import os
import re
import fitz  # PyMuPDF
import numpy as np
import torch
import evaluate
from PIL import Image, ImageDraw, ImageFont
from transformers import CLIPProcessor, CLIPModel, T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from fpdf import FPDF
import gradio as gr
import warnings
warnings.filterwarnings("ignore")

# 3. Load Models
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# The models below are moved to this device, and the helper functions read
# `device` as a global when preparing their inputs.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# CLIP model and processor (image embeddings), loaded from the same
# checkpoint name; only the model is moved to `device`.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sentence Transformers model (text embeddings), used by get_text_embedding().
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# T5 tokenizer and model (summarization), both from the "t5-small" checkpoint.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Evaluation metrics, loaded once and reused by evaluate_summary().
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# 4. Define Utility Functions
def extract_text_and_images_from_pdf(pdf_path, output_image_dir="images_output"):
    """
    Extract the text and the embedded images from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        output_image_dir: Directory the extracted images are written to.
            It is created when it does not exist yet.

    Returns:
        A ``(combined_text, image_paths)`` tuple: the per-page texts joined
        with a single space, and the paths of the image files written.
        Only images whose extracted format is jpg, jpeg or png are saved;
        other formats are skipped.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_image_dir, exist_ok=True)

    page_texts = []
    image_paths = []

    # The context manager closes the document even when a page or an image
    # fails to extract, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):
            page_texts.append(page.get_text())

            # Extract images
            for image_index, img in enumerate(page.get_images(full=True)):
                base_image = doc.extract_image(img[0])  # img[0] is the xref
                ext = base_image["ext"].lower()
                if ext not in ("jpg", "jpeg", "png"):
                    continue

                img_path = os.path.join(
                    output_image_dir,
                    f"page_{page_index}_img_{image_index}.{ext}",
                )
                with open(img_path, "wb") as f:
                    f.write(base_image["image"])

                image_paths.append(img_path)

    return " ".join(page_texts), image_paths

def clean_text(text):
    """Collapse every whitespace run in `text` to one space and trim the ends."""
    single_spaced = re.sub(pattern=r'\s+', repl=' ', string=text)
    return single_spaced.strip()

def create_prompt_for_summarization(text, style="concise"):
    """
    Build the instruction prompt wrapped around `text`.

    `style` selects the wording: "concise", "detailed", or (for any other
    value) a neutral "Summarize the following text" instruction.
    """
    # Start from the generic wording and specialise it per style.
    lead, tail = "Summarize the following text:", "Summary:"
    if style == "concise":
        lead = "Summarize the following text in a very concise manner:"
    elif style == "detailed":
        lead = "Provide a detailed summary of the following text:"
        tail = "Detailed Summary:"
    return f"{lead}\n\n{text}\n\n{tail}\n"

def get_image_embedding(image_paths):
    """
    Return one L2-normalised CLIP embedding (NumPy array) per image path,
    in the order the paths were given.
    """
    vectors = []
    for path in image_paths:
        # Force RGB before handing the image to the CLIP processor.
        picture = Image.open(path).convert("RGB")
        clip_inputs = clip_processor(images=picture, return_tensors="pt").to(device)
        with torch.no_grad():
            raw = clip_model.get_image_features(**clip_inputs)
        length = raw.norm(p=2, dim=-1, keepdim=True)
        vectors.append((raw / length).squeeze().cpu().numpy())
    return vectors

def get_text_embedding(text):
    """Encode `text` with the Sentence Transformers model; returns a NumPy array."""
    with torch.no_grad():
        sentence_tensor = text_model.encode(
            text,
            convert_to_tensor=True,
            device=device,
        )
    on_cpu = sentence_tensor.cpu()
    return on_cpu.numpy()

def fuse_embeddings(text_embedding, image_embeddings):
    """
    Fuse the text embedding with the mean of the image embeddings.

    Returns the element-wise average of the two vectors, or the text
    embedding unchanged when there are no image embeddings. Vectors of
    different shape are both cut to the shorter length first.
    """
    if len(image_embeddings) == 0:
        return text_embedding

    mean_image = np.mean(image_embeddings, axis=0)
    same_shape = text_embedding.shape == mean_image.shape
    if not same_shape:
        # Truncate both sides to the components they have in common.
        keep = min(text_embedding.shape[0], mean_image.shape[0])
        text_embedding = text_embedding[:keep]
        mean_image = mean_image[:keep]

    return (text_embedding + mean_image) / 2.0

def summarize_text_with_t5(text, max_length=150, style_prompt=None):
    """
    Summarize `text` with the T5 model.

    When `style_prompt` is given, the text is first wrapped by
    create_prompt_for_summarization(). The input is then prefixed with
    "summarize: " and truncated to 512 tokens before beam-search
    generation; `max_length` caps the generated summary length.
    """
    prompt_body = (
        text
        if style_prompt is None
        else create_prompt_for_summarization(text, style=style_prompt)
    )

    token_ids = t5_tokenizer.encode(
        "summarize: " + prompt_body,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    token_ids = token_ids.to(device)

    beams = t5_model.generate(
        token_ids,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=30,
        max_length=max_length,
        early_stopping=True,
    )

    # Only the first (best) sequence is decoded.
    best = beams[0]
    return t5_tokenizer.decode(best, skip_special_tokens=True)

def multi_modal_summarization_pipeline(pdf_path, style_prompt="concise"):
    """
    Run extraction, embedding, fusion and summarization on one PDF.

    Returns a ``(summary, fused_embedding)`` tuple; the summary is
    generated from the cleaned text with a 150-token cap.
    """
    pdf_text, pdf_images = extract_text_and_images_from_pdf(pdf_path)
    tidy_text = clean_text(pdf_text)

    text_embedding = get_text_embedding(tidy_text)
    image_embeddings = get_image_embedding(pdf_images)
    fused = fuse_embeddings(text_embedding, image_embeddings)

    summary = summarize_text_with_t5(tidy_text, max_length=150, style_prompt=style_prompt)
    return summary, fused

def evaluate_summary(reference, generated):
    """
    Compute ROUGE and BLEU for `generated` against a single `reference`.

    Returns ``{"rouge": <rouge result>, "bleu": <bleu result>}``.
    """
    results = {}
    results["rouge"] = rouge_metric.compute(
        references=[reference], predictions=[generated]
    )
    # BLEU takes a list of references per prediction, hence the nesting.
    results["bleu"] = bleu_metric.compute(
        references=[[reference]], predictions=[generated]
    )
    return results

def create_sample_pdf(pdf_output_path="sample_document.pdf", image_dir="generated_images"):
    """
    Create a two-page sample PDF with text and images for demos and tests.

    Args:
        pdf_output_path: Where the PDF is written.
        image_dir: Directory for the two generated PNG images; created when
            it does not exist yet.

    Each page gets a bold heading, a paragraph of body text and one image.
    A confirmation line is printed once the file has been written.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(image_dir, exist_ok=True)

    def create_sample_image(text, filename):
        """Draw `text` centred on a 200x100 solid-colour image and save it."""
        img = Image.new('RGB', (200, 100), color=(73, 109, 137))
        d = ImageDraw.Draw(img)
        try:
            font = ImageFont.truetype("arial.ttf", 15)
        except OSError:
            # arial.ttf is not installed everywhere; use Pillow's built-in font.
            font = ImageFont.load_default()
        text_bbox = d.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        position = ((200 - text_width) / 2, (100 - text_height) / 2)
        d.text(position, text, fill=(255, 255, 0), font=font)
        img.save(filename)

    def add_content_page(pdf, heading, heading_size, heading_align, body, image_path):
        """Append one page holding a heading, a body paragraph and an image."""
        pdf.add_page()
        # "Helvetica" is the fpdf2 core font; "Arial" only works through a
        # deprecated substitution to that same font.
        pdf.set_font("Helvetica", 'B', heading_size)
        # new_x/new_y replace the deprecated ln=True (move to the start of
        # the next line after the cell).
        pdf.cell(0, 10, heading, new_x="LMARGIN", new_y="NEXT", align=heading_align)
        pdf.ln(10)
        pdf.set_font("Helvetica", size=12)
        pdf.multi_cell(0, 10, body)
        pdf.ln(10)
        pdf.image(image_path, x=10, y=None, w=100)

    image1_path = os.path.join(image_dir, "image1.png")
    image2_path = os.path.join(image_dir, "image2.png")

    create_sample_image("Sample Image 1", image1_path)
    create_sample_image("Sample Image 2", image2_path)

    sample_text = (
        "This is a sample PDF document created for testing purposes.\n\n"
        "It contains multiple pages with text and images to demonstrate "
        "the capabilities of the multi-modal summarization pipeline."
    )
    more_text = (
        "This second page includes another image and additional text to enrich the document.\n\n"
        "The purpose is to provide enough content for the summarization pipeline to process."
    )

    pdf = FPDF()
    add_content_page(pdf, "Sample PDF Document", 16, 'C', sample_text, image1_path)
    add_content_page(pdf, "Second Page", 14, 'L', more_text, image2_path)

    pdf.output(pdf_output_path)
    print(f"Sample PDF '{pdf_output_path}' created successfully.")

# 5. Create Gradio Interface Function with Enhanced Error Handling
def pdf_summarizer(file, style="concise"):
    """
    Gradio interface function.

    Args:
        file: Raw bytes of the uploaded PDF, or None when nothing was
            uploaded.
        style: Summary style forwarded to the pipeline as `style_prompt`.

    Returns:
        A ``(summary_or_status, error_message)`` tuple of strings. On
        success the second item is "No error."; on failure the first item
        is a short status and the second carries the exception text.
    """
    import tempfile  # local import: only this function needs it

    if file is None:
        return "Please upload a PDF file.", "No error."

    temp_pdf_path = None
    try:
        # A unique temporary file per call: a fixed name in the working
        # directory would be overwritten by concurrent requests and would
        # clobber any existing file of that name.
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                temp_pdf_path = tmp.name
                tmp.write(file)
        except Exception as e:  # UI boundary: report instead of crashing
            return "Failed to save the uploaded PDF.", f"Error: {e}"

        try:
            # Run the summarization pipeline
            summary, _ = multi_modal_summarization_pipeline(temp_pdf_path, style_prompt=style)
        except Exception as e:  # UI boundary: report instead of crashing
            return "Failed to generate summary.", f"Error: {e}"

        return summary, "No error."
    finally:
        # Clean up the temporary copy on every path, including failed saves.
        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

# 6. Optional: Create a Sample PDF
# Generate sample_document.pdf up front: the `examples` list of the
# interface below points at that file.
create_sample_pdf()

# 7. Define Gradio Interface
# The two inputs map positionally onto pdf_summarizer(file, style); the two
# outputs receive the (summary, error message) tuple it returns.
iface = gr.Interface(
    fn=pdf_summarizer,
    inputs=[
        gr.File(label="Upload PDF", type="binary", file_types=[".pdf"]),  # pdf_summarizer writes the received value to a file opened in "wb" mode
        gr.Dropdown(choices=["concise", "detailed"], value="concise", label="Summary Style")
    ],
    outputs=[
        gr.Textbox(label="Summary", lines=10),
        gr.Textbox(label="Error Message", lines=2, interactive=False)  # shows "No error." or the failure text returned by pdf_summarizer
    ],
    title="📄 PDF Summarizer",
    description="Upload a PDF file and receive a summary instantly!",
    # Each example row is [file, style], matching the order of `inputs`.
    examples=[
        ["sample_document.pdf", "concise"],
        ["sample_document.pdf", "detailed"]
    ],
    allow_flagging="never",

)

# 8. Launch the Interface
# share=True requests a public link (the cell output shows a gradio.live URL).
iface.launch(share=True)


Using device: cpu
Sample PDF 'sample_document.pdf' created successfully.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a70cd2f5f1b2bf6152.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


