In [1]:
!pip install pymupdf python-docx torch pillow transformers ipython 


Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyMuPDFb==1.24.10 (from pymupdf)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pa

In [4]:
import os
import io
import fitz  # PyMuPDF for PDF extraction
import docx  # for DOCX extraction
import tensorflow as tf
import torch
from PIL import Image
from transformers import MBartTokenizer, TFMBartForConditionalGeneration
from transformers import BlipProcessor, BlipForConditionalGeneration

# Check GPU availability for TensorFlow
print("Num GPUs Available for TensorFlow: ", len(tf.config.list_physical_devices('GPU')))

# Check GPU availability for PyTorch
device1 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device2 = torch.device("cuda:1" if torch.cuda.device_count() > 1 else device1)
print(f"Using device for MBart (TensorFlow): {device1}")
print(f"Using device for BLIP (PyTorch): {device2}")

# Initialize MBart model and tokenizer (running on TensorFlow, forcing to GPU 0)
model_name = "facebook/mbart-large-50"
tokenizer = MBartTokenizer.from_pretrained(model_name)

with tf.device('/GPU:0'):
    model = TFMBartForConditionalGeneration.from_pretrained(model_name)

# Initialize BLIP model and processor (running on PyTorch, using GPU 1 if available)
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device2)

# Summarization function (forcing MBart to run on GPU 0)
def summarize_text(text):
    inputs = tokenizer(text, return_tensors="tf", max_length=512, truncation=True)

    # Running summarization on GPU 0
    with tf.device('/GPU:0'):
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=250, 
            min_length=100, 
            length_penalty=2.0, 
            num_beams=4, 
            early_stopping=True
        )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Image description function (using BLIP with PyTorch on GPU 1)
def describe_image(image):
    # Convert the image object to bytes
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format=image.format)
    img_byte_arr = img_byte_arr.getvalue()

    # Pass the bytes to BLIP processor
    image = Image.open(io.BytesIO(img_byte_arr))
    inputs = blip_processor(images=image, return_tensors="pt").to(device2)  # Force to GPU 1
    
    with torch.no_grad():
        out = blip_model.generate(**inputs)
    description = blip_processor.decode(out[0], skip_special_tokens=True)
    return description

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text.strip()
    except Exception as e:
        return f"Error extracting text from PDF: {e}"

# Extract text from DOCX
def extract_text_from_docx(docx_path):
    try:
        doc = docx.Document(docx_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text.strip()
    except Exception as e:
        return f"Error extracting text from DOCX: {e}"

# Extract images from PDF
def extract_images_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        images = []
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image = Image.open(io.BytesIO(image_bytes))
                images.append(image)
        return images
    except Exception as e:
        return f"Error extracting images from PDF: {e}"

# Extract images from DOCX
def extract_images_from_docx(docx_path):
    try:
        doc = docx.Document(docx_path)
        images = []
        for rel in doc.part.rels.values():
            if "image" in rel.target_ref:
                image_bytes = rel.target_part.blob
                image = Image.open(io.BytesIO(image_bytes))
                images.append(image)
        return images
    except Exception as e:
        return f"Error extracting images from DOCX: {e}"

# File path handling in Kaggle
input_dir = "/kaggle/input/french123"
uploaded_files = os.listdir(input_dir)

for filename in uploaded_files:
    file_path = os.path.join(input_dir, filename)
    
    if filename.lower().endswith('.pdf'):
        # Extract text from PDF
        text = extract_text_from_pdf(file_path)
        # Summarize text
        summary = summarize_text(text)
        print(f"Summary for {filename}:\n{summary}\n")
        
        # Extract and describe images from PDF
        images = extract_images_from_pdf(file_path)
        if isinstance(images, list):
            for i, img in enumerate(images):
                try:
                    description = describe_image(img)
                    print(f"Image {i + 1} Description for {filename}: {description}\n")
                except Exception as e:
                    print(f"Error describing image {i + 1} in {filename}: {e}\n")
        else:
            print(images)

    elif filename.lower().endswith('.docx'):
        # Extract text from DOCX
        text = extract_text_from_docx(file_path)
        # Summarize text
        summary = summarize_text(text)
        print(f"Summary for {filename}:\n{summary}\n")
        
        # Extract and describe images from DOCX
        images = extract_images_from_docx(file_path)
        if isinstance(images, list):
            for i, img in enumerate(images):
                try:
                    description = describe_image(img)
                    print(f"Image {i + 1} Description for {filename}: {description}\n")
                except Exception as e:
                    print(f"Error describing image {i + 1} in {filename}: {e}\n")
        else:
            print(images)

    else:
        print(f"Unsupported file format for {filename}")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Num GPUs Available for TensorFlow:  2
Using device for MBart (TensorFlow): cuda:0
Using device for BLIP (PyTorch): cuda:1


All model checkpoint layers were used when initializing TFMBartForConditionalGeneration.

All the layers of TFMBartForConditionalGeneration were initialized from the model checkpoint at facebook/mbart-large-50.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMBartForConditionalGeneration for predictions without further training.


Summary for crime-et-chatiment-fedor-mikhailovitch-distoievski.pdf:
SYNOPSIS DE CRIME ET CHÂTIMENT Fedor M.Dostoïevski InfoLivres.org SYNOPSIS DE CRIME ET CHÂTIMENT Crime et châtiment est un roman psychologique de Fyodor Dostoïevski, le célèbre écrivain russe. Il a été publié en 1866 et est considéré comme un classique de la littérature russe. Tout au long de l'intrigue, nous suivons le parcours de Raskolnikov, un jeune homme de Saint-Pétersbourg, dont la complexité psychologique parvient à saisir le lecteur d'une manière particulière.

Image 1 Description for crime-et-chatiment-fedor-mikhailovitch-distoievski.pdf: a qr code with a black and white image

Image 2 Description for crime-et-chatiment-fedor-mikhailovitch-distoievski.pdf: a qr code with a qr code on it

