In [12]:
%pip install PyPDF2 summa googletrans==4.0.0-rc1 pytesseract fpdf2 langdetect pillow

import re
import os
import PyPDF2
import summa
import pytesseract
from PIL import Image
import io
from fpdf import FPDF
from googletrans import Translator
from langdetect import detect
import asyncio


class MultilingualPDFProcessor:
    def __init__(self, font_path="/content/DejaVuSans.ttf"):
        self.translator = Translator()
        self.num_pages = 0
        self.pdf_text = ""
        self.words = set()
        self.summary = ""
        self.font_path = font_path  # ✅ use your local font path directly

    async def process_pdf(self, file_path, output_path="translated_output.pdf"):
        """Processes a PDF: OCR, summarize, translate & generate translated PDF."""
        print("📂 Extracting text from PDF...")
        self.pdf_text = self.extract_text(file_path)

        print("🧹 Cleaning & preparing text...")
        self.clean_and_prepare_text()

        print("📝 Summarizing text...")
        self.summarize_text()

        print("🌍 Translating summary to English...")
        translated_summary = await self.translate_to_english(self.summary)

        print("📑 Generating translated PDF...")
        self.generate_translated_pdf(translated_summary, output_path)

        print(f"\n✅ Processing Complete. Translated PDF saved at: {output_path}")
        print("\n📄 Summary:\n", self.summary)
        print("\n🔑 Keywords:\n", self.words)
        print("\n🌍 English Summary:\n", translated_summary)

    def extract_text(self, file_path):
        """Extracts text from a PDF, uses OCR for scanned pages if needed."""
        pdf_reader = PyPDF2.PdfReader(file_path)
        self.num_pages = len(pdf_reader.pages)

        extracted_text = ""
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            text = page.extract_text()
            if text:
                extracted_text += text
            else:
                try:
                    xobject = page.get("/Resources", {}).get("/XObject", {})
                    for obj in xobject:
                        img_data = xobject[obj].get_data()
                        image = Image.open(io.BytesIO(img_data))
                        extracted_text += pytesseract.image_to_string(image)
                except Exception as e:
                    print(f"⚠️ OCR skipped for page {page_num}: {e}")
        return extracted_text

    def clean_and_prepare_text(self):
        """Cleans text and extracts keywords."""
        text = self.pdf_text.lower()
        text = re.sub(r"[^\w\s]", "", text)
        self.words = set(text.split())

    def summarize_text(self):
        """Generates summary using TextRank with fallback."""
        self.summary = summa.summarizer.summarize(self.pdf_text, ratio=0.2)
        if not self.summary:
            print("⚠️ No summary generated, using first 1000 characters as fallback.")
            self.summary = self.pdf_text[:1000]

    async def translate_to_english(self, text):
        """Detects language and translates text into English in chunks."""
        try:
            detected_lang = detect(text)
            print(f"🌐 Detected Language: {detected_lang}")
        except Exception:
            detected_lang = "unknown"
            print("⚠️ Could not detect language, defaulting to English translation.")

        if detected_lang == "en":
            return text

        chunks = [text[i:i+500] for i in range(0, len(text), 500)]
        translated_chunks = []
        for chunk in chunks:
            translation = await asyncio.to_thread(self.translator.translate, chunk, dest="en")
            translated_chunks.append(translation.text)

        return "".join(translated_chunks)

    def break_long_strings(self, text, max_length=50):
        """Inserts spaces into very long strings to aid word wrapping."""
        words = text.split()
        broken_words = []
        for word in words:
            if len(word) > max_length:
                broken_word = ' '.join(word[i:i+max_length] for i in range(0, len(word), max_length))
                broken_words.append(broken_word)
            else:
                broken_words.append(word)
        return ' '.join(broken_words)

    def generate_translated_pdf(self, translated_text, output_path):
        """Creates a PDF with full Unicode support using fpdf2."""
        pdf = FPDF()
        pdf.add_page()

        if os.path.exists(self.font_path):
            try:
                pdf.add_font("DejaVu", style="", fname=self.font_path, uni=True)
                pdf.set_font("DejaVu", size=12)
                print(f"✅ Using Unicode font: {self.font_path}")
            except Exception as e:
                print(f"⚠️ Could not load DejaVu font: {e}, falling back to Helvetica.")
                pdf.set_font("Helvetica", size=12)
        else:
            print("⚠️ Font file not found, using default Helvetica (may break Unicode).")
            pdf.set_font("Helvetica", size=12)

        processed_text = self.break_long_strings(translated_text)

        for line in processed_text.split("\n"):
            pdf.multi_cell(0, 10, line)

        pdf.output(output_path)


async def main():
    processor = MultilingualPDFProcessor(font_path="/content/DejaVuSans.ttf")  # ✅ update path if needed
    await processor.process_pdf('/content/uploaded_pdf.pdf')


if __name__ == "__main__":
    await main()


📂 Extracting text from PDF...
🧹 Cleaning & preparing text...
📝 Summarizing text...
🌍 Translating summary to English...
🌐 Detected Language: mr
📑 Generating translated PDF...
✅ Using Unicode font: /content/DejaVuSans.ttf

✅ Processing Complete. Translated PDF saved at: translated_output.pdf

📄 Summary:
 "एक फाऱ झाडारा रटकत होते , आता फोरा.” 
“आं आं!” 
टांगरे ,” आजीने वलचायरे.
इंडडमन रोकांच्मा हाती रागरा नाही .” 
जाईन.” आजी ऩुढे गेरी भग ऩयत 
मेतोम ,” आजी म्हणारी.
"ठीक आहे," आजी म्हणारी .
एलढं भोठं कुत्रं,” आजी म्हणारी.
आता मा फाऱारा ऩण भदत कय,” आजी म्हणारी.
फंद कय,” आजी म्हणारी.
“चर, एक चांगरा कुत्रा हो,” आजी म्हणारी.
आहे,” आजी म्हणारी.
दठकाणालय मेळीर ,” भग आजीने त्मा 
ठीक आहे ,” आजी म्हणारी.
कळी घेलून जाऊ?” आजीने वलचाय केरा आणण 
करू ळकते .” याखाडी यंगाच्मा प्राण्मासाठी ततने थोडं 
फाऱासाठी ततने दूध आणरं “हे फघा,” 
”फघ रहान फाऱ झोऩरं सुद्धा,” आजी म्हणारी.
“फास!” ती 
“गुयर,” ऐडा म्हणारा.
“ळांत फस!” आजी म्हणारी .
“आं!” फाऱ म्हणारे.
“हे फघ तू काम केरं ते,” आजी म्हणारी.
आहे, जे केरं ते केरं,”

  pdf.add_font("DejaVu", style="", fname=self.font_path, uni=True)
