In [None]:

!pip install pytesseract pymupdf pdf2image gTTS gradio num2fawords
!apt install tesseract-ocr -y
!apt install poppler-utils -y
!wget -O /usr/share/tesseract-ocr/4.00/tessdata/fas.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/fas.traineddata


In [None]:

import os
import re
import shutil
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pytesseract
from gtts import gTTS
from num2fawords import words
from pdf2image import convert_from_path

# Use whichever tesseract binary is found on PATH; fall back to the path the notebook
# originally hard-coded when none is found.
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or "/usr/bin/tesseract"


In [None]:

def clean_text(text):
    """Normalise whitespace in extracted text.

    Runs of two or more newlines become a single newline, runs of two or more
    spaces/tabs become a single space, and leading/trailing whitespace is removed.
    """
    # Applied in order: newline runs first, then space/tab runs.
    substitutions = (
        (r'\n{2,}', '\n'),
        (r'[ \t]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def convert_numbers_to_words(text):
    """Replace every run of digits in `text` with `words(int(digits))`.

    `words` is the function imported from `num2fawords`; each digit run is parsed
    with `int()` first, so leading zeros are dropped before conversion.
    """
    def _spell_out(match):
        number = int(match.group())
        return words(number)

    return re.sub(r'\d+', _spell_out, text)

def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, falling back to Persian OCR when no text layer is found.

    Args:
        pdf_file: One of
            * a filesystem path (``str`` / ``os.PathLike``),
            * an object whose ``.name`` is the path of an existing file
              (presumably what some Gradio versions pass to callbacks; verify
              against the installed version),
            * a binary file-like object supporting ``read()`` (and ``seek()`` if
              the first read fails).

    Returns:
        The cleaned text. If the OCR fallback raises, the string
        ``"خطا در OCR: <error>"`` is returned instead of raising, so callers
        always receive a string.
    """
    # Work out whether we can address the PDF by path; otherwise we read bytes.
    pdf_path = None
    pdf_bytes = None
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        candidate = getattr(pdf_file, "name", None)
        if isinstance(candidate, str) and os.path.isfile(candidate):
            pdf_path = candidate

    # Pass 1: read the embedded text layer with PyMuPDF.
    try:
        if pdf_path is not None:
            doc = fitz.open(pdf_path, filetype="pdf")
        else:
            pdf_bytes = pdf_file.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        with doc:
            text = "".join(page.get_text() for page in doc)
        if text.strip():
            return clean_text(text)
    except Exception as e:
        print(f"متن مستقیم استخراج نشد: {e}")

    # Pass 2: rasterise the pages and OCR them. The OCR text is built from scratch
    # so partial output from a failed first pass cannot leak into the result.
    tmp_pdf_path = None
    try:
        if pdf_path is not None:
            ocr_source = pdf_path
        else:
            if pdf_bytes is None:
                # The first read failed before yielding data; try again from the start.
                pdf_file.seek(0)
                pdf_bytes = pdf_file.read()
            # convert_from_path needs a real file, so spill the bytes to a temp PDF.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                tmp_pdf_path = tmp_pdf.name
                tmp_pdf.write(pdf_bytes)
            ocr_source = tmp_pdf_path
        images = convert_from_path(ocr_source)
        ocr_text = "".join(
            pytesseract.image_to_string(img, lang='fas') for img in images
        )
        return clean_text(ocr_text)
    except Exception as e:
        return "خطا در OCR: " + str(e)
    finally:
        # Only remove the temp copy we created, never the caller's own file.
        if tmp_pdf_path is not None:
            try:
                os.unlink(tmp_pdf_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file must not mask the result

def summarize_text(text, max_lines=10):
    """Build a naive "summary" consisting of the first lines of `text`.

    Args:
        text: The text to summarise.
        max_lines: Maximum number of leading lines to keep (expected to be a
            positive integer). Defaults to 10.

    Returns:
        The header ``"🔍 خلاصه:"`` followed by a blank line and at most
        `max_lines` lines of the whitespace-stripped text.
    """
    lines = text.strip().split('\n')
    # Slicing covers both cases: shorter texts are returned whole (stripped),
    # longer ones are truncated to the first `max_lines` lines.
    summary = '\n'.join(lines[:max_lines])
    return f"🔍 خلاصه:\n\n{summary}"

def generate_voice(text):
    """Synthesize Persian speech for `text` with gTTS and return the MP3 path.

    Digits are first replaced via `convert_numbers_to_words`.

    Args:
        text: The text to read aloud.

    Returns:
        Path of a newly created ``.mp3`` temp file, or ``None`` when `text` is
        empty or whitespace-only (gTTS raises on empty input, so there is
        nothing to synthesize). The file is intentionally not deleted here
        because the caller needs the path to serve the audio.
    """
    if not text or not text.strip():
        return None

    spoken_text = convert_numbers_to_words(text)
    tts = gTTS(spoken_text, lang='fa')

    # Reserve a unique path, then close our handle before gTTS opens the same
    # path for writing (keeping two handles open is not portable).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        mp3_path = fp.name
    tts.save(mp3_path)
    return mp3_path


In [None]:

def process_pdf(file):
    """Gradio callback: turn an uploaded PDF into (text, summary, audio path).

    Args:
        file: The value Gradio passes for the file input, or ``None`` when
            nothing was uploaded.

    Returns:
        A 3-tuple ``(text, summary, voice_path)``. When no file was given the
        tuple is ``(<prompt message>, None, None)``. ``voice_path`` is ``None``
        if speech synthesis failed, so the extracted text and summary are
        still shown.
    """
    if file is None:
        return "لطفاً یک فایل PDF وارد کنید", None, None

    text = extract_text_from_pdf(file)
    summary = summarize_text(text)

    # Speech synthesis is the step most likely to fail (gTTS needs network access
    # and rejects empty text); do not let it discard the text already extracted.
    try:
        voice_path = generate_voice(text)
    except Exception as e:
        print(f"تولید صدا ناموفق بود: {e}")
        voice_path = None

    return text, summary, voice_path

# Web UI: one PDF upload in; extracted text, summary and synthesized audio out.
iface = gr.Interface(
    fn=process_pdf,
    # file_types limits the upload picker to PDFs, the only format process_pdf handles.
    inputs=gr.File(label="بارگذاری فایل PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="متن استخراج‌شده", lines=10),
        gr.Textbox(label="خلاصه متن", lines=5),
        gr.Audio(label="خواندن با صدا")
    ],
    title="🤖 ربات معلم PDF - نسخه آنلاین",
    description="فایل PDF خود را بارگذاری کنید تا متن، خلاصه و صدا دریافت کنید (پشتیبانی از فارسی + OCR)",
    theme="default"
)

# Start the Gradio server for this notebook session.
iface.launch()
