<a href="https://colab.research.google.com/github/alisonnnnn88/programming_language/blob/main/HW6_AI_%E5%AD%B8%E7%BF%92%E7%AD%86%E8%A8%98%E7%B3%BB%E7%B5%B1(Gradio%E7%89%88%E6%9C%AC).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -------------------------------
# 安裝必要套件 (Colab 執行一次)
# -------------------------------
!apt install tesseract-ocr -y
!pip install pdfplumber pillow google-auth gspread google-auth-oauthlib google-auth-httplib2 google-generativeai gradio -q

# -------------------------------
# 匯入模組
# -------------------------------
import pdfplumber
import google.generativeai as genai
import gspread
from google.colab import auth
from google.auth import default
from datetime import datetime
import os
import gradio as gr

# -------------------------------
# Google Sheets authentication
# -------------------------------
# Authenticate the Colab user, then hand the resulting default credentials
# to gspread so the notebook can read/write the spreadsheet below.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Spreadsheet that process_pdf_fast appends its log rows to.
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1h75idXVO6GMosL5bPUF-G76MqxOziHu1tJieRtpdkP4/edit?usp=sharing"
sh = gc.open_by_url(SPREADSHEET_URL)
worksheet_name = "學習筆記"
# Reuse the worksheet when it already exists; otherwise create it
# (100 rows x 20 columns).
try:
    ws = sh.worksheet(worksheet_name)
except gspread.exceptions.WorksheetNotFound:
    ws = sh.add_worksheet(title=worksheet_name, rows=100, cols=20)

# -------------------------------
# PDF 文字抓取 (單頁純文字)
# -------------------------------
def extract_text_single_page(pdf_path, page_num=1):
    """Extract the plain text of a single page of a PDF.

    Args:
        pdf_path: Path of the PDF file, opened with ``pdfplumber.open``.
        page_num: 1-based number of the page to read.

    Returns:
        The page text with surrounding whitespace removed. When no usable
        text can be produced, a user-facing message is returned instead:
        it starts with "⚠️" if the page number is out of range or the page
        holds no extractable text, and with "❌" if opening or reading the
        PDF raised an exception.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if not 1 <= page_num <= len(pages):
                return f"⚠️ PDF 總共有 {len(pages)} 頁，輸入頁數超出範圍"
            # Strip BEFORE the emptiness check: a page whose text is only
            # whitespace must be reported as empty, not returned as "".
            text = (pages[page_num - 1].extract_text() or "").strip()
    except Exception as e:
        # Boundary handler: callers expect a message string, never a raise.
        return f"❌ PDF 擷取失敗：{e}"

    if not text:
        return "⚠️ 該頁無可抓取文字 (可能是掃描 PDF)"
    return text

# -------------------------------
# AI 分段生成摘要 + 題目 (極速版，帶進度)
# -------------------------------
import time
def generate_summary_and_quiz_fast(text, model_name="gemini-2.0-flash-exp", api_key=None):
    """Stream a chunked summary and a short quiz for ``text`` via Gemini.

    This is a generator intended for a UI with three outputs. While working
    it yields ``(progress_message, "", "")`` before each summary request;
    the last item yielded is either
    ``(text[:500], summary, quiz)`` on success or
    ``(text[:500], "❌ ...", "❌ ...")`` if anything raised.

    Args:
        text: Source text to summarise.
        model_name: Name passed to ``genai.GenerativeModel``.
        api_key: Gemini API key. When omitted, the ``GEMINI_API_KEY`` and
            then ``GOOGLE_API_KEY`` environment variables are consulted.

    Yields:
        3-tuples of strings as described above.
    """
    try:
        # Never embed a key in source: it leaks the secret and would
        # override whatever key the caller already supplied.
        key = (
            api_key
            or os.environ.get("GEMINI_API_KEY")
            or os.environ.get("GOOGLE_API_KEY")
        )
        if not key:
            raise ValueError("未提供 Gemini API Key")
        genai.configure(api_key=key)
        model = genai.GenerativeModel(model_name)

        # Summarise in slices of 300 characters.
        chunk_size = 300
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summary_parts = []

        for idx, chunk in enumerate(chunks, 1):
            # Progress update for the UI before each (slow) request.
            yield f"正在生成第 {idx}/{len(chunks)} 段摘要…", "", ""
            summary_prompt = f"請整理以下內容的重點摘要（條列3-5點）:\n\n{chunk}"
            resp = model.generate_content(summary_prompt)
            part_summary = resp.candidates[0].content.parts[0].text
            summary_parts.append(part_summary.strip())

            # Pause between requests — presumably to stay under the API
            # rate limit; confirm before shortening.
            time.sleep(2)

        # The quiz is generated from the first 200 characters only.
        quiz_prompt = f"根據以下內容出3題簡答題（附答案）:\n\n{text[:200]}"
        quiz_resp = model.generate_content(quiz_prompt)
        final_quiz = quiz_resp.candidates[0].content.parts[0].text.strip()

        # Final result.
        yield text[:500], "\n".join(summary_parts).strip(), final_quiz

    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of raising.
        yield text[:500], f"❌ 生成摘要失敗：{e}", f"❌ 生成題目失敗：{e}"

# -------------------------------
# Gradio 主程式
# -------------------------------
def process_pdf_fast(pdf_file, page_num_str, api_key):
    """Gradio callback: extract one PDF page, summarise it and log the run.

    This is a generator; every yielded item is a 3-tuple of strings for the
    (extracted text, summary, quiz) outputs. Validation problems are
    reported as a single ``(message, "", "")`` item and end the run.

    Args:
        pdf_file: Uploaded file from the ``gr.File`` input. Either an object
            exposing the path as ``.name`` or the path string itself.
        page_num_str: 1-based page number as typed by the user.
        api_key: Gemini API key as typed by the user.

    Side effects:
        Sets ``GEMINI_API_KEY`` in ``os.environ``, configures ``genai`` and,
        once generation has finished, appends one row to the ``ws``
        worksheet: timestamp, PDF path, first 500 characters, status.
    """
    if not pdf_file:
        yield "⚠️ 請上傳 PDF", "", ""
        return

    # A key made only of whitespace is as useless as an empty one.
    api_key = (api_key or "").strip()
    if not api_key:
        yield "⚠️ 請輸入 API Key", "", ""
        return

    try:
        page_num = int(page_num_str)
    except (TypeError, ValueError):
        yield "⚠️ 頁數輸入錯誤", "", ""
        return

    # Make the user's key available to the generation step.
    os.environ["GEMINI_API_KEY"] = api_key
    genai.configure(api_key=api_key)

    # Accept both a file-like object with .name and a plain path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_single_page(pdf_path, page_num)
    if text.startswith(("⚠️", "❌")):
        yield text, "", ""
        return

    # Forward every progress/result item while remembering the last one so
    # the logged status reflects what actually happened.
    last = None
    for last in generate_summary_and_quiz_fast(text):
        yield last

    failed = last is None or str(last[1]).startswith("❌")
    status = "生成失敗" if failed else "生成完成"

    # Log the run to the Google Sheet.
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    ws.append_row([now_str, pdf_path, text[:500], status])


# -------------------------------
# Gradio interface
# -------------------------------
# Layout: one row of inputs (PDF upload, page number, API key), one row of
# outputs (extracted text, summary, quiz) and a button that runs
# process_pdf_fast.
with gr.Blocks() as demo:
    gr.Markdown("## PDF 單頁極速摘要 & 題目生成器 (帶進度回饋)")

    # Inputs
    with gr.Row():
        pdf_input = gr.File(label="上傳 PDF", file_types=[".pdf"])
        page_input = gr.Textbox(label="頁碼", placeholder="輸入要抓取的頁碼，如 1")
        # type="password" masks the key while it is typed.
        api_input = gr.Textbox(label="Gemini API Key", type="password", placeholder="請輸入你的 API Key")

    # Outputs, in the same order as the tuples yielded by process_pdf_fast.
    with gr.Row():
        text_output = gr.Textbox(label="抓到的文字前500字")
        summary_output = gr.Textbox(label="摘要")
        quiz_output = gr.Textbox(label="題目")

    run_btn = gr.Button("生成摘要與題目")
    # process_pdf_fast is a generator, so each yielded tuple updates the
    # three output boxes in turn.
    run_btn.click(
        process_pdf_fast,
        inputs=[pdf_input, page_input, api_input],
        outputs=[text_output, summary_output, quiz_output]
    )

# Start the app.
demo.launch()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hIt looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False`

