In [17]:
import pdfplumber
import json_repair
import pandas as pd
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        A single string made of each page's ``extract_text()`` result joined
        back-to-back (no separator is inserted between pages). Pages whose
        extraction yields ``None`` or an empty string contribute nothing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. scanned pages on some pdfplumber versions); `or ''` keeps the
        # concatenation from raising TypeError in that case.
        return ''.join(page.extract_text() or '' for page in pdf.pages)

In [18]:
from gradio_client import Client

# Shared gradio_client handle for the "Qwen/Qwen2-72B-Instruct" Space; `chatbot`
# sends every request through this object. Constructing it contacts the remote
# service, so this cell needs network access.
client = Client("Qwen/Qwen2-72B-Instruct")

def chatbot(prompt,input_text):
    """Send one stateless request to the remote chat endpoint and return its reply.

    Args:
        prompt: Text passed as the ``system`` argument of the endpoint.
        input_text: Text passed as the ``query`` argument of the endpoint.

    Returns:
        Element ``[1][0][1]`` of the value returned by ``client.predict``.
        Presumably that is the assistant's answer inside the returned chat
        history (second item of its first entry) -- verify against the
        Space's ``/model_chat`` API description.
    """
    request = {
        "query": input_text,
        "history": [],  # always start from an empty conversation
        "system": prompt,
        "api_name": "/model_chat",
    }
    response = client.predict(**request)
    history = response[1]
    first_turn = history[0]
    return first_turn[1]


Loaded as API: https://qwen-qwen2-72b-instruct.hf.space ✔


In [19]:
# System prompt (written in Chinese) that is passed to `chatbot` together with
# the text of a weekly report. It asks the model to summarize the papers the
# author read, states that write_date is the last day of date_range, asks for
# summaries in Chinese with paper titles and journal names kept in English, and
# shows the expected reply layout: date_range, write_date and a `papers` list
# whose entries carry title / journal / content / conclusion / gain.
# NOTE(review): in the template below the date_range and write_date keys are
# unquoted (not strict JSON) and the two fields use different date separators
# ("." vs "-"). The reply is parsed with json_repair later in the notebook --
# confirm that relying on its tolerance is intended.
prompt = '''请帮我从下面这篇周报中总结作者读的论文。其中write_date为date_range的最后一天,除了论文标题和期刊名称用英文外，总结用中文。返回格式：
```json
{
    date_range: "yyyy.mm.dd-yyyy.mm.dd",
    write_date: "yyyy-mm-dd",
    papers: [
        {
            "title": "论文名称",
            "journal": "期刊名称",
            "content": "一句话总结",
            "conclusion": "一句话总结",
            "gain": "一句话总结"
        },
        {
            "title": "论文名称",
            "journal": "期刊名称",
            "content": "一句话总结",
            "conclusion": "一句话总结",
            "gain": "一句话总结"
        }
    ]
}
```
'''

In [28]:
def summarize_weekly(result,image_files):
    """Build the markdown source of one weekly-summary post.

    Args:
        result: Mapping with the keys ``papers`` (records turned into a
            DataFrame), ``date_range`` and ``write_date``.
        image_files: Iterable of image paths; each one becomes an
            ``![image](/<path>)`` line.

    Returns:
        A string made of a ``---``-delimited front-matter header (title, date,
        permalink), the papers rendered as a markdown table without the index
        column, three newlines, and then the image lines joined by newlines.
    """
    papers_table = pd.DataFrame(result['papers']).to_markdown(index=False)
    date_range = result["date_range"]
    write_date = result["write_date"]

    header_lines = [
        "---",
        f"title: '周小结({date_range})'",
        f"date: {write_date}",
        f"permalink: /posts/{write_date}_week/",
        "---",
        f"{papers_table}",
        # Three empty entries reproduce the blank lines that separate the
        # table from the image links.
        "",
        "",
        "",
    ]
    image_lines = [f'![image](/{path})' for path in image_files]
    return "\n".join(header_lines) + "\n".join(image_lines)

In [29]:
from glob import glob
import os
import shutil
import json
from pdf2image import convert_from_path

# Ingest every weekly-report PDF waiting in files/post1/:
#   1. extract its text and ask the model for a structured summary,
#   2. move the PDF to files/post/<write_date>-week.pdf,
#   3. render each page to files/post/<write_date>-week/<page index>.jpg,
#   4. write the markdown post to _posts/<write_date>-week.md.
for pdf in glob("files/post1/*.pdf"):
    text = extract_text_from_pdf(pdf)
    reply = chatbot(prompt,text)
    # json_repair tolerates the not-quite-JSON text the model may send back.
    result = json_repair.loads(reply)

    week_stem = "files/post/" + result["write_date"] + "-week"
    new_name = week_stem + ".pdf"
    # Create the image folder once per PDF (not once per page). Doing it before
    # the rename also guarantees that files/post/ exists as the move target.
    os.makedirs(week_stem,exist_ok=True)
    os.rename(pdf,new_name)

    images = convert_from_path(new_name,dpi=200)
    image_files = []
    for i,image in enumerate(images):
        image_path = week_stem + "/" + str(i) + ".jpg"
        image.save(image_path,'JPEG', quality=95)
        image_files.append(image_path)

    summarize_path = "_posts/" + result["write_date"] + "-week.md"
    summarize = summarize_weekly(result,image_files)
    # The post contains Chinese text; pin UTF-8 so the output does not depend
    # on the platform's default encoding.
    with open(summarize_path,"w",encoding="utf-8") as f:
        f.write(summarize)



# 将PDF转为图片格式

In [13]:

import glob

In [15]:
# Render every PDF in files/post/ to per-page JPEGs, stored in a sibling folder
# named after the PDF: files/post/<pdf name without extension>/<page index>.jpg
for pdf in glob.glob("files/post/*.pdf"):
    images = convert_from_path(pdf, dpi=160)
    # splitext removes only the trailing extension (whatever its letter case),
    # whereas str.replace would also strip ".pdf" occurring inside the name.
    basename = os.path.splitext(os.path.basename(pdf))[0]
    out_dir = f'files/post/{basename}'
    # Create the folder once per PDF; exist_ok avoids the separate
    # exists()-then-mkdir() check.
    os.makedirs(out_dir, exist_ok=True)
    for i, image in enumerate(images):
        image.save(f'{out_dir}/{i}.jpg', 'JPEG', quality=95)