In [None]:
import asyncio
from pathlib import Path

import httpx
from lxml import etree


async def get_book_articles(book_id: str):
    """
    Fetch the IDs of all articles belonging to a Shanbay Reading book.

    Issues a GET request to the admin ``books/{book_id}/articles`` endpoint
    with ``list_all=true``. The ``Cookie`` header value is read from
    ``cookies.txt`` (relative to the current working directory) on each call.

    Args:
        book_id: Book identifier interpolated into the request URL.

    Returns:
        list: The ``id`` value of every entry in the response's ``objects`` array.

    Raises:
        httpx.HTTPStatusError: Propagated from ``response.raise_for_status()``.
    """
    cookie = Path("cookies.txt").read_text().strip()
    endpoint = f"https://apiv3.shanbay.com/reading/admin/books/{book_id}/articles"

    async with httpx.AsyncClient() as client:
        response = await client.get(
            url=endpoint,
            params={"list_all": "true"},
            headers={"accept": "application/json", "Cookie": cookie},
            timeout=10.0,  # give up after 10 seconds
        )
        # Fail loudly on an error status instead of parsing an error payload.
        response.raise_for_status()
        payload = response.json()
        return [article["id"] for article in payload["objects"]]


async def get_article_paragraphs(article_id: str):
    """
    Fetch the raw paragraph contents of one Shanbay Reading article.

    Equivalent to a GET on
    ``https://apiv3.shanbay.com/reading/admin/articles/{article_id}/paragraphs``
    with ``accept: application/json``, plus ``list_all=true`` and the
    ``Cookie`` header read from ``cookies.txt`` on each call.

    Args:
        article_id: Article identifier interpolated into the request URL.

    Returns:
        list: The ``content`` value of every entry in the response's
        ``objects`` array.

    Raises:
        httpx.HTTPStatusError: Propagated from ``response.raise_for_status()``.
    """
    cookie = Path("cookies.txt").read_text().strip()
    endpoint = f"https://apiv3.shanbay.com/reading/admin/articles/{article_id}/paragraphs"

    async with httpx.AsyncClient() as client:
        response = await client.get(
            url=endpoint,
            params={"list_all": "true"},
            headers={"accept": "application/json", "Cookie": cookie},
            timeout=10.0,  # give up after 10 seconds
        )
        # Fail loudly on an error status instead of parsing an error payload.
        response.raise_for_status()
        payload = response.json()
        return [paragraph["content"] for paragraph in payload["objects"]]


def parse_paragraphs_to_sentences(paragraphs_raw: str):
    """
    Extract sentence texts from one paragraph's raw XML string.

    Args:
        paragraphs_raw: XML markup to parse; every ``sent`` element matched
            by the XPath ``//sent`` contributes one entry.

    Returns:
        list: The ``.text`` of each matched ``sent`` element, in the order
        the XPath query returns them. An entry is ``None`` when lxml reports
        no leading text for that element.
    """
    root = etree.fromstring(paragraphs_raw)
    texts = []
    for sent_node in root.xpath("//sent"):
        texts.append(sent_node.text)
    return texts


async def get_all_sentences_from_book(book_id: str):
    """
    Collect every sentence of every article in a book.

    Looks up the book's article IDs, fetches each article's paragraphs
    concurrently, and parses each paragraph into sentences.

    Args:
        book_id: Book identifier passed to ``get_book_articles``.

    Returns:
        list: Sentences in article order, then paragraph order, then the
        order ``parse_paragraphs_to_sentences`` yields them.
    """
    article_ids = await get_book_articles(book_id=book_id)

    # Start one fetch task per article, pausing 0.3 s between launches so the
    # requests are staggered rather than fired all at once
    # (presumably to avoid hammering the API; confirm).
    pending = []
    for article_id in article_ids:
        fetch = get_article_paragraphs(article_id=article_id)
        pending.append(asyncio.create_task(fetch))
        await asyncio.sleep(0.3)

    # gather() returns results in task order: one list of paragraphs per article.
    per_article = await asyncio.gather(*pending)

    return [
        sentence
        for article_paragraphs in per_article
        for paragraph_xml in article_paragraphs
        for sentence in parse_paragraphs_to_sentences(paragraphs_raw=paragraph_xml)
    ]


# Demo run: fetch all sentences of the book with ID "gjozx" and print one per line.
# Top-level ``await`` requires an environment with a running event loop
# (such as a notebook cell); it is a syntax error in a plain module.
sentences = await get_all_sentences_from_book("gjozx")
for i in sentences:
    print(i)


In [None]:
import time

import pandas as pd

books = pd.read_csv("books.csv")
for book in books.itertuples():
    book_path = Path(f"book_sentence/{book.ID}-{book.中文}.txt")
    if book_path.exists():
        continue
    print(book.ID, book.中文)
    sentences = await get_all_sentences_from_book(book.ID)
    sentences = "\n".join(sentences)
    book_path.write_text(data=sentences)
    time.sleep(0.5)