In [6]:
import csv
import os
import re
import requests
from bs4 import BeautifulSoup


def remove_english_charaters(text):
    """
    Strip ASCII letters and common separators from a string.

    Every run of ASCII letters, whitespace, hyphens, colons, periods and
    commas is deleted. All other characters (CJK ideographs, digits,
    other punctuation) are kept unchanged, so a title such as
    "51-60回" becomes "5160回".

    Args:
    text(str): Input text to clean.

    Returns:
    str: The text with the characters listed above removed.
    """
    unwanted = re.compile(r'[a-zA-Z\s\-:.,]+')
    remainder = unwanted.sub('', text)
    return remainder.strip()


def check_chinese_characters(text):
    """
    Report whether a string holds at least one Chinese character.

    Only code points in the range U+4E00 through U+9FFF are tested.

    Args:
    text(str): Input text to check.

    Returns:
    bool: True when a character in that range is present, else False.
    """
    first_match = re.search(r'[\u4e00-\u9fff]', text)
    return first_match is not None


def scrape_gutenberg_content(id_number, max_paragraphs=99):
    """
    Scrape a Chinese-language Project Gutenberg book and save it as .txt.

    The book's HTML page is downloaded, its title is read from
    ``span#pg-title-no-subtitle`` and the body is collected from the
    consecutively numbered ``p#id00001``, ``p#id00002``, ... elements.
    ASCII letters are removed from every paragraph and only paragraphs
    that still contain Chinese characters are kept. The result is written
    to ``./project_gutenberg/<title>.txt`` (UTF-8), paragraphs separated
    by a blank line.

    Args:
    id_number: Book ID number from Project Gutenberg.
    max_paragraphs(int): Highest paragraph number to look up. Collection
        also stops at the first missing paragraph ID. Defaults to 99.

    Returns:
    bool: True if a file was written. False if the request failed, the
        server answered with an HTTP error, the page has no title, the
        title has no Chinese characters, no Chinese paragraph was found,
        or any other error occurred.
    """
    url = (
        f"https://www.gutenberg.org/cache/epub/{id_number}/"
        f"pg{id_number}-images.html"
    )

    try:
        response = requests.get(url, timeout=10)

        # 404 gets its own message because it just means "no such book".
        if response.status_code == 404:
            print(f"Error 404: Page not found for ID {id_number}")
            return False

        # Any other 4xx/5xx body is an error page, not a book; raising
        # here routes it to the RequestException handler below.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title and clean it
        title_element = soup.select_one('span#pg-title-no-subtitle')
        if not title_element:
            print(f"No title found for ID {id_number}")
            return False

        title = title_element.get_text(strip=True)
        cleaned_title = remove_english_charaters(title)

        # Skip if no Chinese characters in title
        if not check_chinese_characters(cleaned_title):
            print(f"No Chinese characters in title for ID {id_number}")
            return False

        # Extract content paragraphs
        paragraphs = []
        for i in range(1, max_paragraphs + 1):
            paragraph_element = soup.select_one(f'p#id{str(i).zfill(5)}')

            # Paragraph IDs are looked up in sequence; the first gap
            # ends the collection.
            if not paragraph_element:
                break

            # Remove English characters
            text = paragraph_element.get_text(strip=False)
            text_without_english = re.sub(r'[a-zA-Z]', '', text)

            # Keep only paragraphs that contain Chinese characters
            if check_chinese_characters(text_without_english):
                paragraphs.append(text_without_english.strip())

        # Do not report success for a book with nothing to save.
        if not paragraphs:
            print(f"No Chinese paragraphs found for ID {id_number}")
            return False

        # Join paragraphs with double newline
        content = '\n\n'.join(paragraphs)

        # The title comes from a web page: drop path separators, control
        # characters and the characters Windows forbids in file names so
        # the output always lands inside the output directory.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', cleaned_title)

        # Ensure output directory exists
        output_dir = './project_gutenberg'
        os.makedirs(output_dir, exist_ok=True)

        # Save to txt file
        output_path = os.path.join(output_dir, f'{safe_title}.txt')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"文件已成功保存: {output_path}")
        return True

    except requests.RequestException as e:
        print(f"Request error for ID {id_number}: {e}")
        return False
    except Exception as e:
        # Deliberately broad: one bad book (parse or file error) must not
        # abort a batch run; the error is reported and the book skipped.
        print(f"Unexpected error for ID {id_number}: {e}")
        return False


def process_csv(csv_path):
    """
    Read book IDs from a CSV file and scrape each book.

    The first column of every row is taken as the book ID and passed to
    ``scrape_gutenberg_content``. Blank lines and rows whose first column
    is empty are skipped and not counted as attempts. A summary of
    attempts, successes and failures is printed at the end.

    Args:
    csv_path(str): Path to the CSV containing book IDs.
    """
    successful_downloads = 0
    total_attempts = 0

    # 'utf-8-sig' drops a leading BOM (otherwise it would be glued to the
    # first ID); newline='' is what the csv module expects for its input.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for a blank line, so guard row[0].
            id_number = row[0].strip() if row else ''
            if not id_number:
                continue

            total_attempts += 1
            if scrape_gutenberg_content(id_number):
                successful_downloads += 1

    print("\n下載總結:")
    print(f"總嘗試數: {total_attempts}")
    print(f"成功下載: {successful_downloads}")
    print(f"失敗數: {total_attempts - successful_downloads}")


def main():
    """Entry point: scrape every book whose ID is listed in ./book_ids.csv."""
    process_csv('./book_ids.csv')


# Start the scraping run only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

文件已成功保存: ./project_gutenberg\管子.txt
文件已成功保存: ./project_gutenberg\虬髯客傳.txt
文件已成功保存: ./project_gutenberg\俗話傾談.txt
文件已成功保存: ./project_gutenberg\菜根譚前後集.txt
文件已成功保存: ./project_gutenberg\六祖壇經.txt
文件已成功保存: ./project_gutenberg\幽明錄.txt
文件已成功保存: ./project_gutenberg\春柳鶯.txt
文件已成功保存: ./project_gutenberg\東度記.txt
文件已成功保存: ./project_gutenberg\楊家將演義.txt
文件已成功保存: ./project_gutenberg\花月痕.txt
文件已成功保存: ./project_gutenberg\韓詩外傳56.txt
文件已成功保存: ./project_gutenberg\宛如約.txt
文件已成功保存: ./project_gutenberg\中國十大禁書之國色天香.txt
文件已成功保存: ./project_gutenberg\斬鬼傳.txt
文件已成功保存: ./project_gutenberg\比目魚.txt
文件已成功保存: ./project_gutenberg\粉妝樓5160回.txt
文件已成功保存: ./project_gutenberg\司馬法.txt
文件已成功保存: ./project_gutenberg\白兔記.txt
文件已成功保存: ./project_gutenberg\墨子.txt
文件已成功保存: ./project_gutenberg\夢溪筆談1116.txt
文件已成功保存: ./project_gutenberg\狂人日記.txt
文件已成功保存: ./project_gutenberg\李太白集.txt
文件已成功保存: ./project_gutenberg\隋唐演義.txt
文件已成功保存: ./project_gutenberg\春秋繁露.txt
文件已成功保存: ./project_gutenberg\二十年目睹之怪現狀.txt
文件已成功保存: ./project_gutenberg\警世通言.txt
