In [239]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import markdownify
from tqdm import tqdm
from opencc import OpenCC
import re
from random import randint
from time import sleep

# Initialise the OpenCC instance used for Simplified → Traditional Chinese conversion.
cc = OpenCC('s2t')  # s2t: Simplified Chinese to Traditional Chinese conversion

In [240]:
# Default output folder; Markdown files are written under DIST_DIR/LANG.
DIST_DIR = 'dist'
LANG = 'zh'
# Downloaded images are stored in an `images` folder under DIST_DIR/LANG.
IMAGES_DIR = os.path.join(DIST_DIR, LANG, 'images')

In [241]:
# Make sure both output folders exist before anything is written to them.
for _out_dir in (DIST_DIR, IMAGES_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [242]:
# Path of the CSV that is later loaded into `df`; the file name carries the LANG suffix.
csv_path = f'rfa_shishi_data_{LANG}.csv'

In [243]:
# Global dictionary mapping each article URL to its Markdown file name.
# It is filled from the CSV in a later cell and read by download_and_convert().
url_to_md = {}

In [244]:
# Path segments ending in one of these extensions are treated as image file names.
_IMAGE_EXT_RE = re.compile(r'\.(?:png|jpe?g|gif|webp|svg|bmp|avif)$', re.IGNORECASE)


def _image_filename(img_url: str) -> str:
    """Derive a local file name for an image URL.

    Normally this is the last path segment. Some image URLs look like
    ``.../<page>.html/<name>.png/@@images/image/social_media``: their last
    segment is the same for every image, so using it would make all of them
    collide on one local file. In that case the nearest earlier segment with
    an image extension is used, prefixed with the stem of the segment before
    it so that e.g. ``2.png`` from two different pages stays distinct.

    Returns an empty string when the URL path has no final segment.
    """
    path = urlparse(img_url).path
    base = os.path.basename(path)
    if not base or _IMAGE_EXT_RE.search(base):
        return base
    segments = [seg for seg in path.split('/') if seg]
    # Walk backwards over the segments that precede the (extension-less) last one.
    for idx in range(len(segments) - 2, -1, -1):
        if _IMAGE_EXT_RE.search(segments[idx]):
            if idx > 0:
                parent_stem = os.path.splitext(segments[idx - 1])[0]
                return f"{parent_stem}_{segments[idx]}"
            return segments[idx]
    return base


def _download_image(img_url: str, local_path: str, timeout) -> bool:
    """Stream ``img_url`` to ``local_path``; log and return False on any failure.

    The data is written to a ``.part`` file first and renamed only after the
    whole body arrived, so an interrupted transfer never leaves a truncated
    file that a later run would mistake for a finished download.
    """
    tmp_path = local_path + '.part'
    try:
        with requests.get(img_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
        os.replace(tmp_path, local_path)
        return True
    except Exception as e:
        # Best effort: a missing image must not abort the article conversion.
        print(f"下載失敗：{img_url} → {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return False


def download_and_convert(url: str, replace=False, timeout=30):
    """Download one article page and save it as a Traditional-Chinese Markdown file.

    The page's images are downloaded into IMAGES_DIR (when not already there)
    and their ``src`` rewritten to the local copy; the HTML is converted to
    Markdown, passed through the OpenCC converter, trimmed and restructured,
    and finally written to ``DIST_DIR/LANG/<mapped file name>``.

    Args:
        url: Article URL; must be a key of the global ``url_to_md`` mapping.
        replace: When False, an already existing output file is kept and the
            page is not fetched again.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        The Markdown file name, or None when ``url`` has no entry in ``url_to_md``.

    Raises:
        requests.RequestException: The article page could not be fetched, or
            the server answered with an HTTP error status.
    """
    # 1. Look up the Markdown file name for this URL.
    if url not in url_to_md:
        print(f"URL 未定義檔名對應：{url}")
        return None
    md_filename = url_to_md[url]
    output_md = os.path.join(DIST_DIR, LANG, md_filename)
    if not replace and os.path.exists(output_md):
        return md_filename

    # 2. Make sure the images folder (and with it the output folder) exists.
    os.makedirs(IMAGES_DIR, exist_ok=True)

    # 3. Download and parse the page. An HTTP error page must not be archived
    #    as if it were the article, so fail loudly on 4xx/5xx.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'html.parser')

    # 4. Drop the page header and every <nav>.
    if soup.header:
        soup.header.decompose()
    for nav in soup.find_all('nav'):
        nav.decompose()

    # 5. Download each <img> (if not present locally) and point it at the local copy.
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        # An empty src would resolve to the page URL itself; data: URIs are inline.
        if not src or src.startswith('data:'):
            continue
        img_url_full = urljoin(url, src)
        filename = _image_filename(img_url_full)
        if not filename:
            continue
        local_path = os.path.join(IMAGES_DIR, filename)
        if not os.path.exists(local_path):
            _download_image(img_url_full, local_path, timeout)
        # Markdown links always use forward slashes, regardless of the OS.
        img['src'] = 'images/' + filename

    # 6. HTML → Markdown
    md = markdownify.markdownify(str(soup), heading_style='ATX')

    # 7. Simplified → Traditional Chinese
    md = cc.convert(md)

    # 8. Drop everything before the first level-1 heading (keep all if there is none).
    lines = md.splitlines()
    start_idx = next((i for i, l in enumerate(lines) if l.startswith('# ')), 0)
    lines = lines[start_idx:]

    # 9. Drop the "## 更多" / "## MORE Asia Fact Check Lab" section and everything after it.
    cleaned = []
    for l in lines:
        if re.match(r'^##\s*更多', l) or re.match(r'^##\s*MORE\s+Asia\sFact\sCheck\sLab', l):
            break
        cleaned.append(l)

    # 10. Rewrite links to other mapped articles so they point at the local Markdown files.
    def link_repl(m):
        text, link = m.group(1), m.group(2)
        if link.startswith('/'):
            full = 'https://www.rfa.org' + link
        else:
            full = link
        if full in url_to_md:
            return f"[{text}]({url_to_md[full]})"
        return m.group(0)

    processed = []
    counter = 1  # running number for bold numbered-list items promoted to headings
    for raw in cleaned:
        line = raw.strip()
        # ### / #### headings (with or without bold) become level-2 headings.
        m_h = re.match(r'^####?\s*(?:\*\*)?(.+?)(?:\*\*)?\s*$', line)
        if m_h:
            title_text = m_h.group(1).strip()
            processed.append(f"## {title_text}")
            continue
        # A fully bold numbered-list item becomes a sequentially numbered level-2 heading.
        m_num = re.match(r'^\d+\.\s*\*\*(.+?)\*\*$', line)
        if m_num:
            text = m_num.group(1).strip()
            processed.append(f"## {counter}. {text}")
            counter += 1
            continue
        # Replace rfa.org links (relative or absolute) anywhere in the line.
        line = re.sub(r'\[([^\]]+)\]\(((?:https?://www\.rfa\.org)?/[^\)]+|https?://www\.rfa\.org[^\)]+)\)', link_repl, line)
        # A line that starts and ends with bold markers is treated as a sub-heading.
        if line.startswith('**') and line.endswith('**'):
            inner = re.sub(r'\*\*', '', line).strip()
            inner = inner.replace(':', '：')
            inner = inner.strip()
            processed.append(f"## {inner}")
            continue
        # Remaining headings lose any emphasis asterisks.
        if line.startswith('#'):
            line = line.replace('*', '')
        processed.append(line)

    # 11. Strip every line, append the source link and reassemble.
    final_lines = [l.strip() for l in processed]
    final_lines.append("")
    final_lines.append(f"[Original Source]({url})")
    final_md = '\n'.join(final_lines)

    # 12. Write the file.
    with open(output_md, 'w', encoding='utf-8') as f:
        f.write(final_md)
    return md_filename

In [245]:
# Load the article list from csv_path; later cells read its `url`, `date` and `title` columns.
df = pd.read_csv(csv_path)

In [246]:
# Stop early when the CSV lacks any of the columns the following cells read.
required = {'url', 'date', 'title'}
missing = required.difference(df.columns)
if missing:
    raise ValueError("CSV 必須包含 `url`, `date`, `title` 欄位，可選 `image_url`")

In [247]:
# 1. Build the URL → Markdown file name mapping.
# Characters that are awkward in file names are swapped for look-alikes.
_FILENAME_CHAR_MAP = str.maketrans({'/': '_', ':': '：', '|': '｜'})
for page_url, date, title in zip(df['url'], df['date'], df['title']):
    # File name pattern: <date>_<title converted to Traditional Chinese>.md
    safe_title = cc.convert(title).translate(_FILENAME_CHAR_MAP)
    url_to_md[page_url] = f"{date}_{safe_title}.md"

In [248]:
# 2. Process every URL, collecting one index entry per article that was saved.
entries = []
progress = tqdm(df.iterrows(), total=len(df), desc='備份進度')
for _, row in progress:
    try:
        md_file = download_and_convert(row['url'], replace=True)
        if not md_file:
            continue
        entries.append(dict(date=row['date'], title=cc.convert(row['title']), md_file=md_file))
    except Exception as err:
        # One failing article is reported and skipped so the rest of the run continues.
        print(f"[錯誤] {row['url']} → {err}")

備份進度:  30%|███████████████▏                                  | 167/551 [18:35<37:45,  5.90s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88682.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88682.png/@@images/image/social_media
下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88683.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88683.png/@@images/image/social_media
下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88684.png/@@images/image/social_media → 503 Server Error: Service Unavailable fo

備份進度:  30%|███████████████▏                                  | 168/551 [18:45<46:04,  7.22s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88685.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-african-nations-lift-china-to-united-nations-fact-check-09202024135757.html/88685.png/@@images/image/social_media


備份進度:  58%|█████████████████████████████▏                    | 321/551 [35:48<30:26,  7.94s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-02232024183403.html/2.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-02232024183403.html/2.png/@@images/image/social_media


備份進度:  64%|███████████████████████████████▉                  | 352/551 [39:10<25:32,  7.70s/it]

[錯誤] https://www.rfa.org/mandarin/shishi-hecha/hc-12302023085958.html → ('Connection broken: IncompleteRead(0 bytes read, 139 more expected)', IncompleteRead(0 bytes read, 139 more expected))


備份進度:  76%|██████████████████████████████████████            | 420/551 [46:56<15:04,  6.91s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-09042023093849.html/65e5672c999676f854036d779bae6025900191ab-524d653f52d95b98559d6838865574066c346b7b4ea1-3.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-09042023093849.html/65e5672c999676f854036d779bae6025900191ab-524d653f52d95b98559d6838865574066c346b7b4ea1-3.png/@@images/image/social_media


備份進度:  93%|██████████████████████████████████████████████▎   | 511/551 [57:38<04:51,  7.30s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-03212023110025.html/622a5716-2023-03-21-14-25-12.png/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-03212023110025.html/622a5716-2023-03-21-14-25-12.png/@@images/image/social_media


備份進度: 100%|████████████████████████████████████████████████| 551/551 [1:02:17<00:00,  6.78s/it]

下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-11152022161312.html/1min-7c21.jpg/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-11152022161312.html/1min-7c21.jpg/@@images/image/social_media





In [271]:
# Re-run the conversion for a single row (handy for inspecting one article).
num = 550
sample_url = df.loc[num, 'url']
print(sample_url)
download_and_convert(sample_url, replace=True)

https://www.rfa.org/mandarin/shishi-hecha/hc-11152022161312.html
下載失敗：https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-11152022161312.html/1min-7c21.jpg/@@images/image/social_media → 503 Server Error: Service Unavailable for url: https://manstaging.rfaweb.org/mandarin/shishi-hecha/hc-11152022161312.html/1min-7c21.jpg/@@images/image/social_media


'2022-11-15_事實查覈  ｜ 北約捐給烏克蘭帶病毒血液？烏衛生部駁斥微博大V謠言.md'

In [250]:
# 3. Generate content.md: one bullet per saved article, linking to its Markdown file.
content_path = os.path.join(DIST_DIR, LANG, 'content.md')
index_lines = []
for entry in entries:
    label = entry['title'].replace(':', '：').replace('|', '｜')
    # The target is wrapped in <...> so file names containing spaces still form a valid link.
    index_lines.append(f"* [{entry['date']}_{label}](<{entry['md_file']}>)\n")
with open(content_path, 'w', encoding='utf-8') as f:
    f.writelines(index_lines)
print(f"已生成 {content_path}")

已生成 dist/zh/content.md


In [251]:
# Display the loaded article table for a quick visual check.
df

Unnamed: 0,date,category,title,author,description,url,image_url
0,2025-05-08,事实查核,《中共外宣在台湾》报告全文电子档发布,亚洲事实查核实验室,AFCL将《中共外宣在台湾》十一篇报道重新编辑成电子书版本，全文共九章，读者可以自由下载。,https://www.rfa.org/mandarin/shishi-hecha/2025...,https://cloudfront-us-east-1.images.arcpublish...
1,2025-05-05,事实查核,事实查核｜网传视频是以色列大火画面？,庄敬,近日在X平台上，有「蓝勾」认证帐号发布一段山林、街区燃烧的短视频，宣称是「熊熊大火席卷以色列...,https://www.rfa.org/mandarin/shishi-hecha/2025...,https://cloudfront-us-east-1.images.arcpublish...
2,2025-05-02,事实查核,事实查核｜照片显示，胡塞武装又击落一架美军MQ9无人机？,董喆,近期，X平台与中国论坛上广传一张照片，称是胡塞武装又击落一架美军MQ-9无人机，但这是201...,https://www.rfa.org/mandarin/shishi-hecha/2025...,https://cloudfront-us-east-1.images.arcpublish...
3,2025-05-01,事实查核,事实查核｜网传视频显示，白宫墙上挂着「2028竞选红帽」销售图？,庄敬,微博、X等平台上流传一则短视频：美国总统特朗普记者展示白宫墙上挂着的「2028竞选红帽」销售...,https://www.rfa.org/mandarin/shishi-hecha/2025...,https://cloudfront-us-east-1.images.arcpublish...
4,2025-04-30,事实查核,事实快查｜关于国民党426游行，两则错假资讯流传中,董喆,2025年台湾民间团体发起「大罢免行动」，对象锁定30多位国民党籍民意代表。国民党也在4月2...,https://www.rfa.org/mandarin/shishi-hecha/2025...,https://cloudfront-us-east-1.images.arcpublish...
...,...,...,...,...,...,...,...
546,2022-12-06,事实查核,"事实查核 | 世卫专家真的说了""躺平者将双手沾满鲜血""吗？",艾玛,"经过各大城市爆发抗议极端防疫政策的""白纸运动""后，中国各地在新冠疫情的防控上开始传出不同程度...",https://www.rfa.org/mandarin/shishi-hecha/hc-1...,https://cloudfront-us-east-1.images.arcpublish...
547,2022-12-01,事实查核,"事实查核 | 新疆乌鲁木齐火灾与""动态清零""政策有关吗？ ——赵立坚说的""造谣抹黑""是真的吗？",艾玛,,https://www.rfa.org/mandarin/shishi-hecha/hc-1...,https://cloudfront-us-east-1.images.arcpublish...
548,2022-11-29,事实查核,为死难同胞讨公道，哥大女生遭不明身份男子暴打,记者：艾玛,"为纪念新疆乌鲁木齐""11.24""大火遇难同胞，声援中国各地爆发的""白纸运动""，海外中国留学生...",https://www.rfa.org/mandarin/shishi-hecha/hc-1...,https://cloudfront-us-east-1.images.arcpublish...
549,2022-11-22,事实查核,"事实查核 | 是谁在""利用人工智能操纵叙事""？ — 斯坦福大学报告揭示的""涉疆谣言""真相",艾玛,近日，央视旗下自媒体品牌“玉渊谭天”制作了一期风靡互联网的节目——《独家揭秘：美国如何利用人...,https://www.rfa.org/mandarin/shishi-hecha/hc-1...,https://cloudfront-us-east-1.images.arcpublish...
