In [87]:
import mechanize
import json
import csv
import urllib.parse
from http.cookiejar import LWPCookieJar
from tqdm import tqdm

In [88]:
br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
cj = LWPCookieJar()
br.set_cookiejar(cj)

In [89]:
start_url = 'https://www.rfa.org/mandarin/shishi-hecha/story_archive/'
try:
    br.open(start_url)
except Exception as e:
    print(f"Error visiting start page: {e}")


In [168]:
# LANG = 'zh'
LANG = 'en'

In [169]:
if LANG == 'zh':
    d = 109
    rfa_website = 'rfa-mandarin'
    includeSections = '/mandarin/shishi-hecha'
elif LANG == 'en':
    d = 111
    rfa_website = 'radio-free-asia'
    includeSections = '/english/factcheck'

In [170]:
output_csv_file = f'rfa_shishi_data_{LANG}.csv'
output_json_file = f'rfa_shishi_data_{LANG}.json'

In [171]:
results = []

In [172]:
filter_param = urllib.parse.quote(
    '{content_elements{_id,credits{by{additional_properties{original{byline}},name,type,url}},description{basic},display_date,headlines{basic},label{basic{display,text,url}},owner{sponsored},promo_items{basic{_id,auth{1},type,url},lead_art{promo_items{basic{_id,auth{1},type,url}},type}},type,websites{' + rfa_website + '{website_section{_id,name},website_url}}},count,next}'
)

In [173]:
# query_obj = {
#         'feature': 'results-list',
#         'feedOffset': 0,
#         'feedSize': 100,
#         'includeSections': '/mandarin/shishi-hecha',
#         'query': 'display_date:[2024-01-01 TO 2025-12-31]'
#     }

In [174]:
# query_str = urllib.parse.quote(json.dumps(query_obj), safe='')
# api_url = (
#     f"https://www.rfa.org/pf/api/v3/content/fetch/story-feed-sections?"
#     f"query={query_str}&filter={filter_param}&d=109&_website=rfa-mandarin"
# )

In [175]:
# api_url

In [176]:
# resp = br.open(api_url)
# data = json.loads(resp.read().decode('utf-8'))

In [177]:
# elem = data.get('content_elements', [])[1]

In [178]:
elem

{'_id': 'J73WFX6QI73X24CZL374UC3CEE',
 'credits': {'by': [{'additional_properties': {'original': {}},
    'name': '作者：辛西娅',
    'type': 'author'}]},
 'description': {'basic': '11月3日，拥有644万粉丝的新浪大V “孤烟暮蝉”在其个人微博账号发布贴文，称乌克兰军队向北约组织(NATO)要求6万多升血液，以救助伤兵。北约捐赠的罐装血液送达以后，乌克兰方面抽查时发现部分血液感染了艾滋病毒、乙型和丙型肝炎病毒。乌方要求北约对血液状况进行独立评估，并要求不要在非洲大陆购买血液。贴文配有三张图片，第一张图片以英文宣称该信息来自乌克兰总理被黑客破解的电邮，第二张是貌似乌克兰语写成的公函，第三张是该公函的英文翻译。图片水印显示“mash”字样。'},
 'display_date': '2022-11-15T21:29:00Z',
 'headlines': {'basic': '事实查核  | 北约捐给乌克兰带病毒血液？乌卫生部驳斥微博大V谣言'},
 'owner': {'sponsored': False},
 'promo_items': {'basic': {'_id': 'YI2NISCEHCBNXVE3JQYZVQJ4CM',
   'auth': {'1': '5cf729e64c4302408f7a70d505f6141d38588c16d9e2a92b6e82ea2db7722011'},
   'type': 'image',
   'url': 'https://cloudfront-us-east-1.images.arcpublishing.com/radiofreeasia/YI2NISCEHCBNXVE3JQYZVQJ4CM.png'}},
 'type': 'story',
 'websites': {'rfa-mandarin': {'website_section': {'_id': '/mandarin/shishi-hecha',
    'name': '事实查核 '},
   'website_url': '/mandarin/shishi-hecha/hc-

In [179]:
# website = elem.get('websites', {}).get('rfa-mandarin', {})

In [180]:
# url = 'https://www.rfa.org' + website.get('website_url', '')
# url

In [181]:
# category = website.get('website_section', {}).get('name', '')
# category

In [182]:
# image_url = elem.get('promo_items', {}).get('basic', {}).get('url', '')
# image_url

In [190]:
results = []

In [191]:
# 逐步調整 feedOffset, 共抓取 0,100,...,500
for offset in tqdm(range(0, 600, 100), desc='Fetching pages'):
    # 建構 query 物件並 URL 編碼
    query_obj = {
        'feature': 'results-list',
        'feedOffset': offset,
        'feedSize': 100,
        'includeSections': includeSections,
        'query': 'display_date:[2024-01-01 TO 2025-12-31]'
    }
    query_str = urllib.parse.quote(json.dumps(query_obj), safe='')
    api_url = (
        f"https://www.rfa.org/pf/api/v3/content/fetch/story-feed-sections?"
        f"query={query_str}&filter={filter_param}&d={d}&_website={rfa_website}"
    )
    # print(api_url)

    try:
        resp = br.open(api_url)
        data = json.loads(resp.read().decode('utf-8'))
    except Exception as e:
        print(f"Error fetching offset {offset}: {e}")
        continue

    for elem in data.get('content_elements', []):
        # print(elem)
        try:
            # 提取作者 (去除「作者：」前綴)
            by_list = elem.get('credits', {}).get('by', [])
            authors = [b.get('name', '').replace('作者：', '').strip() for b in by_list]
            author = '、'.join(authors)

            description = elem.get('description', {}).get('basic', '').strip()
            title = elem.get('headlines', {}).get('basic', '').strip()
            image_url = elem.get('promo_items', {}).get('basic', {}).get('url', '')
            display_date = elem.get('display_date', '')
            date = display_date.split('T')[0] if 'T' in display_date else display_date
            website = elem.get('websites', {}).get(rfa_website, {})
            url = 'https://www.rfa.org' + website.get('website_url', '')
            category = website.get('website_section', {}).get('name', '').strip()

            results.append({
                'date': date,
                'category': category,
                'title': title,
                'author': author,
                'description': description,
                'url': url,
                'image_url': image_url
            })
        except Exception as e:
            print(f"Error parsing element {elem.get('_id')}: {e}")


Fetching pages: 100%|████████████████████████████████████████████████| 6/6 [00:00<00:00, 19.10it/s]


In [192]:
try:
    with open(output_json_file, 'w', encoding='utf-8') as jf:
        json.dump(results, jf, ensure_ascii=False, indent=2)
    print(f"Saved JSON to {output_json_file}")
except Exception as e:
    print(f"Error saving JSON: {e}")

Saved JSON to rfa_shishi_data_en.json


In [193]:
try:
    with open(output_csv_file, 'w', encoding='utf-8', newline='') as cf:
        writer = csv.DictWriter(
            cf,
            fieldnames=['date', 'category', 'title', 'author', 'description', 'url', 'image_url']
        )
        writer.writeheader()
        for row in results:
            writer.writerow(row)
    print(f"Saved CSV to {output_csv_file}")
except Exception as e:
    print(f"Error saving CSV: {e}")


Saved CSV to rfa_shishi_data_en.csv


SyntaxError: invalid decimal literal (1659398009.py, line 1)