In [20]:
import pandas as pd
URL_CSV = "../data/GVFC_extension_multimodal.csv"

# Read just the id, article_url and headline columns from the CSV.
df_urls = pd.read_csv(
    URL_CSV,
    usecols=["id", "article_url", "headline"],
)

# One dict per CSV row, keyed by column name.
URLS = df_urls.to_dict(orient="records")


In [21]:
import aiohttp, asyncio, async_timeout, random, os, aiofiles, trafilatura
from tqdm.asyncio import tqdm_asyncio

# User-Agent sent with every request. The value is kept pure ASCII: the
# previous literal contained U+2011 (non-breaking hyphen), which puts
# non-ASCII bytes into an HTTP header value.
# TODO: replace the "your-email" placeholder with a real contact address.
UA = "Mozilla/5.0 (ResearchProject/2.0 +https://your-email)"
HEADERS = {"User-Agent": UA}

# Output path templates; the "{id}" field is filled in with str.format(id=...).
SAVE_HTML = "../data/raw_html/{id}.html"
SAVE_TXT  = "../data/full_text/{id}.txt"

# Path of the CSV that receives the per-URL scrape log.
LOG_CSV   = "../data/scrape_log.csv"

In [22]:
async def fetch_html(session, url, timeout=15):
    """Request ``url`` and return its body together with a status marker.

    Parameters
    ----------
    session
        Client session object; only ``session.get(url, headers=...)`` is used.
    url
        Address to request.
    timeout
        Time budget in seconds covering both the request and reading the body.

    Returns
    -------
    tuple
        ``(text, 200)`` when the server answers 200,
        ``(None, <int status>)`` for any other HTTP status, and
        ``(None, <str reason>)`` when an ``Exception`` subclass is raised
        (the exception is caught, printed, and not re-raised).
    """
    try:
        async with async_timeout.timeout(timeout):
            async with session.get(url, headers=HEADERS) as r:
                if r.status != 200:
                    return None, r.status
                return await r.text(), r.status
    except Exception as e:
        # Some exceptions (notably timeouts) carry no message, so str(e) alone
        # would produce an empty, useless status. Always include the class name.
        detail = str(e)
        reason = f"{type(e).__name__}: {detail}" if detail else type(e).__name__
        print(f"Error fetching {url}: {reason}")
        return None, reason

def extract_text(html, url):
    """Extract the main article text from ``html``.

    trafilatura is tried first. If it returns nothing, or a text of 100
    whitespace-separated words or fewer, the same HTML is parsed with
    newspaper's ``Article`` instead.

    Returns the extracted text when it is longer than 100 words,
    otherwise ``None``.
    """
    primary = trafilatura.extract(html, url=url, include_comments=False)
    if primary and len(primary.split()) > 100:
        return primary

    # Fallback extractor; imported only when the first attempt was not enough.
    from newspaper import Article

    article = Article(url)
    article.set_html(html)
    article.parse()

    if len(article.text.split()) > 100:
        return article.text
    return None


In [23]:
async def scrape_article(session, rec):
    """Fetch one article, store its raw HTML and extracted text on disk.

    Parameters
    ----------
    session
        Client session passed through to ``fetch_html``.
    rec
        Mapping with at least the keys ``"id"`` and ``"article_url"``.

    Returns
    -------
    ``"cached"``
        The text file for this id already exists; nothing is fetched.
    ``("fail_html", status)``
        No usable HTML was obtained; ``status`` is whatever ``fetch_html``
        reported.
    ``("fail_extract", status)``
        HTML was saved but ``extract_text`` returned nothing.
    ``"ok"``
        Both the HTML and the extracted text were written.
    """
    art_id, url = rec["id"], rec["article_url"]
    txt_path = SAVE_TXT.format(id=art_id)
    if os.path.exists(txt_path):           # already scraped
        return "cached"

    html, status = await fetch_html(session, url)
    if not html or status != 200:
        print("failed to fetch", url, status)
        return ("fail_html", status)

    # Keep a copy of the raw HTML. The target directory is created on demand
    # so a fresh checkout does not fail, and the encoding is pinned to UTF-8
    # so the write does not depend on the platform's default locale.
    html_path = SAVE_HTML.format(id=art_id)
    os.makedirs(os.path.dirname(html_path) or ".", exist_ok=True)
    async with aiofiles.open(html_path, "w", encoding="utf-8") as f:
        await f.write(html)

    text = extract_text(html, url)
    if not text:
        return ("fail_extract", status)

    os.makedirs(os.path.dirname(txt_path) or ".", exist_ok=True)
    async with aiofiles.open(txt_path, "w", encoding="utf-8") as f:
        await f.write(text)
    return "ok"

async def scrape_all(url_records):
    """Scrape every record sequentially inside one shared client session.

    Parameters
    ----------
    url_records
        Sized sequence of mappings, each with ``"id"`` and ``"article_url"``.

    Returns
    -------
    list of tuple
        One ``(id, article_url, result)`` entry per record, in input order,
        where ``result`` is whatever ``scrape_article`` returned.
    """
    async with aiohttp.ClientSession() as session:
        results = []
        for rec in tqdm_asyncio(url_records, total=len(url_records)):
            res = await scrape_article(session, rec)
            results.append((rec["id"], rec["article_url"], res))
            # Politeness gap of 0.5-1.5 s between requests. A "cached" result
            # means no request was sent, so waiting would only slow re-runs.
            if res != "cached":
                await asyncio.sleep(random.uniform(0.5, 1.5))
        return results

scrape_results = await scrape_all(URLS[:10])
pd.DataFrame(scrape_results,
             columns=["id", "url", "status"]
            ).to_csv(LOG_CSV, index=False)


  0%|          | 0/10 [00:00<?, ?it/s]

failed to fetch http://beta.latimes.com/nation/la-na-paddock-cremated-20180118-story.html 400


 10%|█         | 1/10 [00:01<00:15,  1.75s/it]

failed to fetch https://www.yahoo.com/news/florida-shooter-troubled-loner-white-supremacist-ties-213321497.html 404


 20%|██        | 2/10 [00:03<00:15,  1.88s/it]

failed to fetch http://www.chicagotribune.com/suburbs/lake-county-news-sun/news/ct-lns-vernon-hills-alleged-supremacy-weapons-st-0519-story.html 404


 30%|███       | 3/10 [00:05<00:13,  1.86s/it]

failed to fetch http://www.chicagotribune.com/suburbs/post-tribune/news/ct-ptb-griffith-charges-high-school-gun-st-0223-20180222-story.html 404


 50%|█████     | 5/10 [00:09<00:10,  2.09s/it]

failed to fetch https://nz.news.yahoo.com/attorneys-mexican-mans-us-charges-vindictive-172057732.html 404


 90%|█████████ | 9/10 [00:19<00:02,  2.70s/it]

failed to fetch http://www.chicagotribune.com/ct-florida-school-shooter-nikolas-cruz-20180217-story.html 404


100%|██████████| 10/10 [00:20<00:00,  2.02s/it]
