In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime, timedelta

# Request headers sent with every HTTP call. The User-Agent is a desktop
# Chrome-on-Windows string so requests look like they come from a browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
}

# Parameters
start_date = datetime(2019, 10, 1)   # oldest archive day to scrape (inclusive)
end_date = datetime(2025, 10, 1)     # newest archive day to scrape (inclusive)
output_file = "bitcoin-articles-6y.csv"
max_articles_per_day = 30            # cap on matching headlines kept per day

# Headline filter terms. Entries are lower-cased so they can be compared
# against a lower-cased title, and de-duplicated (order preserved) so a term
# listed twice is not tested twice per headline.
keywords = list(dict.fromkeys(kw.lower() for kw in [
    # Geopolitics / macro
    "war", "conflict", "sanctions", "energy crisis", "opec", "russia", "ukraine",
    "china crackdown", "trade war", "tariffs", "global uncertainty", "bank collapse",
    # Market moves and sentiment
    "price surge", "price drop", "market crash", "bull market", "bear market",
    "volatility", "rally", "sell-off", "liquidity", "risk sentiment",
    "asset bubble", "market correction", "trading volume", "institutional investors",
    # Crypto assets and infrastructure
    "bitcoin", "btc", "crypto", "cryptocurrency", "digital currency",
    "ethereum", "altcoin", "stablecoin", "crypto market",
    "crypto exchange", "blockchain", "decentralized finance",
    # Institutions, regulation and funding
    "blackrock", "goldman sachs", "jpmorgan", "morgan stanley",
    "spot etf", "bitcoin etf", "etf approval", "sec", "coinbase",
    "binance", "venture capital", "startup funding",
]))

# Create the output CSV with its header row unless a usable file already exists.
# Only the header is read (nrows=0), so a large existing file is not loaded
# into memory just to test for its presence. A zero-byte file (e.g. left by an
# interrupted run) makes pandas raise EmptyDataError; it is re-initialised with
# the header instead of crashing the script.
try:
    pd.read_csv(output_file, nrows=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    pd.DataFrame(columns=["date", "title"]).to_csv(output_file, index=False)

# Main loop
MAX_PAGES_PER_DAY = 35    # upper bound on archive pages requested for one day
MAX_FETCH_ATTEMPTS = 3    # tries per page before the rest of the day is abandoned
ARCHIVE_URL = "https://markets.businessinsider.com/news/archive/{date_path}?p={page}"


def _fetch_archive_page(session, url):
    """Download one archive page.

    Returns the response body as text, or None when the server answers 404
    (treated as "no such page", i.e. the end of that day's archive).

    Any other error status, timeout or connection failure is retried with a
    short randomised back-off; once MAX_FETCH_ATTEMPTS is exhausted the last
    requests.RequestException is re-raised to the caller.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            response = session.get(url, headers=headers, timeout=7)
            if response.status_code == 404:
                return None
            # Without this, a 429/5xx error body would be parsed as if it
            # were an archive page and silently end the day with no rows.
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            if attempt == MAX_FETCH_ATTEMPTS:
                raise
            time.sleep(attempt * random.uniform(1.0, 2.0))


def _extract_row_titles(html):
    """Return one entry per archive table row.

    Each entry is the stripped ``title`` attribute of the first ``<a>`` in the
    row that has one, or None when the row contains no such link. An empty
    list means the page has no archive table rows at all.
    """
    soup = BeautifulSoup(html, 'html.parser')
    titles = []
    for row in soup.select("table.table.table-small.no-margin-bottom tr"):
        tag = row.find('a', title=True)
        titles.append(tag['title'].strip() if tag else None)
    return titles


def _scrape_day(session, day):
    """Collect up to max_articles_per_day keyword-matching headlines for *day*.

    Pages are requested in order until a page is missing (404), contains no
    table rows, cannot be fetched after retries, the per-day article cap is
    reached, or MAX_PAGES_PER_DAY pages have been read. Headlines gathered
    before a fetch failure are still returned.

    Returns a list of {"date": "YYYY-MM-DD", "title": str} dicts.
    """
    date_str = day.strftime("%Y-%m-%d")
    # The archive URL uses non-zero-padded month and day numbers.
    date_path = f"{day.year}/{day.month}/{day.day}"
    matches = []

    for page in range(1, MAX_PAGES_PER_DAY + 1):
        url = ARCHIVE_URL.format(date_path=date_path, page=page)
        try:
            html = _fetch_archive_page(session, url)
        except requests.RequestException as e:
            print(f"❌ Error fetching {url}: {e}")
            break

        if html is None:
            break

        row_titles = _extract_row_titles(html)
        if not row_titles:
            break

        for title in row_titles:
            if title is None:
                continue
            lowered = title.lower()  # lower-case once, not once per keyword
            # NOTE(review): this is plain substring matching, so short terms
            # also hit inside longer words ("war" in "software", "sec" in
            # "second"). Tightening it would change which rows are collected;
            # confirm the intended behaviour before switching to word matching.
            if any(kw in lowered for kw in keywords):
                matches.append({"date": date_str, "title": title})
                if len(matches) >= max_articles_per_day:
                    return matches

        time.sleep(random.uniform(0.15, 0.3))  # Faster but still polite

    return matches


current_date = end_date
days_collected = 0

# One Session for the whole run so the connection to the host is reused
# instead of being re-established for every page.
with requests.Session() as session:
    # Walk backwards one day at a time, from end_date down to start_date.
    while current_date >= start_date:
        date_str = current_date.strftime("%Y-%m-%d")
        print(f" Scraping {date_str}...")

        filtered_articles = _scrape_day(session, current_date)

        if filtered_articles:
            # Append without a header: the header row is expected to be in
            # the file already.
            pd.DataFrame(filtered_articles, columns=["date", "title"]).to_csv(
                output_file, mode='a', header=False, index=False
            )
            print(f"✅ Saved {len(filtered_articles)} articles for {date_str}")
            days_collected += 1

        current_date -= timedelta(days=1)

print(f"Finished. Collected data for {days_collected} days.")

 Scraping 2025-10-01...
✅ Saved 30 articles for 2025-10-01
 Scraping 2025-09-30...
✅ Saved 30 articles for 2025-09-30
 Scraping 2025-09-29...
✅ Saved 30 articles for 2025-09-29
 Scraping 2025-09-28...
✅ Saved 30 articles for 2025-09-28
 Scraping 2025-09-27...
✅ Saved 30 articles for 2025-09-27
 Scraping 2025-09-26...
✅ Saved 30 articles for 2025-09-26
 Scraping 2025-09-25...
✅ Saved 30 articles for 2025-09-25
 Scraping 2025-09-24...
✅ Saved 30 articles for 2025-09-24
 Scraping 2025-09-23...
✅ Saved 30 articles for 2025-09-23
 Scraping 2025-09-22...
✅ Saved 30 articles for 2025-09-22
 Scraping 2025-09-21...
❌ Error fetching https://markets.businessinsider.com/news/archive/2025/9/21?p=2: HTTPSConnectionPool(host='markets.businessinsider.com', port=443): Read timed out. (read timeout=7)
✅ Saved 10 articles for 2025-09-21
 Scraping 2025-09-20...
✅ Saved 30 articles for 2025-09-20
 Scraping 2025-09-19...
✅ Saved 30 articles for 2025-09-19
 Scraping 2025-09-18...
✅ Saved 30 articles for 2025