# In [None]:  (notebook cell prompt left over from export)
#!/usr/bin/env python3
"""
scraper.py

Web scraper for the Belief Spiral – Conspiracy Recommender Engine project.
Collects text snippets from:
  - Reddit (r/conspiracy)
  - Wikipedia (List of conspiracy theories)
  - Gab (public posts)
  - Satirical sites (e.g., The Onion)

Requirements:
  pip install praw requests beautifulsoup4 python-dotenv

Usage:
  1. Create a .env file with:
       REDDIT_CLIENT_ID=your_client_id
       REDDIT_CLIENT_SECRET=your_client_secret
       REDDIT_USER_AGENT=BeliefSpiralScraper/0.1
  2. Run: python scraper.py
  3. Output: conspiracy_snippets.json
"""

import os
import time
import json
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# praw is the only dependency imported behind a guard so that a missing
# install produces an actionable message instead of a bare ModuleNotFoundError.
try:
    import praw
except ImportError as exc:
    # Chain explicitly so the traceback marks the original import failure as
    # the direct cause of this friendlier error.
    raise ImportError("Please install praw: pip install praw") from exc

# Read any .env file first so the variables below can come from it.
load_dotenv()

# Client id and secret have no fallback and are None when unset; only the
# user agent has a default value.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "BeliefSpiralScraper/0.1")

# Module-level PRAW client shared by scrape_reddit().
reddit = praw.Reddit(
    user_agent=REDDIT_USER_AGENT,
    client_secret=REDDIT_CLIENT_SECRET,
    client_id=REDDIT_CLIENT_ID,
)

def scrape_reddit(subreddit_name: str, limit: int = 500):
    """Collect text from the "hot" listing of a subreddit.

    For each submission, in listing order, the result contains its title,
    then its self-text (only when non-empty), then the body of every comment
    in the flattened comment tree.

    Args:
        subreddit_name: Subreddit to read, without the ``r/`` prefix.
        limit: Maximum number of submissions requested from the hot listing.

    Returns:
        A list of strings; duplicates are not removed here.
    """
    collected = []
    hot_posts = reddit.subreddit(subreddit_name).hot(limit=limit)
    for post in hot_posts:
        collected.append(post.title)
        body = post.selftext
        if body:
            collected.append(body)

        # limit=0: discard the "load more comments" placeholders rather than
        # fetching them (see PRAW's replace_more documentation).
        post.comments.replace_more(limit=0)
        collected.extend(reply.body for reply in post.comments.list())

        # Pause between submissions to go easy on Reddit.
        time.sleep(0.5)
    return collected

def scrape_wikipedia(
    url: str = "https://en.wikipedia.org/wiki/List_of_conspiracy_theories",
    *,
    timeout: float = 10.0,
) -> list:
    """Scrape bullet-list items from a Wikipedia page.

    Only ``<li>`` elements whose parent ``<ul>`` is a direct child of the
    ``.mw-parser-output`` container are considered, and only items whose text
    is longer than 60 characters are kept (presumably to drop short
    navigation / "see also" entries -- confirm against the page layout).

    Args:
        url: Page to fetch.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send data before giving up. Without it the call can block forever.

    Returns:
        A list of the retained list-item texts, in document order.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx response (via ``raise_for_status``).
    """
    # Identify the client instead of sending the default python-requests
    # agent, as the Wikimedia User-Agent policy asks of automated clients.
    headers = {"User-Agent": "BeliefSpiralScraper/0.1"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    texts = (
        li.get_text(" ", strip=True)
        for li in soup.select(".mw-parser-output > ul > li")
    )
    return [text for text in texts if len(text) > 60]

def scrape_gab(
    topic_url: str = "https://gab.com/explore",
    limit: int = 300,
    *,
    timeout: float = 10.0,
) -> list:
    """Scrape post text from a Gab page.

    Posts are taken from ``<p class="post__content">`` elements found in the
    HTML returned by a single GET request.
    NOTE(review): if the page renders its posts client-side, this selector
    will match nothing in the raw HTML -- verify against a live response.

    Args:
        topic_url: Page to fetch.
        limit: Maximum number of matching elements to inspect; empty ones are
            skipped, so fewer than ``limit`` snippets may be returned.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send data before giving up. Without it the call can block forever.

    Returns:
        A list of non-empty post texts, in document order.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx response (via ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(topic_url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    texts = (
        p.get_text(" ", strip=True)
        for p in soup.select("p.post__content")[:limit]
    )
    return [txt for txt in texts if txt]

def scrape_satire(
    source_url: str = "https://www.theonion.com/tag/conspiracy",
    limit: int = 100,
    *,
    timeout: float = 10.0,
) -> list:
    """Scrape satirical paragraphs to balance tone.

    Paragraphs are taken from ``<p>`` elements nested inside
    ``<div class="js_post-content">`` in the HTML of a single GET request.

    Args:
        source_url: Page to fetch.
        limit: Maximum number of matching paragraphs to inspect; empty ones
            are skipped, so fewer than ``limit`` snippets may be returned.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send data before giving up. Without it the call can block forever.

    Returns:
        A list of non-empty paragraph texts, in document order.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx response (via ``raise_for_status``).
    """
    # Same browser-like agent scrape_gab sends; some sites may reject the
    # default python-requests User-Agent.
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(source_url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    texts = (
        p.get_text(" ", strip=True)
        for p in soup.select("div.js_post-content p")[:limit]
    )
    return [text for text in texts if text]

def main():
    """Run every scraper, de-duplicate the snippets, and save them as JSON.

    Reddit is scraped first; an error there propagates to the caller. The
    three HTML sources are then attempted independently: a
    ``requests.RequestException`` raised by one of them is reported and that
    source is skipped, so snippets already collected are still written out.

    The result is written to ``conspiracy_snippets.json`` (relative to the
    current working directory) as a JSON array of unique strings in
    first-seen order.
    """
    print("Scraping Reddit...")
    snippets = list(scrape_reddit("conspiracy", limit=500))

    # (progress message, scraper, keyword arguments) for each HTML source.
    html_jobs = (
        ("Scraping Wikipedia...", scrape_wikipedia, {}),
        ("Scraping Gab...", scrape_gab, {"limit": 300}),
        ("Scraping satire...", scrape_satire, {"limit": 200}),
    )
    for banner, scraper, kwargs in html_jobs:
        print(banner)
        try:
            snippets.extend(scraper(**kwargs))
        except requests.RequestException as exc:
            # One unreachable or blocking site should not throw away the
            # (slow) Reddit scrape that already succeeded.
            print(f"  Skipping source after request failure: {exc}")

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output file is identical across runs for identical input (a set's
    # iteration order for strings varies between interpreter runs).
    combined = list(dict.fromkeys(snippets))
    print(f"Total unique snippets: {len(combined)}")

    with open("conspiracy_snippets.json", "w", encoding="utf-8") as f:
        json.dump(combined, f, ensure_ascii=False, indent=2)
    print("Saved to conspiracy_snippets.json")

if __name__ == "__main__":
    main()

# pip3 install praw requests beautifulsoup4 python-dotenv