In [None]:
# Initial exploration of the Wikipedia API call
''' 
    .venv should contain ipykernel
    .venv\Scripts\activate.bat  # Activate your .venv
    pip install ipykernel
    
    Once the .venv setup is complete, register the kernel locally:
    python -m ipykernel install --user --name=.venv --display-name="Python 3.13 (.venv)"
'''

In [1]:
import requests
import duckdb
import pandas as pd
from datetime import datetime
import os
import logging
from typing import Dict, List

In [4]:
# Check the API call: list pages that belong to "Category:Featured articles".
# The Action API has no `list=featured` module; an unknown list name only produces
# a warning and a response without a "query" key. Featured articles are therefore
# read through the category-membership list instead.
url = "https://en.wikipedia.org/w/api.php"

params = {
    "action": "query",
    "format": "json",
    "list": "categorymembers",
    "cmtitle": "Category:Featured articles",
    "cmnamespace": 0,  # main (article) namespace only
    "cmlimit": 50,  # adjustable
}

# requests waits indefinitely unless a timeout is given, which would hang the cell.
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()

# Last expression of the cell: displays the response object (HTTP 200 expected).
response

<Response [200]>


In [None]:
# Re-issue the request; relies on `url` and `params` defined in the API-check cell.
# The timeout bounds how long this cell may block (requests has no default timeout).
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()

In [None]:
# Display the rows returned by the list query.
# The result key is taken from params["list"] because list modules return their rows
# under a key named after the module. Using .get() yields an empty list instead of a
# KeyError when the API answered without a "query" section (e.g. warnings only).
response.json().get("query", {}).get(params["list"], [])

In [None]:
# Configure logging: INFO and above, each line prefixed with timestamp and level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by WikipediaETL.
logger = logging.getLogger(__name__)

class WikipediaETL:
    """Extract featured English-Wikipedia articles and upsert them into DuckDB.

    The instance can be used as a context manager so the database connection
    is always released::

        with WikipediaETL() as etl:
            etl.extract_and_load()

    Attributes:
        db_path: Path of the DuckDB database file.
        conn: Open DuckDB connection; released by ``close()``.
    """

    API_URL = "https://en.wikipedia.org/w/api.php"
    # Seconds to wait for the API; requests has no default timeout and would
    # otherwise block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10
    FEATURED_CATEGORY = "Category:Featured articles"
    CATEGORY_PREFIX = "Category:"
    # Column order of the `articles` table; also the order used for inserts.
    _COLUMNS = [
        "page_id",
        "title",
        "url",
        "extract",
        "categories",
        "last_updated",
        "word_count",
    ]

    def __init__(self, db_path: str = 'wikipedia.duckdb'):
        """Initialize the ETL pipeline with database connection."""
        self.db_path = db_path
        self.conn = duckdb.connect(db_path)
        self.setup_database()

    def __enter__(self):
        """Return the pipeline itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def setup_database(self):
        """Create the necessary tables if they don't exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS articles (
                page_id INTEGER PRIMARY KEY,
                title VARCHAR,
                url VARCHAR,
                extract TEXT,
                categories TEXT[],
                last_updated TIMESTAMP,
                word_count INTEGER
            )
        """)

    def _api_get(self, params: Dict) -> Dict:
        """Send one GET request to the Wikipedia API and return the decoded JSON.

        Raises:
            requests.HTTPError: on a non-2xx HTTP status.
            RuntimeError: when the JSON body carries an ``error`` entry, which
                the API can return together with HTTP 200.
        """
        response = requests.get(
            self.API_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()

        payload = response.json()
        if "error" in payload:
            raise RuntimeError(f"Wikipedia API error: {payload['error']}")
        return payload

    def get_featured_articles(self, limit: int = 50) -> List[Dict]:
        """Fetch featured articles from Wikipedia API.

        The Action API has no ``list=featured`` module, so the members of
        ``Category:Featured articles`` are listed instead.

        Args:
            limit: Maximum number of articles to request (default 50).

        Returns:
            A list of dicts as returned by the API, each carrying at least a
            ``title``; empty when the response has no matching section.
        """
        params = {
            "action": "query",
            "format": "json",
            "list": "categorymembers",
            "cmtitle": self.FEATURED_CATEGORY,
            "cmnamespace": 0,  # main (article) namespace only
            "cmlimit": limit,
        }

        payload = self._api_get(params)
        return payload.get("query", {}).get("categorymembers", [])

    def get_article_details(self, title: str) -> Dict:
        """Fetch detailed information about an article.

        Args:
            title: Article title to look up.

        Returns:
            A dict with the keys ``page_id``, ``title``, ``url``, ``extract``,
            ``categories``, ``last_updated`` and ``word_count``.

        Raises:
            KeyError: when the API returns no page with a ``pageid`` for
                ``title`` (for example a missing page).
        """
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts|categories|info",
            "exintro": True,
            "explaintext": True,
            "inprop": "url",
            "cllimit": "50"
        }

        payload = self._api_get(params)
        pages = payload.get("query", {}).get("pages", {})
        page = next(iter(pages.values()), {})
        if "pageid" not in page:
            raise KeyError(f"Wikipedia page not found: {title}")

        extract = page.get("extract", "")

        # Strip only a leading "Category:" so the rest of the name stays intact.
        prefix = self.CATEGORY_PREFIX
        categories = [
            cat["title"][len(prefix):] if cat["title"].startswith(prefix) else cat["title"]
            for cat in page.get("categories", [])
        ]

        return {
            "page_id": page["pageid"],
            "title": page["title"],
            "url": page["fullurl"],
            "extract": extract,
            "categories": categories,
            "last_updated": datetime.now(),
            "word_count": len(extract.split())
        }

    def extract_and_load(self):
        """Main ETL process.

        Fetches the featured-article list, looks up the details of every
        article and upserts the rows into the ``articles`` table. Any error is
        logged and re-raised.
        """
        try:
            # Get featured articles
            logger.info("Fetching featured articles...")
            featured_articles = self.get_featured_articles()

            # Get details for each article
            articles_data = []
            for article in featured_articles:
                logger.info(f"Processing article: {article['title']}")
                articles_data.append(self.get_article_details(article["title"]))

            # An empty list would give a DataFrame without columns, which the
            # INSERT below cannot select from.
            if not articles_data:
                logger.warning("No featured articles returned; nothing to load.")
                return

            # `df` is referenced by name inside the SQL below, so it must stay
            # a local variable of this method. Keep one row per page_id so a
            # single statement never targets the same primary key twice.
            df = pd.DataFrame(articles_data, columns=self._COLUMNS).drop_duplicates(
                subset="page_id", keep="last"
            )

            # Load into DuckDB; columns are named explicitly so the insert does
            # not depend on the DataFrame's column order.
            logger.info("Loading data into DuckDB...")
            self.conn.execute("""
                INSERT INTO articles
                    (page_id, title, url, extract, categories, last_updated, word_count)
                SELECT page_id, title, url, extract, categories, last_updated, word_count
                FROM df
                ON CONFLICT (page_id) DO UPDATE SET
                    title = EXCLUDED.title,
                    url = EXCLUDED.url,
                    extract = EXCLUDED.extract,
                    categories = EXCLUDED.categories,
                    last_updated = EXCLUDED.last_updated,
                    word_count = EXCLUDED.word_count
            """)

            logger.info("ETL process completed successfully!")

        except Exception as e:
            logger.error(f"Error during ETL process: {str(e)}")
            raise

    def close(self):
        """Close the database connection."""
        self.conn.close()

# Run the pipeline once when this code is executed as the main module.
if __name__ == "__main__":
    etl = WikipediaETL()
    try:
        etl.extract_and_load()
    finally:
        # Release the DuckDB connection even if the ETL run raised.
        etl.close()