In [None]:
# ---------------- Imports ----------------
import requests
import os
import urllib.parse
import sys

from datetime import datetime, timezone
from datetime import datetime

import pandas as pd
import yaml



In [None]:
# ---------------- Config ----------------
# Load the project configuration. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform instead of with the locale default.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Root data folder, taken from the `paths.proj_store` entry of the config
data_folder = os.path.join(config["paths"]["proj_store"], "data")


# Directory where the downloaded article text files and metadata are saved
save_directory = f"{data_folder}/raw_data/machine_collected/wikinews/"



In [None]:
# ---------------- Setup ----------------
# User-Agent header value sent with the API requests made below.
# NOTE(review): the contact address looks like a placeholder — replace it with
# a real contact before running against the live API (TODO confirm).
user_agent = "MyWikiNewsScraper/1.0 (myemail@example.com)"

# MediaWiki Action API endpoint of the English Wikinews site
api_url = "https://en.wikinews.org/w/api.php"

def fetch_category_articles(category, timeout=30):
    """Fetch every member of a Wikinews category via the MediaWiki API.

    Pages through ``list=categorymembers`` results until the API stops
    returning a ``continue`` block.

    Args:
        category: Category name without the ``Category:`` prefix
            (e.g. ``"Interview/video"``).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``{"title": ..., "document_link": ...}`` dicts, one per
        category member, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    articles = []
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": "500",  # Maximum limit per request
    }

    while True:
        response = requests.get(
            api_url,
            headers={"User-Agent": user_agent},
            params=params,
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()
        data = response.json()

        # The API reports logical errors in the JSON body with HTTP 200;
        # surface them instead of silently treating them as "no results".
        api_error = data.get("error")
        if api_error:
            print(f"API error for category {category!r}: {api_error.get('info', api_error)}")
            break

        for member in data.get("query", {}).get("categorymembers", []):
            title = member["title"]
            link = f"https://en.wikinews.org/wiki/{title.replace(' ', '_')}"
            articles.append({"title": title, "document_link": link})

        # Pagination: merge the whole `continue` block into the next request,
        # not just `cmcontinue`, as the API expects every returned key back.
        continuation = data.get("continue")
        if not continuation:
            break  # No more results
        params.update(continuation)

    return articles

# List of categories to process
categories = ["Interview/in-person", "Interview/video", "Interview/audio", "Interview/phone"]

# Aggregate the members of every category into one flat list of records
all_articles_with_category = []

for category in categories:
    category_articles = fetch_category_articles(category)
    all_articles_with_category.extend(category_articles)

# Retrieval date (UTC), formatted as YYYY-MM-DD
retrieved_datetime = datetime.now(timezone.utc).strftime("%Y-%m-%d")

# Build the DataFrame with explicit columns so that an empty result still has
# a `document_link` column (otherwise drop_duplicates raises KeyError), then
# drop articles listed in more than one category and renumber the rows.
df = (
    pd.DataFrame(all_articles_with_category, columns=["title", "document_link"])
    .drop_duplicates(subset=["document_link"])
    .reset_index(drop=True)
)

# Display the cleaned DataFrame
display(df)



In [None]:
# Define a User-Agent
# NOTE(review): this repeats the value assigned in the Setup cell above; the
# later assignment wins, so keep the two in sync (or drop one of them).
user_agent = "MyWikiNewsScraper/1.0 (myemail@example.com)"

# API endpoint (same value as assigned in the Setup cell above)
api_url = "https://en.wikinews.org/w/api.php"

def extract_titles_from_links(df):
    """Map each article title to its link and a filename-friendly title.

    Args:
        df: DataFrame with ``title`` and ``document_link`` columns. A frame
            without any rows (even one without columns) yields an empty dict.

    Returns:
        Dict keyed by the original ``title`` value; each value is a
        ``(link, formatted_title)`` tuple where ``formatted_title`` is the
        percent-decoded part of the link after ``wikinews.org/wiki/`` with
        spaces replaced by underscores. Rows whose link does not contain
        that marker are skipped.
    """
    titles = {}
    if df.empty:
        # Covers a column-less frame, where the column lookups below would fail
        return titles

    marker = "wikinews.org/wiki/"
    for original_title, link in zip(df["title"], df["document_link"]):
        if not isinstance(link, str):
            continue  # e.g. NaN from a missing value
        # Split at the FIRST marker only, so a title that itself contains
        # "wiki/" is kept whole instead of being cut at its last occurrence.
        _, found, raw_title = link.partition(marker)
        if not found:
            continue
        decoded_title = urllib.parse.unquote(raw_title)  # Decode special characters
        formatted_title = decoded_title.replace(" ", "_")  # For filename
        titles[original_title] = (link, formatted_title)
    return titles

def save_text_to_file(directory, formatted_title, title, timeout=30):
    """Download the plain-text extract of a Wikinews article and save it.

    Args:
        directory: Folder the text file is written to (created if missing).
        formatted_title: Title used to build the file name; ``/`` is replaced
            by ``-`` and ``.txt`` is appended.
        title: Article title passed to the API as ``titles``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(filename, retrieved_date)`` on success, where ``retrieved_date`` is
        the current UTC date as ``YYYY-MM-DD``; ``(None, None)`` when the
        request fails or the response contains no extract.
    """
    # Ensure the directory exists
    os.makedirs(directory, exist_ok=True)

    # Sanitize filename
    filename = formatted_title.replace("/", "-") + ".txt"
    file_path = os.path.join(directory, filename)

    # Current date in UTC, matching the retrieval date computed earlier in
    # the notebook (local time could differ by a day around midnight).
    retrieved_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Fetch the article content in plain text format
    params = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "titles": title,
        "explaintext": 1,  # Get plain text instead of HTML
    }

    # A single failed download must not abort the caller's loop: report it
    # and return the same (None, None) failure value as a missing extract.
    try:
        response = requests.get(
            api_url,
            headers={"User-Agent": user_agent},
            params=params,
            timeout=timeout,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Failed to retrieve: {title} ({exc})")
        return None, None

    pages = data.get("query", {}).get("pages", {})
    for page_data in pages.values():
        if "extract" in page_data:
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(page_data["extract"])
            print(f"Saved: {file_path}")
            return filename, retrieved_date

    print(f"Failed to retrieve: {title}")
    return None, None


# Extract article titles and URLs from links
article_data = extract_titles_from_links(df)

# Store metadata (one record per successfully saved article)
metadata = []

# Retrieve and save text files
for original_title, (link, formatted_title) in article_data.items():
    original_file_name, retrieved_date = save_text_to_file(save_directory, formatted_title, original_title)

    # Failed downloads return (None, None) and are left out of the metadata
    if original_file_name and retrieved_date:
        metadata.append({
            "title": original_title,  # Preserve original title from df
            "document_link": link,
            "retrieved_date": retrieved_date,
            "original_file_name": original_file_name
        })

# Save updated metadata to CSV.
# - Explicit columns keep the CSV header even when nothing was downloaded.
# - The directory is created here because it otherwise only comes into
#   existence as a side effect of a download attempt.
# - os.path.join avoids the doubled slash when save_directory ends with "/".
os.makedirs(save_directory, exist_ok=True)
metadata_df = pd.DataFrame(
    metadata,
    columns=["title", "document_link", "retrieved_date", "original_file_name"],
)
metadata_df.to_csv(os.path.join(save_directory, "metadata.csv"), index=False)

# Display metadata
display(metadata_df)

