In [None]:
# ---------------- Imports ----------------
import os
import sys
import time
from datetime import datetime
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
import yaml
from bs4 import BeautifulSoup



In [None]:
# ---------------- Config ----------------
# Load project settings from the shared YAML file. The path is relative, so it
# resolves against the notebook's current working directory.
with open("../../../config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Data root: the "data" folder under the configured paths.proj_store location.
data_folder = os.path.join(config["paths"]["proj_store"], "data")


# Directory to save HTML files (created if missing; an existing directory is left as-is)
output_dir = f"{data_folder}/raw_data/machine_collected/voa_news"
os.makedirs(output_dir, exist_ok=True)



In [None]:
# ---------------- Setup ----------------
# Inclusive range of search-result pages to scrape.
START_PAGE, END_PAGE = 1, 21

# Search URL for the quoted phrase "VOA Interview" (%22 is a URL-encoded double
# quote). The `{}` placeholder after `pi=` receives the page number.
# NOTE(review): `pp=50` is presumably the number of results per page -- confirm.
BASE_URL = (
    "https://www.voanews.com/s"
    "?k=%22VOA+Interview%22"
    "&tab=any-content"
    "&pi={}"
    "&r=any"
    "&pp=50"
)



def get_articles_from_page(page_number, timeout=30):
    """Fetch one search-results page and extract metadata for each listed article.

    Parameters
    ----------
    page_number : int
        Value substituted into the ``{}`` placeholder of ``BASE_URL``.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list of dict
        One dict per article with the keys ``title``, ``document_link``,
        ``summary``, ``date`` and ``author``. An empty list is returned when
        the page cannot be fetched (network error, timeout or non-200 status)
        or when no article entries are found in the HTML.
    """
    url = BASE_URL.format(page_number)
    try:
        # Without a timeout a single stalled connection would hang the whole scrape.
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout
        )
    except requests.exceptions.RequestException as e:
        # Handle network failures the same way as a bad status code: report
        # the problem and let the caller carry on with the next page.
        print(f"Failed to fetch page {page_number}: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page {page_number}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = []

    # Find all article containers
    for article in soup.find_all("li", class_="fui-grid__inner"):
        title_element = article.find("h4", class_="media-block__title")
        # Require an href as well as a title so the lookup below cannot raise
        # KeyError on an anchor that carries only a title attribute.
        link_element = article.find("a", title=True, href=True)
        summary_element = article.find("p", class_="perex")
        date_element = article.find("span", class_="date")
        author_element = article.find("a", class_="links__item-link")

        # Entries without both a title and a link are skipped.
        if not (title_element and link_element):
            continue

        articles.append({
            "title": title_element.get_text(strip=True),
            # urljoin resolves site-relative hrefs against the domain and
            # leaves already-absolute URLs untouched.
            "document_link": urljoin("https://www.voanews.com", link_element["href"]),
            "summary": summary_element.get_text(strip=True) if summary_element else "",
            "date": date_element.get_text(strip=True) if date_element else "Unknown Date",
            "author": author_element.get_text(strip=True) if author_element else "Unknown Author",
        })

    return articles



In [None]:
# Scrape within the specified page range
all_articles = []

for page_number in range(START_PAGE, END_PAGE + 1):
    print(f"Fetching page {page_number}...")
    articles = get_articles_from_page(page_number)

    if articles:
        all_articles.extend(articles)
    else:
        print(f"No articles found on page {page_number}. Skipping to next.")

    # Pause after every request, including failed or empty pages: an empty
    # result may itself be a symptom of rate limiting, so the next request
    # should not fire immediately.
    time.sleep(2)

# Store results. Naming the columns explicitly keeps the expected schema even
# when nothing was scraped, so later cells can still select df['title'].
df = pd.DataFrame(
    all_articles,
    columns=["title", "document_link", "summary", "date", "author"],
)

# Display the DataFrame
display(df)



In [None]:
# Keep only the rows whose title contains "VOA interview" as a whole phrase
# (case-insensitive); rows with a missing title are dropped. The index is
# renumbered from 0 for the filtered frame.
df_subset = (
    df.loc[
        lambda frame: frame['title'].str.contains(
            r'\bVOA interview\b', case=False, na=False, regex=True
        )
    ]
    .reset_index(drop=True)
)

display(df_subset)

In [None]:
def save_voa_html(row, timeout=30):
    """Download one article's HTML into ``output_dir`` and report what was saved.

    Parameters
    ----------
    row : mapping
        A metadata row (for example a DataFrame row) that provides
        ``row['document_link']``, the URL of the article to download.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    pandas.Series
        Two positional values: the saved file name and the retrieval date
        formatted as ``YYYY-MM-DD``. Both are ``None`` when the request fails.
    """
    url = row['document_link']
    try:
        # Send the same browser-like User-Agent used for the search-result
        # pages, and bound the wait so a stalled connection cannot hang the run.
        response = requests.get(
            url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout
        )
        response.raise_for_status()  # Ensure request was successful

        # Derive the file name from the URL path only, so a query string,
        # fragment or trailing slash cannot end up in (or empty out) the name.
        url_path = urlparse(url).path.rstrip("/")
        base_filename = url_path.split("/")[-1].split(".html")[0]  # Remove existing .html if present
        if not base_filename:
            # The URL has no path segment (e.g. a bare domain); use a fixed name
            # rather than writing a file called just ".html".
            base_filename = "index"
        original_filename = f"{base_filename}.html"
        file_path = os.path.join(output_dir, original_filename)

        # Save HTML content
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(response.text)

        print(f"Saved: {file_path}")

        return pd.Series([original_filename, datetime.now().strftime('%Y-%m-%d')])  # Return filename & retrieval date

    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts and HTTP error statuses all land here;
        # the row is kept with empty download fields instead of aborting.
        print(f"Error fetching {url}: {e}")
        return pd.Series([None, None])

# Download each article and record the saved file name and retrieval date.
# The rows are iterated explicitly instead of using DataFrame.apply(axis=1):
# on an empty df_subset, apply does not produce the two expected columns and
# the column assignment fails, whereas this form simply adds two empty columns.
download_columns = ['original_file_name', 'retrieved_date']
download_records = [save_voa_html(row).tolist() for _, row in df_subset.iterrows()]
df_subset[download_columns] = pd.DataFrame(
    download_records, columns=download_columns, index=df_subset.index
)

# Display updated DataFrame
display(df_subset)

# Save updated metadata
df_subset.to_csv(f'{output_dir}/metadata.csv', index=False)

