In [None]:
# ---------------- Imports ----------------
import os
import requests
import re
import sys

from bs4 import BeautifulSoup

import pandas as pd
import yaml



In [None]:
# ---------------- Config ----------------
# Project settings are read from a YAML file addressed relative to the
# current working directory.
with open("../../../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Everything this notebook reads or writes sits below <proj_store>/data.
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Source side: the downloaded .html pages and the metadata table that
# accompanies them (metadata.csv in the same folder).
input_directory = f"{data_folder}/raw_data/machine_collected/voa_news"
metadata_df = pd.read_csv(f"{input_directory}/metadata.csv")

# Destination side: one .txt file per extracted article. The folder is
# created on first run and reused afterwards.
output_directory = f"{data_folder}/intermediate_data/01_txt_files/voa_news"
os.makedirs(output_directory, exist_ok=True)

In [None]:
# ---------------- Setup ----------------
# Files excluded from extraction. Each entry is compared against the
# `original_file_name` value of an article's metadata; a match makes the
# extractor return early without writing any output.
# NOTE(review): the reason these particular pages are excluded is not
# recorded here — confirm with the author before adding or removing entries.
titles_to_skip = [
    "112580.html",
    "114580.html",
    "268513.html",
    "2732030.html",
    "2863112.html",
    "2871518.html",
    "2985297.html",
    "3266622.html",
    "344951.html",
    "374633.html",
    "376070.html",
    "376245.html",
    "383312.html",
    "387525.html",
    "389494.html",
    "394878.html",
    "398149.html",
    "403086.html",
    "4194017.html",
    "4212372.html",
    "4322596.html",
    "4408188.html",
    "4438375.html",
    "4518257.html",
    "4525809.html",
    "542128.html",
    "550425.html",
    "552689.html",
    "6104227.html",
    "6104382.html",
    "6104700.html",
    "6106248.html",
    "6106249.html",
    "6110275.html",
    "6110570.html",
    "6110571.html",
    "6110909.html",
    "6110921.html",
    "6111199.html",
    "6112074.html",
    "6115793.html",
    "6174254.html",
    "6191106.html",
    "6191758.html",
    "6194626.html",
    "6402952.html",
    "7931365.html",
    "7024481.html",
    "7018988.html",
    "7107157.html",
    "7199021.html",
    "7288160.html",
    "7305324.html",
    "7328757.html",
    "7849048.html",
    "7904825.html",
    "2761200.html",
    "4795487.html",
    "6735421.html",
    "6827986.html",
    "6867451.html",
    "6895900.html",
    "6906731.html",
    "6913900.html",
    "6948475.html",
    "6958471.html",
    "6997667.html",
    "7031781.html",
    "7356091.html",
    "7532913.html",
    "151345.html",
    "388818.html",
    "4254944.html",
    "551592.html",
    "6432001.html",
    "6658514.html",
    "6759424.html",
    "6850939.html",
    "6906747.html",
    "6995149.html",
    "7057905.html",
    "6546950.html",
    "6877826.html",
]




In [None]:
# The last component of the input path doubles as the prefix of every
# output file name.
folder_name = os.path.split(input_directory)[1]


def extract_voa_article_from_html(file_path, metadata, success_counter):
    """Extract the text of one VOA News article and save it as a .txt file.

    The article text is taken from the ``<p>`` tags inside the page's
    ``<div class="wsw">`` element. On success the text is written to
    ``output_directory`` as ``<folder_name>_<success_counter:05d>.txt``,
    preceded by a ``--- metadata ---`` header listing every key/value pair of
    ``metadata`` and a ``--- dialogue ---`` separator.

    Args:
        file_path: Path of the HTML file to parse (read as UTF-8).
        metadata: Dict describing the article; may be empty. Its
            ``original_file_name`` entry is checked against ``titles_to_skip``.
        success_counter: Integer used to number the output file.

    Returns:
        True if a text file was written. False if the file is on the skip
        list, if no article text was found, or if any exception occurred
        (exceptions are printed, not raised, so a batch run keeps going).
    """
    try:
        # The name may be absent or a non-string (e.g. NaN when the metadata
        # row has an empty cell); treat both as "no name" instead of crashing
        # on .strip() and losing the article.
        raw_title = metadata.get("original_file_name", "")
        article_title = raw_title.strip() if isinstance(raw_title, str) else ""

        if article_title in titles_to_skip:
            print(f"Skipping '{article_title}' as it is in the skip list.")
            return False  # Skip this file

        # Read the HTML content
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        # Start from an empty text so that a page without the expected
        # container reaches the "no text" warning below rather than raising
        # UnboundLocalError.
        article_text = ""
        article_body = soup.find("div", class_="wsw")
        if article_body:
            paragraphs = [p.get_text("\n", strip=True) for p in article_body.find_all("p")]
            article_text = "\n".join(paragraphs)  # One paragraph per line

        if not article_text:
            print(f"Warning: No text found in {file_path}")
            return False  # Process not successful, do not increase counter

        # Format metadata as "key: value" lines under a header tag
        metadata_text = "--- metadata ---\n"
        metadata_text += "\n".join(f"{key}: {value}" for key, value in metadata.items())

        # Add spacing and the `--- dialogue ---` tag
        formatted_text = f"{metadata_text}\n\n--- dialogue ---\n\n{article_text}"

        # Generate filename with folder name and 5-digit counter
        filename = os.path.join(output_directory, f"{folder_name}_{success_counter:05d}.txt")

        # Save to a .txt file
        with open(filename, "w", encoding="utf-8") as file:
            file.write(formatted_text)

        print(f"Saved: {filename}")
        return True  # Successful processing

    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the batch.
        print(f"Error processing {file_path}: {e}")
        return False  # Process failed, do not increase counter

# Initialize success counter; it numbers the output files and only advances
# when an article was actually written.
success_counter = 0

# Process all HTML files in the input directory (sorted for a stable,
# reproducible numbering of the output files)
for file_name in sorted(os.listdir(input_directory)):
    if not file_name.endswith(".html"):
        continue

    file_path = os.path.join(input_directory, file_name)

    # Find the corresponding metadata entry. The file name is matched as a
    # literal string (regex=False): with the default regex matching the "."
    # in ".html" is a wildcard and can pair a file with the wrong row.
    # NOTE(review): this is still a substring test, so a short name such as
    # "4951.html" would also match a link ending in "344951.html" — confirm
    # that the file names cannot collide this way.
    matching_row = metadata_df[
        metadata_df['document_link'].str.contains(file_name, regex=False, na=False)
    ]
    # Use the first matching row; files without a match are processed with
    # empty metadata.
    metadata = matching_row.to_dict(orient="records")[0] if not matching_row.empty else {}

    # Only increase counter if processing is successful
    if extract_voa_article_from_html(file_path, metadata, success_counter):
        success_counter += 1

