In [None]:
# ---------------- Imports ----------------
import os
import requests
import re  
import sys

from bs4 import BeautifulSoup

import pandas as pd
import yaml



In [None]:
# ---------------- Config ----------------
# Load the project configuration. The encoding is passed explicitly so the read
# does not depend on the platform's locale default; the other open() calls in
# this notebook also use UTF-8.
with open("../../../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Root of the project data tree, taken from config["paths"]["proj_store"].
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Folder holding the source .txt files plus metadata.csv; the processing loop
# reads the `original_file_name` column of this table to locate each file.
input_directory = f"{data_folder}/raw_data/machine_collected/wikinews"
metadata_df = pd.read_csv(f"{input_directory}/metadata.csv")

# Destination folder for the generated text files, created if missing.
output_directory = f"{data_folder}/intermediate_data/01_txt_files/wikinews"
os.makedirs(output_directory, exist_ok=True)

In [None]:
# ---------------- Setup ----------------
# File names that the processing loop excludes from the output. Entries are
# compared for exact equality against the `original_file_name` metadata column,
# so they must match that column character for character (including accents
# and the ".txt" suffix).
# NOTE(review): the reason each article is excluded is not recorded here —
# TODO document it.
filenames_to_skip = [
    "Brothers_Sunshine_Coast_to_join_A_grade_rugby_union_competition_on_Australia's_Sunshine_Coast.txt",
    "Grand_Finals_set_in_2021-22_Sunshine_Coast,_Australia_cricket_season.txt",
    "Lobby_groups_oppose_plans_for_EU_copyright_extension.txt",
    "Maroochydore_pick_up_first_win_in_round_5_Australia's_Sunshine_Coast_Rugby_Union_2021_season.txt",
    "Wikinews_interviews_Australian_sit_skier_Victoria_Pendergast.txt",
    "Wikinews_Interviews_Australian_wheelchair_basketball_player_Caitlin_de_Wit.txt",
    "Wikinews_interviews_Great_Britain_men's_national_wheelchair_basketball_player_Ade_Oregembe.txt",
    "Wikinews_interviews_Great_Britain_men's_national_wheelchair_basketball_player_Joni_Pollock.txt",
    "Wikinews_interviews_Spanish_para-alpine_skier_Úrsula_Pueyo.txt",
    "Wikinews_interviews_Spanish_Paralympic_swimmer_María_Delgado.txt",
    "Wynnum_defeat_Maroochydore_in_round_4_of_Australia's_Sunshine_Coast_Rugby_Union_2021_season.txt",
    "'Top_Model'_winner_Jaslene_Gonzalez_on_her_career_and_being_a_Latina_role_model.txt",
  
]



In [None]:
# Make sure the destination folder is present before any file is written
os.makedirs(output_directory, exist_ok=True)

# The last path component of the output directory is the prefix of every
# generated file name
folder_name = os.path.split(output_directory)[1]

def extract_text_from_txt(file_path):
    """Split a UTF-8 text file into its first line and the remaining body.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(first_line, remaining_text)`` tuple. Both parts have leading and
        trailing whitespace stripped; either is ``""`` when the file has no
        such content (for example an empty file or a single-line file).
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = handle.read()

    # Everything before the first newline is the head; everything after it is the body.
    head, _, body = content.partition("\n")
    return head.strip(), body.strip()

def save_article_to_file(directory, metadata, first_line, text, index):
    """Write one article, with a metadata header, to a numbered text file.

    The file is named ``<folder_name>_<index as five digits>.txt`` (using the
    module-level ``folder_name``) and is placed in ``directory``, which is
    created if it does not exist. The path is printed after the file is saved.

    Args:
        directory: Folder that receives the output file.
        metadata: Mapping whose items are written as ``key: value`` lines.
        first_line: Written as the ``published date`` line; omitted when empty.
        text: Article body written after the dialogue marker; omitted when empty.
        index: Integer used for the zero-padded file-name suffix.
    """
    os.makedirs(directory, exist_ok=True)

    # File name = folder name + five-digit index
    target_path = os.path.join(directory, f"{folder_name}_{index:05d}.txt")

    with open(target_path, "w", encoding="utf-8") as out:
        pieces = ["--- metadata ---\n"]

        # One "key: value" line per metadata entry, in the mapping's own order
        pieces.extend(f"{field}: {content}\n" for field, content in metadata.items())

        # The article's first line is recorded as its publication date
        if first_line:
            pieces.append(f"published date: {first_line}\n")

        # Empty placeholder fields, then the marker that opens the body
        pieces.append("targets: \n")
        pieces.append("interviewers: \n\n\n")
        pieces.append("--- dialogue ---\n\n")

        if text:
            pieces.append(text)

        out.writelines(pieces)

    print(f"Saved: {target_path}")

# Number of files written so far; it doubles as the index in the next output
# file name, so skipped or missing inputs leave no gaps in the numbering.
successful_count = 0

# Process each row in metadata
for _, row in metadata_df.iterrows():
    file_name = row["original_file_name"]

    # An empty CSV cell is read by pandas as NaN (a float), which os.path.join
    # rejects with a TypeError; skip such rows (and blank names) explicitly.
    if not isinstance(file_name, str) or not file_name.strip():
        print(f"Skipping row with missing file name: {file_name!r}")
        continue

    # Skip files listed in filenames_to_skip
    if file_name in filenames_to_skip:
        print(f"Skipping file: {file_name}")
        continue

    file_path = os.path.join(input_directory, file_name)

    # isfile rather than exists: a path that resolves to a directory must be
    # reported as missing instead of failing later inside open().
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        continue

    # The first line is extracted separately from the rest of the article
    first_line, article_text = extract_text_from_txt(file_path)
    save_article_to_file(output_directory, row.to_dict(), first_line, article_text, successful_count)
    successful_count += 1  # Only increment after a successful save

