In [None]:
# ---------------- Imports ----------------
import os
import requests
import re
import sys

import pandas as pd
import yaml

from bs4 import BeautifulSoup



In [None]:
# ---------------- Config ----------------
# Project settings are read from a YAML file addressed relative to the
# current working directory.
with open("../../../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Every data path below hangs off <paths.proj_store>/data.
data_folder = os.path.join(config["paths"]["proj_store"], "data")

# Input: the stored transcript pages and the CSV that lists them.
input_directory = "/".join(
    [data_folder, "raw_data", "machine_collected", "jsc_oral_history"]
)
updated_df = pd.read_csv("/".join([input_directory, "metadata.csv"]))

# Output: one plain-text file per transcript; the folder is created on demand.
output_directory = "/".join(
    [data_folder, "intermediate_data", "01_txt_files", "jsc_oral_history"]
)
os.makedirs(output_directory, exist_ok=True)


In [None]:
# ---------------- Setup ----------------
# Output files are named "<folder_name>_<counter>.txt": the prefix is the last
# path component of the output directory and the running counter starts at 0.
folder_name = os.path.split(output_directory)[1]
counter = 0



In [None]:
# ---------------- Extract transcripts ----------------
# For every row of the metadata CSV: parse the stored HTML page, build a
# "--- metadata ---" section (CSV columns followed by the page header), build a
# "--- dialogue ---" section (speaker turns) and write both to a numbered .txt
# file in the output directory.

DIALOGUE_MARKER = "--- dialogue ---"
SPEAKER_PREFIX = "<SPEAKER>"
END_OF_INTERVIEW_PATTERN = re.compile(r"\[\s*End\s+of\s+interview\s*\]", re.IGNORECASE)
# "Firstname X." at the start of a header fragment: a name that a <br> split
# away from the text in front of it.
NAME_WITH_INITIAL_PATTERN = re.compile(r"^[A-Z][a-z]+\s[A-Z]\.")
# Labels prepended, in order, to the last five page-header lines.
HEADER_TITLES = [
    "subcollection: ",
    "subtitle: ",
    "subtitle_interviewee: ",
    "interviewers: ",
    "location_date: ",
]


def _extract_header_lines(soup):
    """Return the <h4>/<h5> text lines of the first <tbody> that contains any.

    Each <br> inside a heading starts a new line, except that a fragment
    beginning with "Firstname X." is glued back onto the previous fragment.
    Headings whose text is empty are dropped. Returns an empty list when no
    <tbody> holds an <h4> or <h5>.

    Side effect: the <br> tags inside the visited headings are replaced in
    ``soup`` by the literal text " ||| ".
    """
    for tbody in soup.find_all("tbody"):
        headings = tbody.find_all(["h4", "h5"])
        if not headings:
            continue

        header_lines = []
        for tag in headings:
            # Turn <br> into a placeholder so the line breaks survive get_text().
            for br in tag.find_all("br"):
                br.replace_with(" ||| ")

            # strip(" |||") removes any run of spaces / pipes at either end.
            text = tag.get_text(" ", strip=True).strip(" |||")
            text = re.sub(r"\s+", " ", text).strip()

            fragments = []
            for part in text.split(" ||| "):
                part = part.strip()
                if fragments and NAME_WITH_INITIAL_PATTERN.match(part):
                    fragments[-1] += " " + part
                else:
                    fragments.append(part)

            entry = "\n".join(fragments)
            if entry.strip():
                header_lines.extend(entry.split("\n"))

        # Only the first <tbody> with headings is used.
        return header_lines

    return []


def _build_metadata_lines(row, columns, header_lines):
    """Return the lines of the "--- metadata ---" section.

    The section is the marker line, one "column: value" line per CSV column of
    ``row``, then ``header_lines``. Every "Interviewed by " (any case) is
    removed. The last five header lines get the HEADER_TITLES prefixes, but
    only when the page really supplied at least five header lines; otherwise
    nothing is labelled. On lines starting with "interviewers: ", " and " is
    replaced by ", ".
    """
    lines = ["--- metadata ---"]
    lines.extend(f"{column}: {row[column]}" for column in columns)
    lines.extend(header_lines)

    block = re.sub(r"(?i)Interviewed by ", "", "\n".join(lines))
    lines = block.strip().splitlines()

    # Header lines sit at the end of the section, so negative offsets address
    # them. Labelling is skipped when there are too few: the titles would land
    # on CSV rows instead.
    title_count = len(HEADER_TITLES)
    if len(header_lines) >= title_count and len(lines) > title_count:
        for offset, title in enumerate(HEADER_TITLES, start=-title_count):
            lines[offset] = title + lines[offset]

    return [
        line.replace(" and ", ", ") if line.startswith("interviewers: ") else line
        for line in lines
    ]


def _extract_body_text(soup):
    """Return the page text with speaker labels tagged, cut at "[End of interview]".

    Every <strong> whose text ends with ":" has its content replaced by
    "<SPEAKER>name</SPEAKER>" (colon removed); <a> tags are unwrapped. Text
    nodes are joined with newlines. Mutates ``soup``.
    """
    for strong_tag in soup.find_all("strong"):
        label = strong_tag.text.strip()
        if label.endswith(":"):
            strong_tag.string = f"<SPEAKER>{label.rstrip(':')}</SPEAKER>"

    for a_tag in soup.find_all("a"):
        a_tag.unwrap()

    text = soup.get_text(separator="\n", strip=True)
    match = END_OF_INTERVIEW_PATTERN.search(text)
    return text[: match.start()] if match else text


def _strip_leading_colons(text):
    """Remove a ":" plus the whitespace after it from the start of each line."""
    return re.sub(r"^:\s+", "", text, flags=re.MULTILINE)


def _format_dialogue(body_lines):
    """Return the formatted lines that follow the "--- dialogue ---" marker.

    * blank input lines are dropped;
    * every "<SPEAKER>" line is preceded by one empty line;
    * a line that starts with a space is appended to the previous line;
    * an utterance line that is followed by another utterance line (not by a
      speaker tag and not by the end of the text) gets the four literal
      characters backslash-n-backslash-n appended.
    """
    turns = []
    for line in body_lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith(SPEAKER_PREFIX):
            turns.append("")
            turns.append(stripped)
        elif line.startswith(" ") and turns:
            turns[-1] += " " + stripped
        else:
            turns.append(stripped)

    # Walk backwards so the next non-empty line is always known without a
    # nested look-ahead loop.
    marked = [""] * len(turns)
    following = ""
    for pos in range(len(turns) - 1, -1, -1):
        line = turns[pos]
        is_utterance = bool(line) and not line.startswith(SPEAKER_PREFIX)
        utterance_follows = bool(following) and not following.startswith(SPEAKER_PREFIX)
        marked[pos] = f"{line}\\n\\n" if is_utterance and utterance_follows else line
        if line:
            following = line
    return marked


# Restart the numbering on every run of this cell, so that re-running it
# overwrites the previous output instead of adding a second, differently
# numbered copy of every transcript.
counter = 0

for index, row in updated_df.iterrows():

    original_filename = row['original_file_name']

    # pandas reports a missing CSV value as NaN (a float); skip such rows
    # instead of letting os.path.splitext abort the whole run.
    if not isinstance(original_filename, str):
        print(f"Row {index}: no usable original_file_name ({original_filename!r}). Skipping...")
        continue

    # The CSV lists ".htm" names, the stored copies use ".html".
    base_filename = os.path.splitext(original_filename)[0]
    stored_filename = f"{base_filename}.html"
    html_file_path = os.path.join(input_directory, stored_filename)

    try:
        with open(html_file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        # ---- metadata section ----
        header_lines = _extract_header_lines(soup)
        if len(header_lines) < len(HEADER_TITLES):
            print(
                f"Warning: {html_file_path} has {len(header_lines)} header line(s), "
                f"expected at least {len(HEADER_TITLES)}; header left unlabelled."
            )
        metadata_lines = _build_metadata_lines(row, updated_df.columns, header_lines)

        # ---- dialogue section ----
        body_lines = _extract_body_text(soup).split("\n")
        first_speaker = next(
            (pos for pos, line in enumerate(body_lines) if line.startswith(SPEAKER_PREFIX)),
            None,
        )

        if first_speaker is None:
            # No speaker turn at all: keep the page text below the metadata,
            # unformatted and without a dialogue marker.
            print(f"Warning: no speaker tags found in {html_file_path}; text kept without a dialogue section.")
            body = _strip_leading_colons("\n".join(body_lines)).strip()
            output_lines = list(metadata_lines)
            if body:
                output_lines += [""] + body.splitlines()
        else:
            # Everything before the first speaker turn is page chrome and is dropped.
            body = _strip_leading_colons("\n".join(body_lines[first_speaker:]))
            output_lines = (
                metadata_lines
                + ["", DIALOGUE_MARKER]
                + _format_dialogue(body.splitlines())
            )

        formatted_text = "\n".join(output_lines)

        # Folder name plus a five-digit running number.
        new_filename = f"{folder_name}_{counter:05d}.txt"
        counter += 1

        filename = os.path.join(output_directory, new_filename)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(formatted_text)

        print(f"Saved text from {original_filename} to {filename}")

    except FileNotFoundError:
        print(f"File not found: {html_file_path}. Skipping...")
    except Exception as e:
        # Best effort: report the failure and carry on with the next transcript.
        print(f"Unexpected error processing {html_file_path}: {e}")

