In [None]:
import os
import json
import csv
import re
from datetime import datetime
from pathlib import Path


def clean_text(text):
    """
    Clean and normalize text by removing HTML tags and extra whitespace.

    Args:
        text: Value to clean. A list is joined into one string with single
            spaces between its elements (each element converted with
            ``str``), ``None`` becomes the empty string, and any other
            value is converted with ``str``.

    Returns:
        str: The text with every ``<...>`` span removed, each run of
        whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    if text is None:
        text = ''
    elif isinstance(text, list):
        text = ' '.join(str(item) for item in text)
    else:
        text = str(text)

    # DOTALL lets '.' match newlines, so a tag whose attributes wrap onto the
    # next line (e.g. '<a\nhref="...">') is removed too; without the flag such
    # tags were left in the output. Single-line input behaves as before.
    text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    # Collapse every whitespace run (spaces, tabs, newlines) into one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def process_json_files(input_directory, output_csv):
    """
    Process JSON files from a directory and extract Recap and Transcript fields to CSV.

    Every entry directly inside ``input_directory`` whose name ends with
    ``.json`` is loaded and contributes one CSV row. Both fields are passed
    through ``clean_text``; a field missing from the JSON object is written
    as ``'N/A'``. Files are processed in sorted filename order so the output
    is reproducible across runs and machines.

    Args:
        input_directory: Directory to scan for ``.json`` files (not recursive).
        output_csv: Path of the CSV file to write. It is opened in ``'w'``
            mode, so an existing file is overwritten.

    Returns:
        None. A one-line summary with the number of processed files is printed.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
        AttributeError: If a file's top-level JSON value has no ``get``
            method (i.e. it is not a JSON object).
    """
    fieldnames = ['Recap', 'Transcript']

    # List the inputs before touching the output file: a wrong directory now
    # fails without truncating an existing CSV. os.listdir returns names in
    # arbitrary order, so sort them to keep the row order deterministic.
    json_filenames = sorted(
        name for name in os.listdir(input_directory) if name.endswith('.json')
    )

    file_count = 0
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for filename in json_filenames:
            filepath = os.path.join(input_directory, filename)
            # Hold the input handle only for the duration of the load.
            with open(filepath, 'r', encoding='utf-8') as jsonfile:
                data = json.load(jsonfile)

            # Only Recap and Transcript are exported. The former Title/Date
            # extraction was never written to the CSV, and its date parsing
            # raised an uncaught TypeError whenever "Date" was not a string
            # (e.g. null), aborting the whole run for an unused value.
            writer.writerow({
                'Recap': clean_text(data.get('Recap', 'N/A')),
                'Transcript': clean_text(data.get('Transcript', 'N/A')),
            })
            file_count += 1

    print(f"Processed {file_count} JSON files from '{input_directory}' → '{output_csv}'")

In [None]:
# Build test.csv from the JSON files stored in the ForeverDreaming directory.
print("Processing test dataset...")
input_directory, output_csv = 'ForeverDreaming', 'test.csv'
process_json_files(
    input_directory,
    output_csv,
)
print(f"✓ Test dataset saved to '{output_csv}'\n")

In [None]:
# Build train.csv from the JSON files stored in the TvMegaSite directory.
print("Processing training dataset...")
input_directory, output_csv = 'TvMegaSite', 'train.csv'
process_json_files(
    input_directory,
    output_csv,
)
print(f"✓ Training dataset saved to '{output_csv}'\n")

# Final confirmation once both CSV exports have finished without raising.
print("All datasets processed successfully!")