In [5]:
import json
from pathlib import Path

# Root of the project's data directory, stated once so that relocating the
# project only requires changing this single line.
DATA_DIR = Path("C:/Users/24419222_admin/Documents/GitHub/Timetravel/data")

# Dataset files to process; each one is read from raw/ and written to
# transformed/ under the same file name.
DATASET_FILENAMES = ["test_data.json", "dev_data.json"]

# Parallel lists: input_paths[i] is transformed into output_paths[i].
# Deriving both from DATASET_FILENAMES keeps them the same length and order.
input_paths = [DATA_DIR / "raw" / filename for filename in DATASET_FILENAMES]
output_paths = [DATA_DIR / "transformed" / filename for filename in DATASET_FILENAMES]

def read_and_print_first_5_rows(file_path):
    """Print the first five records of a JSON Lines file.

    Each of the first five lines of ``file_path`` is stripped of surrounding
    whitespace, parsed with ``json.loads`` and the resulting Python object is
    printed. Files with fewer than five lines are printed in full.

    Args:
        file_path: Path of the UTF-8 encoded file to preview.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        for row_number, raw_line in enumerate(handle):
            # Stop as soon as five rows have been shown.
            if row_number >= 5:
                break
            record = json.loads(raw_line.strip())
            print(record)

def get_row_count(file_path):
    """Count the lines in a UTF-8 text file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The number of lines in the file as an ``int`` (0 for an empty file).
    """
    total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Iterate line by line so the whole file is never held in memory.
        for _line in handle:
            total += 1
    return total

def transform_data(input_path, output_path):
    """Flatten each story's edited endings into one record per ending.

    ``input_path`` is read as JSON Lines (one JSON object per line). For every
    element of a story's ``edited_endings`` list, one output record is created
    that copies the story's ``story_id``, ``premise``, ``initial``,
    ``counterfactual`` and ``original_ending`` and stores that element under
    the key ``edited_ending``. The records are written to ``output_path`` as
    JSON Lines, replacing any existing file, and a confirmation message is
    printed.

    Blank or whitespace-only input lines are skipped rather than passed to
    ``json.loads``, which would raise on them.

    Args:
        input_path: Path of the UTF-8 JSON Lines file to read.
        output_path: Path of the UTF-8 JSON Lines file to write.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a story has no ``edited_endings`` key, or a story with at
            least one edited ending lacks one of the copied keys.
    """
    # Keys copied unchanged from the source story, in output order.
    copied_keys = ("story_id", "premise", "initial", "counterfactual", "original_ending")
    transformed_stories = []

    # Read the data. The input is fully read and closed before the output
    # file is opened below.
    with open(input_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing blank line).
                continue
            story = json.loads(line)
            # For each group of edited endings, create a new story entry
            for edited_ending_group in story['edited_endings']:
                new_story = {key: story[key] for key in copied_keys}
                new_story["edited_ending"] = edited_ending_group
                transformed_stories.append(new_story)

    # Write the transformed data, one JSON object per line
    with open(output_path, 'w', encoding='utf-8') as output_file:
        for story in transformed_stories:
            output_file.write(json.dumps(story) + '\n')

    print(f"Transformed data saved to {output_path}")

# zip() stops at the end of the shorter list, so a length mismatch would
# silently skip datasets; fail loudly instead.
if len(input_paths) != len(output_paths):
    raise ValueError(
        f"input_paths has {len(input_paths)} entries but output_paths has "
        f"{len(output_paths)}; each input needs exactly one output path."
    )

# Transform every input file and print a short summary of each result
for input_path, output_path in zip(input_paths, output_paths):
    # Ensure the output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Call the function to transform the data
    transform_data(input_path, output_path)
    # Get and print row count for input and output files; the output has one
    # row per edited ending, so the two counts are expected to differ.
    input_row_count = get_row_count(input_path)
    output_row_count = get_row_count(output_path)
    print(f"Input file {input_path} has {input_row_count} rows.")
    print(f"Output file {output_path} has {output_row_count} rows.")
    # Print first 5 rows of the output file
    print(f"First 5 rows of {output_path}:")
    read_and_print_first_5_rows(output_path)
    print("\n")  # Add a new line for readability between files


Transformed data saved to C:\Users\24419222_admin\Documents\GitHub\Timetravel\data\transformed\test_data.json
Input file C:\Users\24419222_admin\Documents\GitHub\Timetravel\data\raw\test_data.json has 1871 rows.
Output file C:\Users\24419222_admin\Documents\GitHub\Timetravel\data\transformed\test_data.json has 5613 rows.
First 5 rows of C:\Users\24419222_admin\Documents\GitHub\Timetravel\data\transformed\test_data.json:
{'story_id': '42b12f6d-811e-4a0f-bd1f-5d7fdde74973', 'premise': 'The soccer game was tied 3 to 3 and there was a minute left to play.', 'initial': 'Julie had never scored a goal yet, but knew today would be her day.', 'counterfactual': 'Julie was eagerly watching the game in the stands.', 'original_ending': "Ashley passed her the ball and this was chance. She kicked as hard as she could, and the ball soared into the net. Julie's first goal won the game.", 'edited_ending': ['Ashley passed the ball and this their was chance.', 'She kicked as hard as she could, and the bal