# English Text Preprocessing

## Output Format:
For each text, we create a CSV with columns:
  - story_number
  - title
  - text (the story)
  - moral (if present)

In [4]:
import re
import pandas as pd
import os

# Create output directory
os.makedirs('processedDataEnglish', exist_ok=True)

## Step 1: Load the Raw Text

In [5]:
# Read the file
with open('dataEnglish/AesopsFables.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

print(f"File loaded: {len(raw_text)} characters")
print(f"First 500 characters:")
print(raw_text[:500])

File loaded: 198696 characters
First 500 characters:
AESOP'S FABLES




GRAPES
THE FOX AND THE GRAPES

THE FOX AND THE GRAPES
A hungry Fox saw some fine bunches of Grapes hanging from a vine that was trained along a high trellis, and did his best to reach them by jumping as high as he could into the air. But it was all in vain, for they were just out of reach: so he gave up trying, and walked away with an air of dignity and unconcern, remarking, "I thought those Grapes were ripe, but I see now they are quite sour."





THE GOOSE THAT LAID THE GOL


## Step 2: Parse into Individual Fables

In [6]:
def parse_aesops_fables(text):
    """
    Parse Aesop's Fables text into structured format.
    
    Returns: list of dicts with {title, text, moral}
    """
    lines = text.split('\n')
    
    fables = []
    current_title = None
    current_text = []
    current_moral = None
    
    # Skip header
    start_idx = 0
    for i, line in enumerate(lines):
        if "AESOP'S FABLES" in line:
            start_idx = i + 1
            break
    
    i = start_idx
    while i < len(lines):
        line = lines[i].strip()
        
        # Skip empty lines
        if not line:
            i += 1
            continue
        
        # Check if this is a title (ALL CAPS and contains multiple words)
        if line.isupper() and len(line.split()) >= 2:
            # Save previous fable if exists
            if current_title and current_text:
                fables.append({
                    'title': current_title,
                    'text': ' '.join(current_text),
                    'moral': current_moral
                })
            
            # Start new fable
            current_title = line
            current_text = []
            current_moral = None
        
        # Regular text line (part of story)
        elif current_title:
            current_text.append(line)
        
        i += 1
    
    # Save last fable
    if current_title and current_text:
        fables.append({
            'title': current_title,
            'text': ' '.join(current_text),
            'moral': current_moral
        })
    
    return fables

# Parse the fables
fables = parse_aesops_fables(raw_text)
print(f"\nParsed {len(fables)} fables")
print(f"\nFirst fable:")
print(f"Title: {fables[0]['title']}")
print(f"Text: {fables[0]['text'][:200]}...")
print(f"Moral: {fables[0]['moral']}")


Parsed 284 fables

First fable:
Title: THE FOX AND THE GRAPES
Text: A hungry Fox saw some fine bunches of Grapes hanging from a vine that was trained along a high trellis, and did his best to reach them by jumping as high as he could into the air. But it was all in va...
Moral: None


## Step 3: Extract Morals

Many fables end with a moral. These are often:
- Short sentences (< 100 characters)
- Come after the main story
- Sometimes italicized or set apart

We'll use heuristics to extract them.

In [7]:
def extract_moral(text):
    """
    Try to extract the moral from the end of the fable.
    
    Heuristic: Last sentence if it's short and sounds like a moral.
    """
    sentences = text.split('. ')
    if len(sentences) < 2:
        return None, text
    
    last_sentence = sentences[-1].strip()
    
    # Check if last sentence is likely a moral:
    # - Relatively short (< 150 chars)
    # - Doesn't start with character names (capital letters followed by lowercase)
    # - Contains wisdom keywords
    
    moral_keywords = ['always', 'never', 'often', 'wise', 'better', 'beware', 
                     'fool', 'should', 'must', 'learns', 'loses', 'gains']
    
    if (len(last_sentence) < 150 and 
        any(keyword in last_sentence.lower() for keyword in moral_keywords)):
        # This is likely a moral
        story_text = '. '.join(sentences[:-1])
        return last_sentence, story_text
    
    # No moral detected
    return None, text

# Extract morals from all fables
for fable in fables:
    moral, story = extract_moral(fable['text'])
    fable['moral'] = moral
    fable['text'] = story

# Count fables with morals
fables_with_morals = sum(1 for f in fables if f['moral'])
print(f"Fables with extracted morals: {fables_with_morals}/{len(fables)}")

# Show examples
print("\nExamples with morals:")
for i, fable in enumerate(fables[:5]):
    if fable['moral']:
        print(f"\n{i+1}. {fable['title']}")
        print(f"   Moral: {fable['moral']}")

Fables with extracted morals: 25/284

Examples with morals:

2. THE GOOSE THAT LAID THE GOLDEN EGGS
   Moral: Much wants more and loses all.


## Step 4: Clean and Normalize Text


In [8]:
def clean_text(text):
    """
    Clean English text.
    """
    if not text:
        return text
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Fix punctuation spacing
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([A-Za-z])', r'\1 \2', text)
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    return text

# Clean all fables
for fable in fables:
    fable['title'] = clean_text(fable['title'])
    fable['text'] = clean_text(fable['text'])
    if fable['moral']:
        fable['moral'] = clean_text(fable['moral'])

print("Text cleaned")

Text cleaned


## Step 5: Create Structured DataFrame

In [9]:
# Create DataFrame
df = pd.DataFrame(fables)
df.insert(0, 'fable_number', range(1, len(df) + 1))

print(f"DataFrame created: {len(df)} fables")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

DataFrame created: 284 fables

Columns: ['fable_number', 'title', 'text', 'moral']

First few rows:
   fable_number                                title  \
0             1               THE FOX AND THE GRAPES   
1             2  THE GOOSE THAT LAID THE GOLDEN EGGS   
2             3                 THE CAT AND THE MICE   
3             4                  THE MISCHIEVOUS DOG   
4             5   THE CHARCOAL-BURNER AND THE FULLER   

                                                text  \
0  A hungry Fox saw some fine bunches of Grapes h...   
1  A Man and his Wife had the good fortune to pos...   
2  There was once a house that was overrun with M...   
3  There was once a Dog who used to snap at peopl...   
4  There was once a Charcoal-burner who lived and...   

                            moral  
0                            None  
1  Much wants more and loses all.  
2                            None  
3                            None  
4                            None  


In [12]:
print(f"\nTotal fables: {len(df)}")

print(f"\nText length statistics:")
df['text_length'] = df['text'].str.len()
print(f"  Mean: {df['text_length'].mean():.0f} characters")
print(f"  Median: {df['text_length'].median():.0f} characters")
print(f"  Min: {df['text_length'].min():.0f} characters")
print(f"  Max: {df['text_length'].max():.0f} characters")

print(f"\nSample fables:")
for i in [0, len(df)//2, len(df)-1]:
    print(f"\n{df.iloc[i]['fable_number']}. {df.iloc[i]['title']}")
    print(f"   Length: {df.iloc[i]['text_length']} chars")


Total fables: 284

Text length statistics:
  Mean: 655 characters
  Median: 592 characters
  Min: 180 characters
  Max: 2520 characters

Sample fables:

1. THE FOX AND THE GRAPES
   Length: 394 chars

143. THE WOLF, THE FOX, AND THE APE
   Length: 407 chars

284. THE TRAVELLER AND FORTUNE
   Length: 417 chars


## Step 7: Save to CSV

In [13]:
# Drop the text_length column (just for statistics)
df_clean = df.drop(columns=['text_length'])

# Save to CSV
output_path = 'processedDataEnglish/aesops_fables_cleaned.csv'
df_clean.to_csv(output_path, index=False, encoding='utf-8')

print(f"Saved to {output_path}")
print(f"\nFile info:")
print(f"  Rows: {len(df_clean)}")
print(f"  Columns: {df_clean.columns.tolist()}")

Saved to processedDataEnglish/aesops_fables_cleaned.csv

File info:
  Rows: 284
  Columns: ['fable_number', 'title', 'text', 'moral']


# Parse Hans Christian Andersen Fairy Tales


In [14]:
# Read the file
with open('dataEnglish/HansChristianAndersenFairyTales.txt', 'r', encoding='utf-8') as f:
    raw_text_hca = f.read()

print(f"File loaded: {len(raw_text_hca)} characters")

print(raw_text_hca[:800])


File loaded: 2015441 characters
﻿The Project Gutenberg eBook of Fairy Tales of Hans Christian Andersen
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Fairy Tales of Hans Christian Andersen

Author: H. C. Andersen

Release date: November 8, 2008 [eBook #27200]
                Most recently updated: January 4, 2021

Language: English

Credits: Produced by Al Haines


*** START OF THE PROJECT GUTENBERG EBOOK FAIRY TALES OF


In [15]:
def parse_andersen_tales(text):
    """
    Parse Hans Christian Andersen Fairy Tales text into structured format.
    
    Returns: list of dicts with {title, text, moral}
    """
    lines = text.split('\n')
    
    stories = []
    current_title = None
    current_text = []
    
    # Skip header - find where CONTENTS ends
    start_idx = 0
    in_contents = False
    for i, line in enumerate(lines):
        if "CONTENTS" in line:
            in_contents = True
        elif in_contents and line.strip() and not line.startswith(' '):
            # Look for first story title (ALL CAPS, standalone line)
            if line.strip().isupper() and len(line.strip().split()) >= 1:
                # Check if this is actually a story title (not part of contents)
                # Story titles appear after multiple blank lines
                if i > 0 and lines[i-1].strip() == '':
                    start_idx = i
                    break
    
    i = start_idx
    while i < len(lines):
        line = lines[i].strip()
        
        # Stop if we've reached the Project Gutenberg end marker
        if "*** END OF THE PROJECT GUTENBERG" in line or "END OF THE PROJECT GUTENBERG EBOOK" in line:
            break
        
        # Skip empty lines
        if not line:
            i += 1
            continue
        
        # Check if this is a title (ALL CAPS, standalone line, multiple words preferred)
        # Titles are typically 2+ words but some may be single word
        is_title = (line.isupper() and 
                   len(line) > 3 and  # Avoid very short lines
                   (len(line.split()) >= 1))  # At least one word
        
        # Additional check: titles often appear after blank lines
        # and before story text (mixed case)
        if is_title:
            # Check if next non-empty line is not all caps (is story text)
            next_text_idx = i + 1
            while next_text_idx < len(lines) and not lines[next_text_idx].strip():
                next_text_idx += 1
            
            if next_text_idx < len(lines):
                next_line = lines[next_text_idx].strip()
                # If next line is not all caps, this is likely a title
                if next_line and not next_line.isupper() and len(next_line) > 20:
                    # Save previous story if exists
                    if current_title and current_text:
                        story_text = ' '.join(current_text)
                        if story_text.strip():  # Only save if there's actual text
                            stories.append({
                                'title': current_title,
                                'text': story_text,
                                'moral': None
                            })
                    
                    # Start new story
                    current_title = line
                    current_text = []
        
        # Regular text line (part of story)
        elif current_title:
            # Stop if we hit what looks like another title
            if not line.isupper() or len(line.split()) < 2:
                current_text.append(line)
        
        i += 1
    
    # Save last story
    if current_title and current_text:
        story_text = ' '.join(current_text)
        if story_text.strip():
            stories.append({
                'title': current_title,
                'text': story_text,
                'moral': None
            })
    
    return stories

# Parse the tales
stories_hca = parse_andersen_tales(raw_text_hca)
print(f"\nParsed {len(stories_hca)} stories")

print(f"Title: {stories_hca[0]['title']}")
print(f"Text: {stories_hca[0]['text'][:200]}...")



Parsed 202 stories
Title: A STORY
Text: In the garden all the apple-trees were in blossom.  They had hastened to bring forth flowers before they got green leaves, and in the yard all the ducklings walked up and down, and the cat too: it bas...


In [16]:
# Clean all stories
for story in stories_hca:
    story['title'] = clean_text(story['title'])
    story['text'] = clean_text(story['text'])


In [18]:
# Create DataFrame and save
df_hca = pd.DataFrame(stories_hca)
df_hca.insert(0, 'story_number', range(1, len(df_hca) + 1))
df_hca = df_hca[['story_number', 'title', 'text', 'moral']]

output_path_hca = 'processedDataEnglish/hans_christian_andersen_cleaned.csv'
df_hca.to_csv(output_path_hca, index=False, encoding='utf-8')

print(f"Saved to {output_path_hca}")
print(f"Rows: {len(df_hca)}")


Saved to processedDataEnglish/hans_christian_andersen_cleaned.csv
Rows: 202


In [19]:
# Read the file
with open('dataEnglish/HouseholdTalesBrosGrimm.txt', 'r', encoding='utf-8') as f:
    raw_text_grimm = f.read()

print(f"File loaded: {len(raw_text_grimm)} characters")

print(raw_text_grimm[:800])


File loaded: 1500438 characters
﻿The Project Gutenberg eBook of Household Tales by Brothers Grimm
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Household Tales by Brothers Grimm

Author: Jacob Grimm
        Wilhelm Grimm

Translator: Mrs. Alfred William Hunt

Release date: March 1, 2004 [eBook #5314]
                Most recently updated: March 1, 2022

Language: English

Credits: Veronica LeGrow, Katie Nicholson, Erin


## Parse Grimm's Tales


In [20]:
def parse_grimm_tales(text):
    """
    Parse Grimm's Household Tales text into structured format.
    There are 200 tales and 10 legends.
    
    Returns: list of dicts with {title, text, moral}
    """
    lines = text.split('\n')
    
    tales = []
    current_title = None
    current_text = []
    
    # Skip header - find where first actual tale starts (format: "1 The Frog-King")
    # Need to skip the table of contents which also has numbered entries
    start_idx = 0
    for i, line in enumerate(lines):
        # Look for pattern "1 The Frog-King" - number, space, capital letter, and it's followed by empty line
        stripped = line.strip()
        if re.match(r'^1\s+[A-Z]', stripped):
            # Check if next line is empty (actual tale starts), not part of table of contents
            if i + 1 < len(lines) and not lines[i + 1].strip():
                start_idx = i
                break
    
    i = start_idx
    while i < len(lines):
        line = lines[i].strip()
        
        # Stop if we've reached the Project Gutenberg end marker
        if "*** END OF THE PROJECT GUTENBERG" in line or "END OF THE PROJECT GUTENBERG EBOOK" in line:
            break
        
        # Skip empty lines
        if not line:
            i += 1
            continue
        
        # Check if this is a new tale (format: "1 Title" or "1. Title")
        tale_match = re.match(r'^(\d+)[.\s]+(.+)$', line)
        if tale_match:
            # Save previous tale if exists
            if current_title and current_text:
                tale_text = ' '.join(current_text)
                if tale_text.strip():
                    tales.append({
                        'title': current_title,
                        'text': tale_text,
                        'moral': None
                    })
            
            # Start new tale - extract title (remove number and period/space)
            current_title = tale_match.group(2).strip()
            current_text = []
        
        # Check if this is a legend (format: "Legend 1 Title")
        elif re.match(r'^Legend\s+\d+', line, re.IGNORECASE):
            # Save previous tale if exists
            if current_title and current_text:
                tale_text = ' '.join(current_text)
                if tale_text.strip():
                    tales.append({
                        'title': current_title,
                        'text': tale_text,
                        'moral': None
                    })
            
            # Extract legend title - format "Legend 1 St. Joseph in the Forest"
            legend_match = re.match(r'^Legend\s+\d+\s+(.+)$', line, re.IGNORECASE)
            if legend_match:
                current_title = legend_match.group(1).strip()
            else:
                current_title = line
            current_text = []
        
        # Check for "Children's Legends" header - skip it
        elif "Children's Legends" in line or "Children's legends" in line:
            # Save previous tale if exists
            if current_title and current_text:
                tale_text = ' '.join(current_text)
                if tale_text.strip():
                    tales.append({
                        'title': current_title,
                        'text': tale_text,
                        'moral': None
                    })
            current_title = None
            current_text = []
        
        # Regular text line (part of story)
        elif current_title:
            current_text.append(line)
        
        i += 1
    
    # Save last tale/legend
    if current_title and current_text:
        tale_text = ' '.join(current_text)
        if tale_text.strip():
            tales.append({
                'title': current_title,
                'text': tale_text,
                'moral': None
            })
    
    return tales

# Parse the tales
tales_grimm = parse_grimm_tales(raw_text_grimm)
print(f"\nParsed {len(tales_grimm)} tales")

print(f"Title: {tales_grimm[0]['title']}")
print(f"Text: {tales_grimm[0]['text'][:200]}...")



Parsed 210 tales
Title: The Frog-King, or Iron Henry
Text: In old times when wishing still helped one, there lived a king whose daughters were all beautiful, but the youngest was so beautiful that the sun itself, which has seen so much, was astonished wheneve...


In [21]:
# Clean all tales
for tale in tales_grimm:
    tale['title'] = clean_text(tale['title'])
    tale['text'] = clean_text(tale['text'])


# Create DataFrame and save
df_grimm = pd.DataFrame(tales_grimm)
df_grimm.insert(0, 'story_number', range(1, len(df_grimm) + 1))
df_grimm = df_grimm[['story_number', 'title', 'text', 'moral']]

output_path_grimm = 'processedDataEnglish/grimm_tales_cleaned.csv'
df_grimm.to_csv(output_path_grimm, index=False, encoding='utf-8')

print(f"Saved to {output_path_grimm}")
print(f"Rows: {len(df_grimm)}")


Saved to processedDataEnglish/grimm_tales_cleaned.csv
Rows: 210


In [None]:
# Read the file
with open('dataEnglish/JustSoStoriesReed.txt', 'r', encoding='utf-8') as f:
    raw_text_justso = f.read()

print(f"File loaded: {len(raw_text_justso)} characters")

print(raw_text_justso[:800])


File loaded: 177525 characters

First 800 characters:
﻿The Project Gutenberg eBook of Just so stories
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Just so stories

Author: Rudyard Kipling

Release date: August 1, 2001 [eBook #2781]
                Most recently updated: October 7, 2016

Language: English

Credits: Produced by David Reed


*** START OF THE PROJECT GUTENBERG EBOOK JUST SO STORIES ***




Produced by David Reed





JUST SO S


## Parse Just So Stories


In [None]:
def parse_just_so_stories(text):
    """
    Parse Just So Stories text into structured format.
    There are 12 stories.
    
    Returns: list of dicts with {title, text, moral}
    """
    lines = text.split('\n')
    
    stories = []
    current_title = None
    current_text = []
    
    # Known story titles from table of contents (12 stories)
    known_titles = [
        "HOW THE WHALE GOT HIS THROAT",
        "HOW THE CAMEL GOT HIS HUMP",
        "HOW THE RHINOCEROS GOT HIS SKIN",
        "HOW THE LEOPARD GOT HIS SPOTS",
        "THE ELEPHANT'S CHILD",
        "THE SING-SONG OF OLD MAN KANGAROO",
        "THE BEGINNING OF THE ARMADILLOS",
        "HOW THE FIRST LETTER WAS WRITTEN",
        "HOW THE ALPHABET WAS MADE",
        "THE CRAB THAT PLAYED WITH THE SEA",
        "THE CAT THAT WALKED BY HIMSELF",
        "THE BUTTERFLY THAT STAMPED"
    ]
    
    # Skip header - find where first actual story starts (after table of contents)
    start_idx = 0
    for i, line in enumerate(lines):
        stripped = line.strip()
        # Look for first known title
        if stripped in known_titles:
            # Check if previous line is blank (title format)
            if i > 0 and lines[i-1].strip() == '':
                start_idx = i
                break
    
    i = start_idx
    while i < len(lines):
        line = lines[i].strip()
        
        # Stop if we've reached the Project Gutenberg end marker
        if "*** END OF THE PROJECT GUTENBERG" in line or "END OF THE PROJECT GUTENBERG EBOOK" in line:
            break
        
        # Skip empty lines
        if not line:
            i += 1
            continue
        
        # Check if this is a known story title (use exact match to avoid poems)
        if line in known_titles:
            # Save previous story if exists
            if current_title and current_text:
                story_text = ' '.join(current_text)
                if story_text.strip():
                    stories.append({
                        'title': current_title,
                        'text': story_text,
                        'moral': None
                    })
            
            # Start new story
            current_title = line
            current_text = []
        
        # Regular text line (part of story)
        elif current_title:
            current_text.append(line)
        
        i += 1
    
    # Save last story
    if current_title and current_text:
        story_text = ' '.join(current_text)
        if story_text.strip():
            stories.append({
                'title': current_title,
                'text': story_text,
                'moral': None
            })
    
    return stories

# Parse the stories
stories_justso = parse_just_so_stories(raw_text_justso)
print(f"\nParsed {len(stories_justso)} stories")

print(f"Title: {stories_justso[0]['title']}")
print(f"Text: {stories_justso[0]['text'][:200]}...")



Parsed 12 stories

First story:
Title: HOW THE LEOPARD GOT HIS SPOTS
Text: THE ELEPHANT’S CHILD...


In [None]:
# Clean all stories
for story in stories_justso:
    story['title'] = clean_text(story['title'])
    story['text'] = clean_text(story['text'])


# Create DataFrame and save
df_justso = pd.DataFrame(stories_justso)
df_justso.insert(0, 'story_number', range(1, len(df_justso) + 1))
df_justso = df_justso[['story_number', 'title', 'text', 'moral']]

output_path_justso = 'processedDataEnglish/just_so_stories_cleaned.csv'
df_justso.to_csv(output_path_justso, index=False, encoding='utf-8')

print(f"Saved to {output_path_justso}")
print(f"Rows: {len(df_justso)}")


  Saved to processedDataEnglish/just_so_stories_cleaned.csv
  Rows: 13


---

# 5. Morals and Manners (Bates)


In [22]:
# Read the file
with open('dataEnglish/MoralandMannersBates.txt', 'r', encoding='utf-8') as f:
    raw_text_bates = f.read()

print(f"File loaded: {len(raw_text_bates)} characters")

print(raw_text_bates[:1000])


File loaded: 238103 characters
﻿The Project Gutenberg eBook of Story Lessons on Character-Building (Morals) and Manners
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Story Lessons on Character-Building (Morals) and Manners

Author: Loïs Bates

Release date: November 3, 2010 [eBook #34200]
                Most recently updated: January 7, 2021

Language: English

Credits: Produced by Emmy, Darleen Dove and the Online Distributed
        Proofreading Team at https://www.pgdp.net (This file was
        produced from images generously made available by The
        Internet Archive)


***

## Parse Morals and Manners


In [23]:
# Parse the stories

def parse_bates_stories(text):
    """
    Parse Morals and Manners text into structured format.
    There should be 131 stories total, with the last being "How another Queen Builded".
    
    Story formats:
    - "123. Title Name." - story with title
    - "96." - story without title (just number)
    Section headers: "I. SECTION NAME." (Roman numerals) - skip these
    Morals: "(MORAL NAME.)" in parentheses
    
    Returns: list of dicts with {title, text, moral}
    """
    lines = text.split('\n')
    
    stories = []
    current_title = None
    current_text = []
    current_moral = None
    current_story_num = None
    pending_moral = None
    
    # Skip header
    start_idx = 0
    found_moral_section = False
    for i, line in enumerate(lines):
        if "1.--MORAL SUBJECTS." in line or "I. INTRODUCTORY STORY." in line:
            found_moral_section = True
            continue
        # Look for first numbered story (e.g., "1. The Fairy Temple." or "96.")
        if found_moral_section and re.match(r'^\d+\.', line.strip()):
            start_idx = i
            break
    
    i = start_idx
    while i < len(lines):
        line = lines[i].strip()
        orig_line = lines[i]  # Keep original for context
        
        # Stop if we've reached the end marker
        if "*** END OF THE PROJECT GUTENBERG" in line or "END OF THE PROJECT GUTENBERG EBOOK" in line:
            break
        
        # Skip empty lines
        if not line:
            i += 1
            continue
        
        # Pattern: Roman numeral starting with L, followed by period, space, then ALL CAPS text
        roman_moral_match = re.match(r'^L[A-Z]*\.\s+([A-Z][A-Z\s\',\-\"\.]+)\.?\s*$', line)
        if roman_moral_match:
            # This is a moral header (e.g., "LXI. GOING IN FRONT OF PEOPLE.")
            # Extract the moral text (everything after the Roman numeral and period)
            moral_text = roman_moral_match.group(1).strip()
            # Remove trailing period if present
            if moral_text.endswith('.'):
                moral_text = moral_text[:-1]
            pending_moral = moral_text
            i += 1
            continue
        
        # Check for major section headers
        if re.match(r'^[IVX]{1,3}\.\s+[A-Z]', line):
            # Save previous story if exists before section header
            if current_story_num is not None and current_text:
                story_text = ' '.join(current_text)
                if story_text.strip():
                    stories.append({
                        'title': current_title if current_title else "Story Lesson",
                        'text': story_text,
                        'moral': current_moral
                    })
            current_title = None
            current_text = []
            current_moral = None
            current_story_num = None
            pending_moral = None  # Clear pending moral when we hit a major section
            i += 1
            continue
        
        # Check if this is a new story
        story_match_no_title = re.match(r'^(\d+)\.\s*$', line)
        story_match = re.match(r'^(\d+)\.\s+(.+?)(?:\.\s*$|$)', line)
        
        if story_match_no_title:
            # Story without title
            story_num = int(story_match_no_title.group(1))
            
            is_new_story = (current_story_num is None) or (story_num >= current_story_num)
            
            if is_new_story and 1 <= story_num <= 131:
                # Save previous story if exists
                if current_story_num is not None and current_text:
                    story_text = ' '.join(current_text)
                    if story_text.strip():
                        stories.append({
                            'title': current_title if current_title else "Story Lesson",
                            'text': story_text,
                            'moral': current_moral
                        })
                
                # Start new story without title
                # Use pending_moral if available (from Roman numeral header)
                current_story_num = story_num
                current_title = None
                current_text = []
                current_moral = pending_moral  # Set moral from Roman numeral header if present
                pending_moral = None  # Clear after using
                i += 1
                continue
            else:
                current_text.append(line)
                i += 1
                continue
        elif story_match:
            # Story with title
            story_num = int(story_match.group(1))
            
            is_new_story = (current_story_num is None) or (story_num >= current_story_num)
            
            if is_new_story and 1 <= story_num <= 131:
                # Save previous story if exists
                if current_story_num is not None and current_text:
                    story_text = ' '.join(current_text)
                    if story_text.strip():
                        stories.append({
                            'title': current_title if current_title else "Story Lesson",
                            'text': story_text,
                            'moral': current_moral
                        })
                
                # Start new story - extract title
                title_part = story_match.group(2).strip()
                if title_part:
                    # Remove trailing period if present
                    if title_part.endswith('.'):
                        title_part = title_part[:-1]
                    current_title = title_part
                else:
                    current_title = None
                
                current_story_num = story_num
                current_text = []
                # Use pending_moral if available (from Roman numeral header), otherwise keep existing logic
                current_moral = pending_moral if pending_moral else None
                pending_moral = None  # Clear after using
                i += 1
                continue
            else:
                # This is numbered content within the current story
                current_text.append(line)
                i += 1
                continue
        
        moral_match = re.match(r'^\(([A-Z][^)]+)\)\.?\s*$', line)
        if moral_match and current_story_num is not None:
            # Extract moral, remove trailing period if present
            moral_text = moral_match.group(1).strip()
            if moral_text.endswith('.'):
                moral_text = moral_text[:-1]
            if 'BLACKBOARD' not in moral_text.upper() and not current_moral:
                current_moral = moral_text
            i += 1
            continue
        
        # Regular text line (part of story)
        if current_story_num is not None or current_text:
            # Skip blackboard labels and footnotes
            if line.startswith('(Blackboard') or line.startswith('FOOTNOTES') or line.startswith('FOOTNOTE'):
                i += 1
                continue
            # Skip footnote markers like "[18]" if they're standalone
            if re.match(r'^\[\d+\]\s*$', line):
                i += 1
                continue
            if line.startswith('_Note_') or line.startswith('LIST OF') or line.startswith('Transcriber'):
                i += 1
                continue
            
            # Include all text lines - numbered bullets within stories are part of the story
            current_text.append(line)
        
        i += 1
    
    # Save last story
    if current_story_num is not None and current_text:
        story_text = ' '.join(current_text)
        if story_text.strip():
            stories.append({
                'title': current_title if current_title else "Story Lesson",
                'text': story_text,
                'moral': current_moral
            })
    
    return stories


stories_bates = parse_bates_stories(raw_text_bates)
print(f"\nParsed {len(stories_bates)} stories")
if len(stories_bates) > 0:
    print(f"Title: {stories_bates[0]['title']}")
    print(f"Text: {stories_bates[0]['text'][:200]}...")



Parsed 131 stories
Title: The Fairy Temple
Text: (The following story should be read to the children =first=, as it forms a kind of groundwork for the Story Lessons which follow.) It was night--a glorious, moonlight night, and in the shade of the le...


In [24]:
# Clean all stories
for story in stories_bates:
    # Ensure title exists
    if story.get('title') is None:
        story['title'] = "Story Lesson"
    story['title'] = clean_text(story['title'])
    story['text'] = clean_text(story['text'])


# Create DataFrame and save
df_bates = pd.DataFrame(stories_bates)
df_bates.insert(0, 'story_number', range(1, len(df_bates) + 1))
df_bates = df_bates[['story_number', 'title', 'text', 'moral']]

output_path_bates = 'processedDataEnglish/morals_manners_cleaned.csv'
df_bates.to_csv(output_path_bates, index=False, encoding='utf-8')

print(f"Saved to {output_path_bates}")
print(f"Rows: {len(df_bates)}")

Saved to processedDataEnglish/morals_manners_cleaned.csv
Rows: 131


In [25]:
# Read the file
with open('dataEnglish/PeterRabbit.txt', 'r', encoding='utf-8') as f:
    raw_text_peter = f.read()

print(f"File loaded: {len(raw_text_peter)} characters")
print(f"\nFirst 800 characters:")
print(raw_text_peter[:800])


File loaded: 25216 characters

First 800 characters:
﻿The Project Gutenberg eBook of The Tale of Peter Rabbit
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Tale of Peter Rabbit

Author: Beatrix Potter

Release date: January 30, 2005 [eBook #14838]
                Most recently updated: September 8, 2021

Language: English

Credits: Robert Cicconetti, Ronald Holder and the PG Online Distributed Proofreading Team


*** START OF THE PROJE


## Parse Peter Rabbit

In [26]:
def parse_peter_rabbit(text):
    """
    Parse Peter Rabbit text into structured format.
    
    Returns: list with single dict {title, text, moral}
    """
    lines = text.split('\n')
    
    # Find the title
    title = "THE TALE OF PETER RABBIT"
    story_text = []
    
    # Skip header - find where story actually starts
    start_idx = 0
    found_title = False
    for i, line in enumerate(lines):
        if "THE TALE OF" in line.upper() or "PETER RABBIT" in line.upper():
            found_title = True
            continue
        # Start collecting text after title and initial formatting
        if found_title and line.strip() and "[Illustration]" not in line:
            if "Once upon a time" in line or "there were" in line.lower():
                start_idx = i
                break
    
    # Collect story text
    for i in range(start_idx, len(lines)):
        line = lines[i].strip()
        
        # Skip illustration markers and empty lines at start/end
        if "[Illustration]" in line:
            continue
        if not line:
            continue
        
        # Stop if we hit Project Gutenberg footer
        if "END OF THE PROJECT GUTENBERG" in line or "*** END" in line:
            break
        
        story_text.append(line)
    
    # Join and clean
    full_text = ' '.join(story_text)
    
    return [{
        'title': title,
        'text': full_text,
        'moral': None
    }]

# Parse the story
stories_peter = parse_peter_rabbit(raw_text_peter)
print(f"\nParsed {len(stories_peter)} story")

print(f"Title: {stories_peter[0]['title']}")
print(f"Text: {stories_peter[0]['text'][:200]}...")



Parsed 1 story
Title: THE TALE OF PETER RABBIT
Text: Once upon a time there were four little Rabbits, and their names were-- Flopsy, Mopsy, Cotton-tail, and Peter. They lived with their Mother in a sand-bank, underneath the root of a very big fir-tree. ...


In [27]:
# Clean the story
for story in stories_peter:
    story['title'] = clean_text(story['title'])
    story['text'] = clean_text(story['text'])


# Create DataFrame and save
df_peter = pd.DataFrame(stories_peter)
df_peter.insert(0, 'story_number', range(1, len(df_peter) + 1))
df_peter = df_peter[['story_number', 'title', 'text', 'moral']]

output_path_peter = 'processedDataEnglish/peter_rabbit_cleaned.csv'
df_peter.to_csv(output_path_peter, index=False, encoding='utf-8')

print(f"Saved to {output_path_peter}")
print(f"Rows: {len(df_peter)}")


Saved to processedDataEnglish/peter_rabbit_cleaned.csv
Rows: 1
