In [None]:
import re
import string

# Text cleaning applied to every story before tokenization.

# Common stop words dropped from the text after splitting it into words.
_STOP_WORDS = frozenset([
    "a", "an", "the", "and", "or", "in", "on", "with", "for", "is", "of",
    "to", "as", "by",
])

# Possessive suffix ('s or ’s) at the end of a word. Requiring a preceding
# word character keeps an opening quote followed by a word starting with "s"
# (e.g. 'snow white') from losing its first letter.
_POSSESSIVE_RE = re.compile(r"(?<=\w)['\u2019]s\b")

# Dashes that separate words: runs of ASCII hyphens ("--"), en dash, em dash.
# They become a space so the neighbouring words are not fused together.
_DASH_RE = re.compile(r"--+|[\u2013\u2014]")

# Characters deleted outright: ASCII punctuation plus the typographic
# quotes (‘ ’ “ ”) and ellipsis (…) that string.punctuation does not cover.
_PUNCTUATION_TABLE = str.maketrans(
    '', '', string.punctuation + "\u2018\u2019\u201c\u201d\u2026"
)


def preprocess_text(text):
    """Clean *text* and make it ready for tokenization.

    Steps, in order:
      1. lowercase the text;
      2. drop possessive endings ("king's" -> "king");
      3. turn word-separating dashes ("--", en dash, em dash) into spaces;
      4. delete remaining punctuation, including apostrophes and curly quotes
         ("don't" -> "dont");
      5. remove stop words and collapse all whitespace to single spaces.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string (empty if no
        words remain).
    """
    text = text.lower()
    text = _POSSESSIVE_RE.sub('', text)
    text = _DASH_RE.sub(' ', text)
    text = text.translate(_PUNCTUATION_TABLE)

    # split() with no argument already discards leading/trailing and repeated
    # whitespace, so the joined result needs no further stripping.
    words = [word for word in text.split() if word not in _STOP_WORDS]
    return ' '.join(words)


In [None]:
# Pre-processing for Grimm's Fairy Tales; this only needs to be run if the stories have not been split yet

import os

def split_stories(input_file, output_dir):
    """Split a collection of stories into one text file per story.

    A line is treated as a story title when ``str.isupper()`` is true for it
    (it has at least one cased character and none of them are lowercase).
    Every following line up to the next title belongs to that story. Lines
    before the first title are discarded, and a title with no lines after it
    produces no file.

    Each story is written to ``<output_dir>/<title>.txt``. If the same title
    occurs more than once in the input, its sections are concatenated in
    order of appearance. Existing files are overwritten, so running the
    function again yields the same output instead of duplicated text.

    Args:
        input_file: Path of the UTF-8 text file containing all stories.
        output_dir: Directory that receives the per-story files; created
            (including parents) if it does not exist.
    """
    # title -> lines of that story, in order of first appearance.
    stories = {}
    title = None

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.isupper():
                title = line.strip()
            elif title is not None:
                # The entry is created only when a story actually has a
                # line, so bare titles never produce an empty file.
                stories.setdefault(title, []).append(line)

    os.makedirs(output_dir, exist_ok=True)

    for story_title, story_lines in stories.items():
        story_path = os.path.join(output_dir, story_title + '.txt')
        with open(story_path, 'w', encoding='utf-8') as out:
            out.writelines(story_lines)

# Example run: split the raw Grimm's collection into one file per story.
filepath = 'datasets/grimms/raw_data/grimms.txt'
output_directory = 'datasets/grimms/split'

split_stories(input_file=filepath, output_dir=output_directory)


In [None]:
# Imported here as well so this cell also runs when the optional
# story-splitting cell was skipped.
import os

input_directory = 'datasets/grimms/split'
preprocessed_directory = 'datasets/grimms/preprocessed'

# Make sure the output directory exists (no error if it is already there).
os.makedirs(preprocessed_directory, exist_ok=True)

# Preprocess every .txt file in the input directory and save the result
# under the same file name in the preprocessed directory.
for filename in os.listdir(input_directory):
    if not filename.endswith('.txt'):
        continue

    file_path = os.path.join(input_directory, filename)

    # Read the original story text.
    with open(file_path, 'r', encoding='utf-8') as file:
        original_text = file.read()

    preprocessed_text = preprocess_text(original_text)

    # Save the preprocessed text, overwriting any earlier result.
    preprocessed_file_path = os.path.join(preprocessed_directory, filename)
    with open(preprocessed_file_path, 'w', encoding='utf-8') as preprocessed_file:
        preprocessed_file.write(preprocessed_text)
