# Scraping from Flash Fiction Online

In [9]:
import os
import requests
from bs4 import BeautifulSoup
import re

# Category slugs; main() inserts each one into the listing-page URL and also
# uses it as the name of the output sub-directory.
category_list = [
    'classic-flash', 
    'fantasy', 
    'horror', 
    'humor', 
    'literary', 
    'mainstream', 
    'science-fiction'
]

# Root of the category listing pages. It ends with '/' because main()
# concatenates the category slug directly after it.
base_url = 'https://www.flashfictiononline.com/article-categories/'


def get_story_links(page_url):
    """
    Collect the story URLs listed on one category page.

    Returns a list of href strings, one per <article> that contains a
    <figure class="post-image"> wrapping a link. If the page cannot be
    fetched (network error or HTTP error status) the problem is printed
    and an empty list is returned.
    """
    try:
        response = requests.get(page_url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {page_url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    story_links = []
    for article in soup.find_all('article'):
        # The story link is the anchor wrapping the teaser image.
        figure = article.find('figure', class_='post-image')
        anchor = figure.find('a') if figure else None
        if anchor and 'href' in anchor.attrs:
            story_links.append(anchor['href'])
    return story_links


def get_html(url):
    """
    Download a page and return its HTML text.

    Returns an empty string when the page cannot be retrieved: either the
    request itself failed (connection error, timeout, ...) or the server
    answered with a status other than 200. Request failures are printed in
    the same format get_story_links() uses.
    """
    try:
        # A timeout keeps one unresponsive server from stalling the whole run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return ''
    return response.text if response.status_code == 200 else ''


def get_story_details(html_content):
    """
    Pull the title and the story text out of a story page.

    Returns a (title, story_content) tuple. Either element falls back to a
    placeholder string ('Title not found' / 'Story content not found') when
    the corresponding element is missing from the page.
    """
    page = BeautifulSoup(html_content, 'html.parser')

    heading = page.find('span', class_='main-head')
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = 'Title not found'

    body = page.find('div', class_='module module-post-content tb_iy83113')
    if not body:
        return title, 'Story content not found'

    texts = [p.get_text(strip=True) for p in body.find_all('p')]
    joined = '\n\n'.join(texts)
    # Everything from the first "Share this" onwards is dropped.
    return title, joined.split("Share this")[0].strip()


def main():
    """
    Download the stories of every category and save each one as a text file.

    For every slug in ``category_list`` listing pages 1 to 9 are scanned and
    each story found is written to
    ``./flash-fiction-online/<category>/<title>.txt`` as
    ``Title: <title>`` followed by the story text. Stories whose page could
    not be downloaded are reported and skipped.
    """
    for category in category_list:

        # Create the directory structure if it doesn't exist
        dir_path = f'./flash-fiction-online/{category}'
        os.makedirs(dir_path, exist_ok=True)

        # Iterate over pages
        for page in range(1, 10):
            page_url = f'{base_url}{category}/page/{page}/'

            for link in get_story_links(page_url):
                html_content = get_html(link)
                if not html_content:
                    # get_html() yields '' for a failed download; parsing it
                    # would only produce a 'title-not-found.txt' placeholder
                    # that overwrites the previous placeholder.
                    print(f"Skipped (download failed): {link}")
                    continue

                title, story_content = get_story_details(html_content)

                formatted_title = title.replace(' ', '-').lower()
                # A path separator in a title would point open() at a
                # sub-directory that does not exist.
                for separator in (os.sep, os.altsep):
                    if separator:
                        formatted_title = formatted_title.replace(separator, '-')
                file_path = os.path.join(dir_path, f'{formatted_title}.txt')

                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(f"Title: {title}\n\n{story_content}")

                print(f"Saved: {file_path}")

                
# Start the scrape only when this code is run directly, not when it is imported.
if __name__ == '__main__':
    main()

Saved: ./flash-fiction/fantasy/ursula-the-monster.txt
Saved: ./flash-fiction/fantasy/saint-woad-and-sister-welwitshcia.txt
Saved: ./flash-fiction/fantasy/the-constellations-of-daughter-death.txt
Saved: ./flash-fiction/fantasy/to-slay-a-goblin.txt
Saved: ./flash-fiction/fantasy/little-fish,-big-fish.txt
Saved: ./flash-fiction/fantasy/nancy-shreds-the-clouds.txt
Saved: ./flash-fiction/fantasy/when-the-forest-comes-to-you.txt
Saved: ./flash-fiction/fantasy/lapis-lazuli.txt
Saved: ./flash-fiction/fantasy/to-rise,-to-set.txt
Saved: ./flash-fiction/fantasy/a-tiger-in-eden.txt
Saved: ./flash-fiction/fantasy/the-fox-spirit’s-retelling.txt
Saved: ./flash-fiction/fantasy/we-are-not-phoenixes.txt
Saved: ./flash-fiction/fantasy/gently-creaking-boards.txt
Saved: ./flash-fiction/fantasy/fae-magic-on-a-friday-night.txt
Saved: ./flash-fiction/fantasy/upon-what-soil-they-fed.txt
Saved: ./flash-fiction/fantasy/wonderful-wounds-await-you.txt
Saved: ./flash-fiction/fantasy/power-is-love-in-the-devil’s-eye

Saved: ./flash-fiction/humor/the-black-clover-equation.txt
Saved: ./flash-fiction/humor/foreign-tongues.txt
Saved: ./flash-fiction/humor/space-travel-loses-its-allure-when-you’ve-lost-your-moon-cup.txt
Saved: ./flash-fiction/humor/a-note-to-parents-regarding-the-beginning-and-end-of-time-diorama-presentations-for-ms.-miller’s-third-grade-class.txt
Saved: ./flash-fiction/humor/i-am-graalnak-of-the-vroon-empire,-destroyer-of-galaxies,-supreme-overlord-of-the-planet-earth.-ask-me-anything..txt
Saved: ./flash-fiction/humor/my-superpower.txt
Saved: ./flash-fiction/humor/irma-splinkbottom’s-recipe-for-cold-fusion.txt
Saved: ./flash-fiction/humor/last-bites.txt
Saved: ./flash-fiction/humor/pêlos.txt
Saved: ./flash-fiction/humor/zigzag-strikes-again.txt
Saved: ./flash-fiction/humor/the-numbers-game.txt
Saved: ./flash-fiction/humor/the-dragonslayer.txt
Saved: ./flash-fiction/humor/caps-lock-and-the-ellipsis-of-doom.txt
Error accessing https://www.flashfictiononline.com/article-categories/humor/

Error accessing https://www.flashfictiononline.com/article-categories/mainstream/page/9/: 404 Client Error: Not Found for url: https://www.flashfictiononline.com/article-categories/mainstream/page/9/
Saved: ./flash-fiction/science-fiction/in-search-of-body.txt
Saved: ./flash-fiction/science-fiction/the-first-day-of-the-week.txt
Saved: ./flash-fiction/science-fiction/grandma’s-sex-robot.txt
Saved: ./flash-fiction/science-fiction/quantum-love.txt
Saved: ./flash-fiction/science-fiction/grin-minus-cat.txt
Saved: ./flash-fiction/science-fiction/cruise-control.txt
Saved: ./flash-fiction/science-fiction/it-begins-with-raven.txt
Saved: ./flash-fiction/science-fiction/lost-and-found-at-the-center-of-the-universe.txt
Saved: ./flash-fiction/science-fiction/how-they-name-the-ships.txt
Saved: ./flash-fiction/science-fiction/eight-reasons-you-are-alone.txt
Saved: ./flash-fiction/science-fiction/on-the-anniversary-of-your-passing.txt
Saved: ./flash-fiction/science-fiction/the-perfect-brick-of-feta.tx

# Scraping from Flash Fiction Library

In [12]:
import os
import requests
from bs4 import BeautifulSoup


# Category slugs; main() inserts each one into the listing-page URL and also
# uses it as the name of the output sub-directory.
# NOTE(review): both 'scifi' and 'science-fiction' are listed - presumably two
# distinct categories on the site; confirm they are not duplicates.
category_list = [
    'fantasy', 
    'uncategorized', 
    'horror', 
    'romance', 
    'scifi', 
    'science-fiction'
]


# Root of the category listing pages. It ends with '/' because main()
# concatenates the category slug directly after it.
base_url = 'https://flashfictionlibrary.com/category/'


def get_story_links(category_url):
    """
    Collect the story URLs listed on one category page.

    Returns a list of href strings, one per <article> whose
    <header class="entry-header"> contains a link. If the page cannot be
    fetched (network error or HTTP error status) the problem is printed
    and an empty list is returned.
    """
    try:
        response = requests.get(category_url)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {category_url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    story_links = []
    for article in soup.find_all('article'):
        header = article.find('header', class_='entry-header')
        if not header:
            continue
        # The first link in the entry header is taken as the story link.
        anchor = header.find('a')
        if not anchor or 'href' not in anchor.attrs:
            continue
        story_links.append(anchor['href'])
    return story_links


def get_story_content(url):
    """
    Download a story page and return its title and text.

    Returns a (title, story_content) tuple. When the page cannot be
    retrieved (request failure or a status other than 200) both elements are
    placeholders: ('Title not found', 'Story content not found'). The same
    placeholders are used individually when the <title> or <article>
    element is missing from an otherwise valid page.
    """
    # Get the response; a request failure is reported and treated like a bad status
    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return 'Title not found', 'Story content not found'
    if response.status_code != 200:
        # Title placeholder first, content placeholder second, matching the
        # order of the normal return value.
        return 'Title not found', 'Story content not found'

    # Parse the html file
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the title: the text of <title> up to the first ' – ' separator
    title_tag = soup.find('title')
    title = title_tag.get_text(strip=True).split(' – ')[0] if title_tag else 'Title not found'

    # Find the story content: every paragraph inside the first <article>
    story_div = soup.find('article')
    if story_div:
        paragraphs = story_div.find_all('p')
        story_content = '\n\n'.join(paragraph.get_text(strip=True) for paragraph in paragraphs)
    else:
        story_content = 'Story content not found'

    return title, story_content


def main():
    """
    Download the stories of every category and save each one as a text file.

    For every slug in ``category_list`` listing pages 1 to 10 are scanned and
    each story found is written to
    ``./flash-fiction-library/<category>/<title>.txt`` as
    ``Title: <title>`` followed by the story text. Links that yield no story
    text are reported and skipped.
    """
    for category in category_list:
        dir_path = f'./flash-fiction-library/{category}'
        os.makedirs(dir_path, exist_ok=True)

        for page in range(1, 11):  # Iterate over pages 1 to 10
            page_url = f'{base_url}{category}/page/{page}/'

            for link in get_story_links(page_url):
                title, story_content = get_story_content(link)
                if story_content in ('', 'Story content not found'):
                    # An empty or placeholder body means the download failed
                    # or the page holds no story; saving it would only create
                    # a junk file that later failures overwrite.
                    print(f"Skipped (no story content): {link}")
                    continue

                formatted_title = title.replace(' ', '-').lower()
                # A path separator in a title would point open() at a
                # sub-directory that does not exist.
                for separator in (os.sep, os.altsep):
                    if separator:
                        formatted_title = formatted_title.replace(separator, '-')
                file_path = os.path.join(dir_path, f'{formatted_title}.txt')

                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(f"Title: {title}\n\n{story_content}")

                print(f"Saved: {file_path}")

                
# Start the scrape only when this code is run directly, not when it is imported.
if __name__ == '__main__':
    main()


Saved: ./flash-fiction-library/uncategorized/the-last-corporate.txt
Saved: ./flash-fiction-library/uncategorized/the-many-faces-of-sophia-morrow.txt
Saved: ./flash-fiction-library/uncategorized/the-ethereal-form-of-fairies.txt
Saved: ./flash-fiction-library/uncategorized/the-aeonian-ball.txt
Saved: ./flash-fiction-library/uncategorized/autumn-leaves.txt
Saved: ./flash-fiction-library/uncategorized/the-big-black-bird.txt
Saved: ./flash-fiction-library/uncategorized/the-weaving-woman.txt
Saved: ./flash-fiction-library/uncategorized/the-necromancer.txt
Saved: ./flash-fiction-library/uncategorized/picture-in-the-locket.txt
Saved: ./flash-fiction-library/uncategorized/those-that-live-longest.txt
Saved: ./flash-fiction-library/uncategorized/the-cost-of-divinity.txt
Saved: ./flash-fiction-library/uncategorized/astronought.txt
Saved: ./flash-fiction-library/uncategorized/beginning-&-end.txt
Saved: ./flash-fiction-library/uncategorized/when-you-look-away.txt
Saved: ./flash-fiction-library/uncat

Saved: ./flash-fiction-library/horror/mixed-signals.txt
Saved: ./flash-fiction-library/horror/being-in-the-mist.txt
Saved: ./flash-fiction-library/horror/pillars-in-the-deep.txt
Saved: ./flash-fiction-library/horror/the-mouth-in-the-wall.txt
Saved: ./flash-fiction-library/horror/court-of-the-sunflower-king.txt
Saved: ./flash-fiction-library/horror/when-death-wore-lipstick.txt
Saved: ./flash-fiction-library/horror/elysium-field.txt
Saved: ./flash-fiction-library/horror/the-beast.txt
Saved: ./flash-fiction-library/horror/the-mad-moors-of-calum.txt
Saved: ./flash-fiction-library/horror/fragile-creatures.txt
Saved: ./flash-fiction-library/horror/buying-a-soul.txt
Saved: ./flash-fiction-library/horror/me,-myself-and-the-fae.txt
Saved: ./flash-fiction-library/horror/the-old-man-and-the-stars.txt
Saved: ./flash-fiction-library/horror/miggi-island.txt
Saved: ./flash-fiction-library/horror/warriors-of-yesteryear.txt
Saved: ./flash-fiction-library/horror/the-ladies-of-llewelyn-library.txt
Saved:

Saved: ./flash-fiction-library/scifi/beast-of-burden.txt
Saved: ./flash-fiction-library/scifi/the-museum-of-selfies.txt
Saved: ./flash-fiction-library/scifi/manufacturing-stars.txt
Saved: ./flash-fiction-library/scifi/unintended-consequences.txt
Saved: ./flash-fiction-library/scifi/cold-lights.txt
Saved: ./flash-fiction-library/scifi/the-space-between-ages.txt
Saved: ./flash-fiction-library/scifi/what-you-see.txt
Saved: ./flash-fiction-library/scifi/children-of-the-cosmos.txt
Saved: ./flash-fiction-library/scifi/watcher-in-the-wastes.txt
Saved: ./flash-fiction-library/scifi/suicide-note.txt
Saved: ./flash-fiction-library/scifi/jefferson.txt
Saved: ./flash-fiction-library/scifi/the-thing-that-matters.txt
Saved: ./flash-fiction-library/scifi/tim’s-demons-&-other-friends.txt
Saved: ./flash-fiction-library/scifi/the-biologist’s-daughter.txt
Saved: ./flash-fiction-library/scifi/pillars-in-the-deep.txt
Saved: ./flash-fiction-library/scifi/elizabeth’s-sentience.txt
Saved: ./flash-fiction-libr

In [16]:
def count_txt_files(directory):
    """Return the number of files ending in '.txt' below *directory*, recursively."""
    return sum(
        1
        for _root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.txt')
    )


# NOTE(review): dir_1 and dir_2 are not assigned anywhere in this cell -
# presumably they were set in an earlier notebook session; confirm before re-running.
print(f"Total number of .txt files in '{dir_1}': {count_txt_files(dir_1)}")
print(f"Total number of .txt files in '{dir_2}': {count_txt_files(dir_2)}")

Total number of .txt files in './flash-fiction-library': 290
Total number of .txt files in './flash-fiction-online': 350


# Grimms

In [23]:
import os
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    """Return the HTML text of *url*, or None when it cannot be fetched.

    None covers both a response with a status other than 200 and a failed
    request (connection error, timeout, ...), so callers need only one
    truthiness check. Request failures are also printed.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the cell.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None
    if response.status_code == 200:
        return response.text
    return None

def extract_stories_from_html(html_content):
    """Split the page into stories, one per <div class="chapter"> that has an <h2>.

    Returns a list of (title, story_content) tuples in document order.
    The title is the text of the chapter's first <h2>; the content is the
    text of all its <p> elements joined by blank lines.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    stories = []
    for chapter in soup.find_all('div', class_='chapter'):
        heading = chapter.find('h2')
        if not heading:
            # Chapter divs without a heading are ignored.
            continue
        texts = [p.get_text(strip=True) for p in chapter.find_all('p')]
        stories.append((heading.get_text(strip=True), '\n\n'.join(texts)))
    return stories

def save_stories(stories, directory):
    """Write each (title, content) pair to its own text file in *directory*.

    The directory is created if needed. The file name is the title
    lower-cased with spaces turned into hyphens plus '.txt'; path separators
    in a title are turned into hyphens as well. Only the content is written.
    """
    os.makedirs(directory, exist_ok=True)

    for title, content in stories:
        formatted_title = title.replace(' ', '-').lower() + '.txt'
        # A path separator in a title would make open() look for a
        # sub-directory that does not exist, or write outside *directory*.
        for separator in (os.sep, os.altsep):
            if separator:
                formatted_title = formatted_title.replace(separator, '-')
        file_path = os.path.join(directory, formatted_title)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

# URL of the web page containing the stories
url = 'https://www.gutenberg.org/files/2591/2591-h/2591-h.htm#link2H_4_0001'
html_content = fetch_html(url)

# fetch_html() returns None for a non-200 response, so parse only on success.
if html_content:
    stories = extract_stories_from_html(html_content)
    dir_path = './fairy-tales-grimms'
    save_stories(stories, dir_path)
else:
    print("Failed to fetch the webpage.")

# Andersen

In [25]:
import os
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    """Return the HTML text of *url*, or None when it cannot be fetched.

    None covers both a response with a status other than 200 and a failed
    request (connection error, timeout, ...), so callers need only one
    truthiness check. Request failures are also printed.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the cell.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None
    if response.status_code == 200:
        return response.text
    return None

def extract_stories_from_html(html_content):
    """Collect the stories linked from the page's <h4> headings.

    Every <a> inside an <h4> is treated as a link to a story: its text is
    the title and its href (without '#') names the <a name="..."> anchor
    where the story starts. The story text is made of the <p> siblings that
    follow that anchor, up to the next named anchor.

    Returns a list of (story_title, story_content) tuples; links whose
    target anchor cannot be found are skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    stories = []
    for heading in soup.find_all('h4'):
        for link in heading.find_all('a'):
            story_title = link.get_text(strip=True)
            target = soup.find('a', attrs={'name': link['href'].strip('#')})
            if not target:
                continue

            paragraphs = []
            for node in target.next_siblings:
                # The next named anchor marks the start of the following story.
                if node.name == 'a' and node.has_attr('name'):
                    break
                if node.name == 'p':
                    paragraphs.append(node.get_text(strip=True))

            stories.append((story_title, '\n\n'.join(paragraphs)))
    return stories

def save_stories(stories, directory):
    """Write each (title, content) pair to its own text file in *directory*.

    The directory is created if needed. The file name is the title
    lower-cased with spaces turned into hyphens plus '.txt'; path separators
    in a title are turned into hyphens as well. Only the content is written.
    """
    os.makedirs(directory, exist_ok=True)

    for title, content in stories:
        formatted_title = title.replace(' ', '-').lower() + '.txt'
        # A path separator in a title would make open() look for a
        # sub-directory that does not exist, or write outside *directory*.
        for separator in (os.sep, os.altsep):
            if separator:
                formatted_title = formatted_title.replace(separator, '-')
        file_path = os.path.join(directory, formatted_title)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

# URL of the web page containing the stories
url = 'https://www.gutenberg.org/files/27200/27200-h/27200-h.htm'
html_content = fetch_html(url)

# fetch_html() returns None for a non-200 response, so parse only on success.
if html_content:
    stories = extract_stories_from_html(html_content)
    dir_path = './fairy-tales-anderson'
    save_stories(stories, dir_path)
else:
    print("Failed to fetch the webpage.")


# Japanese

In [26]:
def fetch_html(url):
    """Return the HTML text of *url*, or None when it cannot be fetched.

    None covers both a response with a status other than 200 and a failed
    request (connection error, timeout, ...), so callers need only one
    truthiness check. Request failures are also printed.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the cell.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None
    if response.status_code == 200:
        return response.text
    return None

def extract_stories_from_html(html_content):
    """Split the page into stories, one per <h2> heading.

    For every <h2> the story text is made of the <p> elements among its
    following siblings, up to (not including) the next sibling <h2>.

    Returns a list of (story_title, story_content) tuples in document order;
    a heading with no paragraphs after it yields an empty content string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    stories = []
    for heading in soup.find_all('h2'):
        paragraphs = []
        for node in heading.next_siblings:
            if node.name == 'h2':
                # The next heading starts the following story.
                break
            if node.name == 'p':
                paragraphs.append(node.get_text(strip=True))

        stories.append((heading.get_text(strip=True), '\n\n'.join(paragraphs)))
    return stories

def save_stories(stories, directory):
    """Write each (title, content) pair to its own text file in *directory*.

    The directory is created if needed. The file name is the title
    lower-cased with spaces turned into hyphens plus '.txt'; path separators
    in a title are turned into hyphens as well. Only the content is written.
    """
    os.makedirs(directory, exist_ok=True)

    for title, content in stories:
        # Create a valid filename from the title
        formatted_title = title.replace(' ', '-').lower() + '.txt'
        # A path separator in a title would make open() look for a
        # sub-directory that does not exist, or write outside *directory*.
        for separator in (os.sep, os.altsep):
            if separator:
                formatted_title = formatted_title.replace(separator, '-')
        file_path = os.path.join(directory, formatted_title)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

# URL of the web page containing the stories
url = 'https://www.gutenberg.org/cache/epub/4018/pg4018-images.html'
html_content = fetch_html(url)

# fetch_html() returns None for a non-200 response, so parse only on success.
if html_content:
    stories = extract_stories_from_html(html_content)
    dir_path = './fairy-tales-japanese'
    save_stories(stories, dir_path)
else:
    print("Failed to fetch the webpage.")


In [27]:
def extract_stories_from_url(url):
    """Download *url* and split it into stories, one per <div class="chapter">.

    Returns a list of strings of the form "Title: <title>\\n\\n<text>", where
    the title is the chapter's first <h2> and the text is its <p> elements
    joined by blank lines. Chapter divs without an <h2> are skipped.
    The HTTP status of the response is not checked.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    stories = []
    for chapter in soup.find_all('div', class_='chapter'):
        heading = chapter.find('h2')
        if not heading:
            continue
        title = heading.get_text(strip=True)
        texts = [p.get_text(strip=True) for p in chapter.find_all('p')]
        story_content = '\n\n'.join(texts)
        stories.append(f"Title: {title}\n\n{story_content}")
    return stories

def save_stories(stories, directory):
    """Write each story string to its own text file in *directory*.

    Each story is expected to start with a "Title: <title>" line followed by
    a blank line. The file name is that title lower-cased with spaces turned
    into hyphens plus '.txt'; path separators in a title are turned into
    hyphens as well. The whole story string (title line included) is written.
    """
    os.makedirs(directory, exist_ok=True)

    for story in stories:
        # Everything before the first blank line is the "Title: ..." header.
        title = story.split('\n\n', 1)[0].replace('Title: ', '').replace(' ', '-').lower() + '.txt'
        # A path separator in a title would make open() look for a
        # sub-directory that does not exist, or write outside *directory*.
        for separator in (os.sep, os.altsep):
            if separator:
                title = title.replace(separator, '-')
        with open(os.path.join(directory, title), 'w', encoding='utf-8') as file:
            file.write(story)

# URL of the book and the directory its stories are written to
url = 'https://www.gutenberg.org/cache/epub/902/pg902-images.html'
dir_path = './fairy-tales-wilde'

# Extract and save the stories (one file per chapter that has a heading)
stories = extract_stories_from_url(url)
save_stories(stories, dir_path)


In [32]:
def fetch_html(url):
    """Download *url* and return the response body as text.

    The HTTP status is not checked: the body is returned whatever it is.
    """
    return requests.get(url).text

def extract_chapter_links(soup):
    """Return the href of every <a class="pginternal"> element, in document order.

    Anchors of that class without an href attribute are ignored.
    """
    return [
        anchor.attrs['href']
        for anchor in soup.find_all('a', class_='pginternal')
        if 'href' in anchor.attrs
    ]

def format_title(title):
    """Return *title* with every space replaced by a hyphen.

    For a heading such as "1 A Fellow Traveller" this puts a hyphen between
    the chapter number and the name as well as between the words of the
    name. A title without any space (a single word, or an empty string) is
    returned unchanged instead of raising IndexError.
    """
    return title.replace(' ', '-')

def extract_chapter_content(soup, chapter_links):
    """Build one record per chapter link.

    For each link the <a id="..."> anchor it points at is located, then the
    first <h2> after it supplies the title and the next 20 <p> elements
    after that heading supply the text.

    Returns a list of {'title': ..., 'content': ...} dicts; the title is
    hyphenated by format_title() and lower-cased. Links whose anchor or
    heading cannot be found are skipped.
    """
    chapters = []
    for link in chapter_links:
        anchor = soup.find('a', {'id': link.strip('#')})
        if not anchor:
            continue
        heading = anchor.find_next('h2')
        if not heading:
            continue

        title = format_title(heading.get_text(strip=True))
        # NOTE(review): a fixed window of 20 paragraphs ignores where the
        # chapter really ends - confirm whether that is intended.
        texts = [p.get_text(strip=True) for p in heading.find_all_next('p', limit=20)]
        chapters.append({'title': title.lower(), 'content': '\n\n'.join(texts)})
    return chapters

def save_chapters(chapters, directory):
    """Save each chapter's content in a separate file inside *directory*.

    *chapters* is an iterable of dicts with 'title' and 'content' keys. The
    directory is created if needed and each chapter is written to
    '<title>.txt'; path separators in a title are turned into hyphens.
    """
    os.makedirs(directory, exist_ok=True)

    for chapter in chapters:
        file_name = f"{chapter['title']}.txt"
        # A path separator in a title would make open() look for a
        # sub-directory that does not exist, or write outside *directory*.
        for separator in (os.sep, os.altsep):
            if separator:
                file_name = file_name.replace(separator, '-')
        with open(os.path.join(directory, file_name), 'w', encoding='utf-8') as file:
            file.write(chapter['content'])

# URL of the book
url = 'https://www.gutenberg.org/cache/epub/58866/pg58866-images.html'

# Fetch HTML content from the URL
html_content = fetch_html(url)

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Extract chapter links and contents
chapter_links = extract_chapter_links(soup)
chapters = extract_chapter_content(soup, chapter_links)

# Directory to save chapters
# NOTE(review): the author flagged a bug on the next line without saying which;
# the trailing '-' in the directory name looks unintended - confirm.
dir_path = './gutenberg/the-murder-on-the-links-'  # Bug to be fixed

# Save the chapters
save_chapters(chapters, dir_path)


In [36]:
def fetch_text(url):
    """Download *url* and return the response body as text.

    The HTTP status is not checked: the body is returned whatever it is.
    """
    return requests.get(url).text

def split_into_chapters(text):
    """Split *text* at every upper-case "CHAPTER <number>" marker.

    Returns the pieces that follow each marker, in order; the markers
    themselves and whatever precedes the first one are discarded. A text
    without any marker gives an empty list.
    """
    _front_matter, *chapters = re.split(r'CHAPTER \d+', text)
    return chapters

def save_chapters(chapters, directory):
    """Write each chapter string to a numbered file inside *directory*.

    The directory is created if it does not exist. Files are named by
    position, zero-padded to two digits ('01.txt', '02.txt', ...), and hold
    the chapter text with surrounding whitespace stripped.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    for number, chapter in enumerate(chapters, start=1):
        target = os.path.join(directory, f"{number:02d}.txt")
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(chapter.strip())

# URL of the book (plain-text edition)
url = 'https://www.gutenberg.org/cache/epub/1557/pg1557.txt'

# Fetch text content from the URL
text_content = fetch_text(url)

# Split the text into chapters at the "CHAPTER <number>" markers
chapters = split_into_chapters(text_content)

# Directory to save chapters
dir_path = './gutenberg/men-of-iron'

# Save the chapters as 01.txt, 02.txt, ...
save_chapters(chapters, dir_path)


In [37]:
def download_file(url, file_path):
    """Download *url* and save the response text to *file_path* (UTF-8).

    The parent directory of *file_path* is created when necessary. A
    confirmation is printed on success; when the server answers with a
    status other than 200 nothing is written and the status code is printed.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to download the file. Status code: {response.status_code}")
        return

    # Ensuring the directory exists. os.path.dirname() is '' for a bare file
    # name, and os.makedirs('') raises FileNotFoundError, so only create a
    # directory when the path actually names one.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(response.text)
    print(f"File downloaded and saved as {file_path}")

# URL of the book (plain-text edition)
url = 'https://www.gutenberg.org/cache/epub/29579/pg29579.txt'

# Path where to save the file; download_file() creates the parent directory
file_path = './gutenberg/watchbird/watchbird.txt'

# Download and save the file
download_file(url, file_path)


File downloaded and saved as ./gutenberg/watchbird/watchbird.txt


In [40]:
# Plain-text edition of the book and the local file it is saved to; the
# later text-parsing cells read this same local path.
url = 'https://www.gutenberg.org/cache/epub/29939/pg29939.txt'
file_path = './the-chinese-fairy-tales.txt'

In [41]:
# Uses the url and file_path assigned in the previous cell.
download_file(url, file_path)

File downloaded and saved as ./the-chinese-fairy-tales.txt


In [11]:
import requests
from bs4 import BeautifulSoup
import os
from roman import toRoman, fromRoman  # You might need to install this package using 'pip install roman'

def fetch_html(url):
    """Fetch HTML content from the given URL.

    Returns the response text, or None when the page cannot be fetched:
    either the status is not 200 or the request itself failed (connection
    error, timeout, ...). Request failures are also printed.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the cell.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None
    return response.text if response.status_code == 200 else None

def extract_chapters(soup):
    """Extract the chapters numbered I to LXXIV from the parsed page.

    Each chapter is located through an <a id="<roman numeral>"> anchor; the
    first <h3> after the anchor gives the title. The text is gathered from
    the siblings following that heading - <p> and <div> elements that either
    have no class or have exactly the class 'poem' - until an <h3> carrying
    the 'padtop' class is reached.

    Returns a list of (roman_numeral, title, content) tuples; numerals
    without an anchor or heading are skipped.
    """
    chapters = []
    for number in range(1, 75):  # Roman numerals from I to LXXIV
        roman_num = toRoman(number)
        anchor = soup.find('a', id=roman_num)
        if not anchor:
            continue
        heading = anchor.find_next('h3')
        if not heading:
            continue

        title = heading.get_text(strip=True).split('<br>')[-1].strip()

        pieces = []
        node = heading.find_next_sibling()
        while node:
            if node.name == 'h3' and 'padtop' in node.get('class', []):
                # The next chapter heading ends the current chapter.
                break
            if node.name in ['p', 'div'] and node.get('class') in [None, ['poem']]:
                pieces.append(node.get_text(strip=True))
            node = node.find_next_sibling()

        chapters.append((roman_num, title, '\n'.join(pieces)))
    return chapters

def save_chapters(chapters, directory):
    """Write each (roman_numeral, title, content) chapter to its own file.

    The directory is created if it does not exist. Files are named
    '<NN>-<title>.txt', where NN is the chapter number converted from its
    Roman numeral and zero-padded to two digits, and the title is
    lower-cased with spaces turned into hyphens and curly quotes, commas and
    question marks removed.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    # One pass: space -> hyphen, listed punctuation deleted.
    cleanup = str.maketrans({' ': '-', '’': None, '“': None, '”': None, ',': None, '?': None})

    for chapter in chapters:
        chapter_num = fromRoman(chapter[0])
        title = chapter[1].lower().translate(cleanup)
        target = os.path.join(directory, f"{chapter_num:02d}-{title}.txt")
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(chapter[2])

# URL of the HTML version of the book
url = 'https://www.gutenberg.org/cache/epub/29939/pg29939-images.html'

# Fetch HTML content from the URL
html_content = fetch_html(url)

# fetch_html() returns None when the page could not be retrieved, and
# BeautifulSoup(None) raises a TypeError, so only continue on success.
if html_content:
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract chapters
    chapters = extract_chapters(soup)

    # Directory to save chapters
    dir_path = './fairy-tales-chinese'

    # Save the chapters
    save_chapters(chapters, dir_path)
else:
    print("Failed to fetch the webpage.")


In [16]:
import re

def roman_to_arabic(roman):
    """Convert an upper-case Roman numeral string to an integer.

    The numeral is read from right to left: a symbol smaller than the one
    to its right is subtracted, any other symbol is added. An empty string
    gives 0; a character that is not a Roman symbol raises KeyError.
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    right_to_left = [symbol_values[symbol] for symbol in reversed(roman)]
    total = 0
    # Pair every value with its right-hand neighbour (0 for the last symbol).
    for value, right_neighbour in zip(right_to_left, [0] + right_to_left):
        total += -value if value < right_neighbour else value
    return total

def extract_stories(text):
    """Extract the stories numbered I to LXXIV from the plain-text book.

    A story is the text between a line holding its Roman numeral followed by
    a blank line and the same kind of line for the next numeral (for the
    last story: the phrase "End of the book").

    Returns a dict mapping Roman numeral -> stripped story text; numerals
    for which no match is found are left out.
    """
    stories = {}
    for number in range(1, 75):  # Assuming the range of chapters is from I to LXXIV
        roman_num = toRoman(number)
        if number < 74:
            terminator = toRoman(number + 1)
        else:
            terminator = "End of the book"

        # Look-behind / look-ahead keep both delimiters out of the match.
        opening = r'(?<=\b' + re.escape(roman_num) + r'\b\n\n)'
        closing = r'(?=\b' + re.escape(terminator) + r'\b\n\n)'
        found = re.search(opening + r'(.*?)' + closing, text, re.DOTALL)
        if found:
            stories[roman_num] = found.group(0).strip()

    return stories

# Read the locally saved plain-text book
with open('./the-chinese-fairy-tales.txt', 'r', encoding='utf-8') as file:
    text_content = file.read()

# Extract stories from the text (dict: Roman numeral -> story text)
extracted_stories = extract_stories(text_content)

# Display the first five stories to check the extraction by eye
for roman_num, story in list(extracted_stories.items())[:5]:
    print(f"Story {roman_num}:")
    print(story[:500])  # Displaying only the first 500 characters for brevity
    print("\n" + "-"*50 + "\n")


Story I:
WOMEN'S WORDS PART FLESH AND BLOOD


Once upon a time there were two brothers, who lived in the same house.
And the big brother listened to his wife's words, and because of them
fell out with the little one. Summer had begun, and the time for
sowing the high-growing millet had come. The little brother had no
grain, and asked the big one to loan him some, and the big one ordered
his wife to give it to him. But she took the grain, put it in a large
pot and cooked it until it was done. Then she gav

--------------------------------------------------

Story II:
THE THREE RHYMSTERS


Once there were three daughters in a family. The oldest one married a
physician, the second one married a magistrate; but the third, who was
more than usually intelligent and a clever talker, married a farmer.

Now it chanced, once upon a time, that their parents were celebrating
a birthday. So the three daughters came, together with their husbands,
to wish them long life and happiness. The parents-in-

In [17]:
# Reload the locally saved plain-text book into text_content.
with open('./the-chinese-fairy-tales.txt', 'r', encoding='utf-8') as file:
    text_content = file.read()

In [24]:
import re

def extract_chapter_names(text):
    """Return "<numeral> <name>" strings for every numbered heading in *text*.

    A heading is a run of Roman-numeral letters standing as a whole word,
    followed by a blank line; the name is the rest of the text up to the
    next newline, stripped of surrounding whitespace.
    """
    heading = re.compile(r'\b([IVXLCDM]+)\b\n\n(.*?)\n', re.DOTALL)
    return [
        f"{found.group(1)} {found.group(2).strip()}"
        for found in heading.finditer(text)
    ]

def extract_chapter_content(text, chapter_names):
    """Map each chapter name to the text that follows it in *text*.

    The content of a chapter is what lies between "<name>" plus a blank line
    and a blank line plus the next chapter's name; for the last chapter it
    is everything after "<name>" plus a blank line. Content is stripped.

    Returns a dict name -> content; chapters whose pattern does not occur in
    *text* are left out.
    """
    chapters_content = {}
    last_index = len(chapter_names) - 1

    for index, current_chapter in enumerate(chapter_names):
        following = chapter_names[index + 1] if index < last_index else None
        head = re.escape(current_chapter)
        if following:
            # Lazy match: stop at the first occurrence of the next heading.
            pattern = head + r'\n\n(.*?)\n\n' + re.escape(following)
        else:
            pattern = head + r'\n\n(.*)'

        found = re.search(pattern, text, re.DOTALL)
        if found:
            chapters_content[current_chapter] = found.group(1).strip()
    return chapters_content

# Read the locally saved plain-text book
with open('./the-chinese-fairy-tales.txt', 'r', encoding='utf-8') as file:
    text_content = file.read()

# Extracting chapter names ("<numeral> <name>" strings)
chapter_names = extract_chapter_names(text_content)

# Extracting content for each chapter (dict: chapter name -> text)
chapters_content = extract_chapter_content(text_content, chapter_names)

# Display every extracted chapter's content to check (the [:] slice keeps all items)
for chapter, content in list(chapters_content.items())[:]:
    print(f"Chapter {chapter}:")
    print(content[:500])  # Displaying only the first 500 characters for brevity
    print("\n" + "-"*50 + "\n")


In [25]:
def extract_stories(text, chapter_names):
    """Slice *text* into stories using the chapter names as delimiters.

    A story starts right after the first occurrence of its chapter name and
    ends where the next chapter's name occurs after that point (or at the
    end of *text* for the last chapter, or when the next name does not occur
    again). Each story is stripped of surrounding whitespace.

    Returns a dict chapter name -> story text. Chapter names that do not
    occur in *text* are left out.
    """
    stories = {}
    last_index = len(chapter_names) - 1

    for i, chapter_name in enumerate(chapter_names):
        heading_at = text.find(chapter_name)
        if heading_at == -1:
            # str.find() signals "not found" with -1; using that as an index
            # would silently produce a meaningless slice.
            continue
        start_index = heading_at + len(chapter_name)

        end_index = len(text)
        if i < last_index:
            # Search only after the current heading so an earlier mention of
            # the next name cannot put the end before the start.
            next_at = text.find(chapter_names[i + 1], start_index)
            if next_at != -1:
                end_index = next_at

        stories[chapter_name] = text[start_index:end_index].strip()

    return stories

# Extracting stories; text_content and chapter_names come from earlier cells
extracted_stories = extract_stories(text_content, chapter_names)

# Display the first two stories to check the extraction by eye
for chapter, story in list(extracted_stories.items())[:2]:
    print(f"Chapter {chapter}:")
    print(story[:500])  # Displaying only the first 500 characters for brevity
    print("\n" + "-"*50 + "\n")


Chapter I WOMEN'S WORDS PART FLESH AND BLOOD:
1

--------------------------------------------------

Chapter II THE THREE RHYMSTERS:
4

        III HOW GREED FOR A TRIFLING THING LED A MAN TO LOSE A
              GREAT ONE                                              6

         IV WHO WAS THE SINNER?                                      9

          V THE MAGIC CASK                                          10

         VI THE FAVORITE OF FORTUNE AND THE CHILD OF ILL LUCK       11

        VII THE BIRD WITH NINE HEADS                                13

       VIII THE CAVE OF THE BEASTS                                  17

  

--------------------------------------------------



In [48]:
# Reload the locally saved plain-text book into text_content.
with open('./the-chinese-fairy-tales.txt', 'r', encoding='utf-8') as file:
    text_content = file.read()

In [49]:
# Extracting chapter titles from the text content

def extract_chapter_titles(text):
    """Return the titles that follow Roman-numeral words in *text*.

    A title is the text between a whole-word run of Roman-numeral letters
    plus one whitespace character and the next run of two or more
    whitespace characters. The numeral itself is discarded and each title
    is stripped of surrounding whitespace.
    """
    entry = re.compile(r'\b([IVXLCDM]+)\b\s(.*?)\s{2,}', re.DOTALL)
    return [found.group(2).strip() for found in entry.finditer(text)]

# Extracting chapter titles from the text loaded in the previous cell
chapter_titles = extract_chapter_titles(text_content)

# Display the extracted chapter titles (as the last expression of the cell)
chapter_titles[:10]  # Displaying only the first 10 titles for brevity



["WOMEN'S WORDS PART FLESH AND BLOOD",
 'THE THREE RHYMSTERS',
 'HOW GREED FOR A TRIFLING THING LED A MAN TO LOSE A',
 'WHO WAS THE SINNER?',
 'THE MAGIC CASK',
 'THE FAVORITE OF FORTUNE AND THE CHILD OF ILL LUCK',
 'THE BIRD WITH NINE HEADS',
 'THE CAVE OF THE BEASTS',
 'THE PANTHER',
 'THE GREAT FLOOD']

In [50]:
# Keep only the first 74 matches - presumably the table-of-contents entries,
# since the other cells treat the book as chapters I to LXXIV; confirm.
chapter_titles = chapter_titles[:74]

In [56]:
import os

# Directory where the files will be saved
dir_path = './fairy-tales-chinese'

# Create the directory (and any missing parents) if it doesn't exist.
# exist_ok=True makes this a single call with no gap between checking for
# the directory and creating it.
os.makedirs(dir_path, exist_ok=True)

# Function to create empty txt files based on chapter titles
def create_empty_files(chapter_titles, directory):
    """
    Create one empty, numbered .txt file per chapter title in `directory`.

    Each file is named `NN-slug.txt`, where NN is the 1-based position of
    the title padded to at least two digits, and slug is the lower-cased
    title with spaces turned into hyphens and apostrophes, commas and
    question marks removed. Existing files with the same name are emptied.
    """
    # Spaces become hyphens; apostrophes, commas and question marks are dropped
    cleanup = str.maketrans({' ': '-', "'": None, ',': None, '?': None})

    for number, title in enumerate(chapter_titles, start=1):
        slug = title.lower().translate(cleanup)
        target = os.path.join(directory, f'{number:02d}-{slug}.txt')
        # Opening in write mode is enough to create (or truncate) the file
        with open(target, 'w', encoding='utf-8'):
            pass

# Creating empty txt files for each chapter title
create_empty_files(chapter_titles, dir_path)

# Code to display the created file names for confirmation (not executed in this environment)
# print(os.listdir(dir_path))



In [45]:
# Adjusted function to create empty txt files with numbering like "01-xxx.txt"

def create_numbered_empty_files(chapter_titles, directory):
    """
    Create empty files named like "01-xxx.txt" for every chapter title.

    The number is the 1-based position of the title, zero-padded to two
    digits. The rest of the name is the lower-cased title without
    apostrophes, commas or question marks and with spaces replaced by
    hyphens. Files are created inside `directory`.
    """
    dropped = "',?"

    for position, title in enumerate(chapter_titles, start=1):
        # Remove the unwanted punctuation first, then hyphenate the spaces
        kept = ''.join(ch for ch in title.lower() if ch not in dropped)
        slug = kept.replace(' ', '-')
        target = os.path.join(directory, f"{position:02d}-{slug}.txt")
        # Write mode creates the file, or empties it if it already exists
        with open(target, 'w', encoding='utf-8'):
            pass

# Creating numbered empty txt files for each chapter title
create_numbered_empty_files(chapter_titles, dir_path)

# Code to display the created file names for confirmation (not executed in this environment)
# print(os.listdir(dir_path))



In [52]:
def extract_story_content(text, chapter_titles):
    """
    Slice `text` into stories keyed by chapter title.

    Each title is expected to occur at least twice in `text`: once in the
    table of contents and once as the heading of the story itself. A story
    runs from just after its heading to the heading of the next title, or
    to the end of `text` for the last title.

    Args:
        text: The full book text.
        chapter_titles: Titles in reading order.

    Returns:
        A dict mapping each title to its stripped story text, or to the
        string "Story content not found." when the story heading or the
        following heading cannot be located.
    """
    not_found = "Story content not found."
    titles = list(chapter_titles)
    stories = {}

    for i, current_title in enumerate(titles):
        next_title = titles[i + 1] if i + 1 < len(titles) else None

        # The first occurrence is the table-of-contents entry; the story
        # heading is the next occurrence after it.
        toc_index = text.find(current_title)
        if toc_index == -1:
            stories[current_title] = not_found
            continue
        start_index = text.find(current_title, toc_index + 1)
        if start_index == -1:
            stories[current_title] = not_found
            continue

        body_start = start_index + len(current_title)

        # Look for the next title only AFTER this story's heading. Searching
        # from the beginning would hit the next title's table-of-contents
        # entry, which lies before the story and yields an empty slice.
        if next_title:
            end_index = text.find(next_title, body_start)
        else:
            end_index = len(text)

        if end_index == -1:
            stories[current_title] = not_found
            continue

        stories[current_title] = text[body_start:end_index].strip()

    return stories

# Assuming 'text_content' contains the content of the file and 'chapter_titles' is the list of chapter titles
# Extracted stories can be accessed from the 'stories' dictionary by their titles
stories = extract_story_content(text_content, chapter_titles)

# Code to save the stories into separate files, using the chapter titles as filenames
# NOTE(review): these names have no number prefix, keep apostrophes, commas
# and question marks, and are written to the current working directory, so
# they do not match the files made by create_empty_files /
# create_numbered_empty_files in `dir_path` - confirm which layout is wanted.
for title, story in stories.items():
    file_name = title.lower().replace(' ', '-') + '.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(story)


In [34]:
stories.keys()

dict_keys(["WOMEN'S WORDS PART FLESH AND BLOOD", 'THE THREE RHYMSTERS', 'HOW GREED FOR A TRIFLING THING LED A MAN TO LOSE A', 'WHO WAS THE SINNER?', 'THE MAGIC CASK', 'THE FAVORITE OF FORTUNE AND THE CHILD OF ILL LUCK', 'THE BIRD WITH NINE HEADS', 'THE CAVE OF THE BEASTS', 'THE PANTHER', 'THE GREAT FLOOD', 'THE FOX AND THE TIGER', "THE TIGER'S DECOY", 'THE FOX AND THE RAVEN', 'WHY DOG AND CAT ARE ENEMIES', 'HOW THE FIVE ANCIENTS BECAME MEN', 'THE HERD BOY AND THE WEAVING MAIDEN', 'YANG OERLANG', 'NOTSCHA', 'THE LADY OF THE MOON', 'THE MORNING AND THE EVENING STAR', "THE GIRL WITH THE HORSE'S HEAD OR THE SILKWORM", 'THE QUEEN OF HEAVEN', 'THE FIRE-GOD', 'THE THREE RULING GODS', 'A LEGEND OF CONFUCIUS', 'THE GOD OF WAR', 'THE HALOS OF THE SAINTS', 'LAOTSZE', 'THE ANCIENT MAN', 'THE EIGHT IMMORTALS (I)', 'THE EIGHT IMMORTALS (II)', 'THE TWO SCHOLARS', 'THE MISERLY FARMER', "SKY O'DAWN", 'KING MU OF DSCHOU', 'THE KING OF HUAI NAN', 'OLD DSCHANG', 'THE KINDLY MAGICIAN', 'THE FLOWER-ELVES', 

In [58]:
"""
Visualizing the storyboard for stories with infinite lengths,
"""

import io
import os
import re
import json
import torch
import openai
import argparse
import gradio as gr
from PIL import Image

from typing import Dict
from pathlib import Path
from datetime import datetime
from pydantic import BaseModel, Field
from pydantic.json import pydantic_encoder
from diffusers import StableDiffusionXLPipeline
from concurrent.futures import ThreadPoolExecutor

from langchain.llms import OpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter


# #########################
# Set the paths
# #########################
# Setting the default values for prompts
# To update them later in the gradio webpage
model_name = 'gpt-4-1106-preview'
chunk_size = 1500
chunk_overlap = 10

# Read the API key from the environment. The key itself is never printed:
# notebook output is saved with the file, so echoing it would leak the
# credential to anyone who can read the notebook.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    print('Warning: OPENAI_API_KEY is not set.')

# Set the prompt paths for summary and storyboard
instruction_root = Path('./prompts/instructions')
summary_question_prompt_path = instruction_root / 'summary_question.txt'
summary_refine_prompt_path = instruction_root / 'summary_refine.txt'
storyboard_map_prompt_path = instruction_root / 'storyboard_map.txt'


# #########################
# Helper functions
# #########################
# Text processing
def load_text(file_path):
    """
    Return the whole content of the text file at `file_path`.

    The file is decoded as UTF-8 explicitly, matching the other file reads
    in this notebook, instead of relying on the platform default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def save_text(text, file_path):
    """
    Write `text` followed by a newline to `file_path`, replacing any
    existing content.

    The file is encoded as UTF-8 explicitly, matching the other file
    writes in this notebook, instead of relying on the platform default.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text + '\n')


def format_scene(storyboard_result):
    """
    Turn the per-chunk storyboard outputs into one numbered script.

    This is a temporary solution, to be replaced by Pydantic models later.

    storyboard_result (dict): a chain map reduce dict; only its
        'intermediate_steps' entry (an iterable of strings) is read.

    Returns the steps joined with divider lines, with every '[Scene]'
    marker rewritten to '[Scene 1]', '[Scene 2]', ... in order.
    """
    # Put a divider line in front of every intermediate step
    pieces = ['\n---\n\n' + step + '\n'
              for step in storyboard_result['intermediate_steps']]
    combined = ''.join(pieces)

    # Everything before the first marker is kept as is; each later part
    # gets a running scene number in place of the bare marker
    preamble, *scene_bodies = combined.split('[Scene]')
    numbered = [f'[Scene {number}]{body}'
                for number, body in enumerate(scene_bodies, start=1)]

    return preamble + ''.join(numbered)


# #########################
# Split the docs
# #########################
def get_split_docs(story_path: str='./story.txt',
                   chunk_size: int=1500,
                   chunk_overlap: int=0):
    """
    Load the story at `story_path` and split it into chunks.

    The file is read with `TextLoader` and split with
    `RecursiveCharacterTextSplitter` using the given `chunk_size` and
    `chunk_overlap`. Returns whatever `split_documents` produces.
    """

    # Read the story through the text loader
    documents = TextLoader(story_path).load()

    # Configure the splitter, then chunk the loaded story
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


# #########################
# Get the results
# #########################
def get_summary_storyboard(model_name: str='gpt-4-1106-preview',
                           summary_question_prompt_path: str='',
                           summary_refine_prompt_path: str='',
                           storyboard_map_prompt_path: str='',
                           # summary_save_path: str='',
                           # storyboard_save_path: str='',
                           split_docs: tuple=None):
    """
    Produce a summary and a storyboard script for the given story chunks.

    Two summarize chains are run on `split_docs` with the same chat model:
    a 'refine' chain whose 'output_text' becomes the summary, and a
    'map_reduce' chain whose intermediate steps are formatted into the
    storyboard by `format_scene`.

    Args:
        model_name: Model name passed to `ChatOpenAI`.
        summary_question_prompt_path: File with the refine chain's
            question prompt template.
        summary_refine_prompt_path: File with the refine chain's refine
            prompt template.
        storyboard_map_prompt_path: File with the map_reduce chain's map
            prompt template.
        split_docs: The story chunks fed to both chains as
            'input_documents'.

    Returns:
        A `(summary, storyboard)` tuple.

    The API key is read from the module-level `openai_api_key`. Saving the
    results to files is currently commented out.
    """

    ##### Get the summary #####
    # Set the chat model
    chat_model = ChatOpenAI(model_name=model_name,
                            openai_api_key=openai_api_key)

    # Load the prompt
    summary_question_prompt = load_text(summary_question_prompt_path)
    summary_refine_prompt = load_text(summary_refine_prompt_path)

    # Set the prompts
    question_prompt = PromptTemplate.from_template(summary_question_prompt)
    refine_prompt = PromptTemplate.from_template(summary_refine_prompt)

    # Run the chain
    summary_chain = load_summarize_chain(
        llm=chat_model,
        chain_type='refine',
        question_prompt=question_prompt,
        refine_prompt=refine_prompt,
        return_intermediate_steps=True,
        input_key='input_documents',
        output_key='output_text',
    )

    # Get the results
    print('Getting the summary for characters and environments.')
    print('This may take a while as i am constantly refining results.')
    summary_result = summary_chain({'input_documents': split_docs})
    summary = summary_result['output_text']


    ##### Get the storyboard #####
    # Load the prompt
    storyboard_map_prompt = load_text(storyboard_map_prompt_path)

    # Set the prompts
    map_prompt = PromptTemplate.from_template(storyboard_map_prompt)
    combine_prompt = PromptTemplate.from_template('Provide an overall summary of {text}.')

    # Run the chain
    # return_intermediate_steps=True is required here: format_scene reads
    # the per-chunk outputs from 'intermediate_steps'.
    storyboard_chain = load_summarize_chain(
        llm=chat_model,
        chain_type='map_reduce',
        map_prompt=map_prompt,
        combine_prompt=combine_prompt,
        return_intermediate_steps=True,
        input_key='input_documents'
    )

    # Get the results
    # NOTE(review): 'summary' is passed as an extra chain input; presumably
    # the map prompt template has a {summary} placeholder - verify against
    # the storyboard_map.txt prompt file.
    print('Getting the storyboard script.')
    print('i am faster this time since it is fine to be parallel here.')
    storyboard_result = storyboard_chain(
        {'input_documents': split_docs,
         'summary': summary})
    storyboard = format_scene(storyboard_result)

    # Saving is disabled for now; the results are only returned
    print('All done!')
    # save_text(summary, summary_save_path)
    # save_text(storyboard, storyboard_save_path)

    return summary, storyboard


# ################################
# Helper Functions
# ################################
def extract_prompts_from_content(content: str,
                                 keyword: str='Text-to-Image Prompt'):
    """
    Get the text to image prompts from a string,
    allowing for various formats of the keyword.

    A line contributes a prompt when it contains the keyword (matched
    case-insensitively), optionally wrapped in square brackets and
    optionally preceded by a hyphen, followed by a colon. The prompt is
    the rest of the line after the colon, stripped of surrounding
    whitespace.

    Args:
        content: The text to scan, line by line.
        keyword: The label that introduces a prompt.

    Returns:
        The prompts in the order they appear; an empty list if none match.
    """
    # Whitespace before the keyword is optional (`\s*`). The previous
    # pattern demanded one literal space there, so a line that started
    # directly with the keyword or its opening bracket was skipped.
    pattern = re.compile(rf'-?\s*\[?\s*{re.escape(keyword)}\s*\]?:\s*(.*)',
                         re.IGNORECASE)

    # Search each line and keep the captured prompt of every line that matches
    matches = (pattern.search(line) for line in content.splitlines())
    return [match.group(1).strip() for match in matches if match]

#
# def extract_prompts(file_path: str='',
#                     keyword: str='Text-to-Image Prompt'):
#     """
#     Get the text to image prompts from the file,
#     allowing for various formats of the keyword.
#     """
#     prompts = []
#
#     with open(file_path, 'r') as file:
#         content = file.readlines()
#         pattern = re.compile(rf'-? \[?\s*{re.escape(keyword)}\s*\]?:\s*(.*)',
#                              re.IGNORECASE)
#
#         for line in content:
#             match = pattern.search(line)
#             if match:
#                 prompt = match.group(1).strip()
#                 prompts.append(prompt)
#
#     return prompts


# ################################
# Generate the images
# ################################
def generate_images(story_file):
    """
    Generate one image per storyboard prompt with Stable Diffusion XL.

    The story is split into chunks, summarized and turned into a
    storyboard, the text-to-image prompts are pulled out of the
    storyboard, and each prompt is rendered with the SDXL pipeline on
    CUDA.

    Args:
        story_file: The uploaded file object; its `.name` attribute is
            used as the path of the story text.

    Returns:
        A list with one generated image per prompt. An empty list is
        returned when no file is given, when no prompts are found, or
        when any step raises.
    """
    # Presumably None when the button is clicked without an upload; handle
    # it explicitly instead of relying on an AttributeError further down.
    if story_file is None:
        print('An error occurred: no story file was provided.')
        return []

    try:
        # Split stories into chunks
        print('Spliting the documents.')
        split_docs = get_split_docs(story_file.name,
                                    chunk_size,
                                    chunk_overlap)

        # Get the summary and the storyboard
        summary, storyboard = get_summary_storyboard(model_name,
                                                     summary_question_prompt_path,
                                                     summary_refine_prompt_path,
                                                     storyboard_map_prompt_path,
                                                     # summary_save_path,
                                                     # storyboard_save_path,
                                                     split_docs)

        prompts = extract_prompts_from_content(
            storyboard,
            'Text-to-Image Prompt')

        # Nothing to draw: skip loading the pipeline altogether
        if not prompts:
            print('No text-to-image prompts were found in the storyboard.')
            return []

        print("Loading pipeline...")
        pipeline = StableDiffusionXLPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16,
            variant='fp16',
            use_safetensors=True,
            custom_pipeline='lpw_stable_diffusion_xl').to('cuda')

        print("Pipeline loaded. Generating images...")
        images = []
        for i, prompt in enumerate(prompts):
            print(f"Generating image {i+1}/{len(prompts)}")
            print(prompt)
            image = pipeline(prompt=prompt).images[0]
            images.append(image)
            # Release cached GPU memory between generations
            torch.cuda.empty_cache()

        print("Image generation completed.")
        return images

    except Exception as e:
        # UI callback boundary: report the failure and show an empty gallery
        print(f'An error occurred: {e}')
        return []


# ################################
# The Gradio Interface
# ################################
# Build the page: a file upload and a button on the left, a gallery for the
# generated images on the right.
with gr.Blocks() as demo:
    # NOTE(review): placeholder page title
    gr.Markdown("# Your Gradio App Title") 
    
    with gr.Row():
        with gr.Column():
            storyboard_file = gr.File(label="Upload your story here.")
            generate_btn = gr.Button("Visualize your stories now!")
        with gr.Column():
            gallery = gr.Gallery(label="Generated Images",
                                 show_label=False,
                                 elem_id='gallery',
                                 columns=[4],
                                 rows=[4],
                                 object_fit="contain",
                                 height="auto")

    # Clicking the button runs generate_images on the uploaded file and
    # shows the returned images in the gallery
    generate_btn.click(generate_images,
                       inputs=storyboard_file,
                       outputs=gallery)


# share=True also serves the app on a public gradio.live URL (see the
# saved output of this cell)
if __name__ == '__main__':
    demo.queue().launch(share=True, debug=True)


[REDACTED: an OpenAI API key was printed here; revoke and rotate that key]
Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://60711fd913f47609d8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://60711fd913f47609d8.gradio.live


In [60]:
# Rebuild the page with its real heading: a file upload and a button on the
# left, a gallery for the generated images on the right.
with gr.Blocks() as demo:
    gr.Markdown('# Turn your stories into comics!') 
    gr.Markdown('(The movie option will soon be updated.)')
    
    with gr.Row():
        with gr.Column():
            storyboard_file = gr.File(label="Upload your story here.")
            generate_btn = gr.Button("Visualize your stories now!")
        with gr.Column():
            gallery = gr.Gallery(label="Generated Images",
                                 show_label=False,
                                 elem_id='gallery',
                                 columns=[4],
                                 rows=[4],
                                 object_fit="contain",
                                 height="auto")

    # Clicking the button runs generate_images on the uploaded file and
    # shows the returned images in the gallery
    generate_btn.click(generate_images,
                       inputs=storyboard_file,
                       outputs=gallery)


# share=True also serves the app on a public gradio.live URL (see the
# saved output of this cell)
if __name__ == '__main__':
    demo.queue().launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://52d95074f21b126b59.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://52d95074f21b126b59.gradio.live
