# Experiments with Text Extraction

## Imports

General imports

In [40]:
import os
import random
import requests
import sys

from bs4 import BeautifulSoup


Enable imports within the project

In [45]:
# Put the parent of the current working directory on sys.path (once) so that
# packages living one level up can be imported from this notebook.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

import lcats.constants as constants
import lcats.utils as utils
import lcats.gatherers.downloaders as downloaders
import lcats.gatherers.extractors as extractors
import lcats.gatherers.lovecraft.gutenberg as lovecraft

In [46]:
# Re-import the project modules so local edits to them take effect without
# restarting the kernel.
if True:  # Currently ON: runs every time this cell executes; change to False to skip reloading.
    from importlib import reload
    reload(downloaders)
    reload(lovecraft)
    reload(extractors)


## Fixing the Lovecraft Imports

In [None]:
# Ask the project's downloader module for the encoding of one Project Gutenberg
# HTML page; the return value is shown as the cell output.
# NOTE(review): presumably this issues a network request — confirm in lcats.gatherers.downloaders.
downloaders.get_page_encoding('https://www.gutenberg.org/cache/epub/68283/pg68283-images.html')

In [None]:
# Run the Lovecraft gatherer module's entry point.
# NOTE(review): presumably this downloads and stores every configured story — confirm before re-running.
lovecraft.main()

In [None]:
# Short alias for the module's story table; the cell output is its entry count.
lovecraft_files = lovecraft.THE_LOVECRAFT_FILES
len(lovecraft_files)

In [None]:
# Peek at a single entry of the story table.
# NOTE(review): other cells index this object by story name (lovecraft_files[story],
# lovecraft_files[story_title]); an integer index only works if it is a sequence or a
# mapping that has the key 0 — confirm which shape THE_LOVECRAFT_FILES currently has.
lovecraft_files[0]

## Find Broken URLs

In [None]:
# Fetch and parse every entry of the story table, printing a short preview of each
# so that pages which did not come back as expected stand out in the output.
# Each (story, url, contents, soup) tuple is kept in `downloads` for later inspection.
downloads = []
for story in lovecraft_files:
    # Print the story and its URL *before* fetching, so that if the fetch raises,
    # the last lines of output identify the entry that failed.
    print("Story:", story)
    url = lovecraft_files[story] 
    print(" - url:", url)
    contents = downloaders.load_page(url)
    # Preview only the first 100 items of the payload.
    # NOTE(review): assumes load_page returns something sliceable (str/bytes), never None — confirm.
    print(" - contents:", contents[:100])
    soup = BeautifulSoup(contents, "lxml")
    # soup.title is None when the parsed document has no <title> element.
    if soup.title:
        print(" - title:", soup.title.string)
    else:
        print(" - title: None")
    print()
    downloads.append((story, url, contents, soup))
print(len(downloads))


## Fix Gathering for Known URLs

In [None]:
# Story under test. Its name is used as the key into the story table and is also
# passed as the heading text and description of the download callback below.
story_title = 'the_call_of_cthulhu'
story_url = lovecraft_files[story_title]

# Fetch the page directly to see which encoding `requests` reports for it
# (requests derives `response.encoding` from the HTTP response headers).
# An explicit timeout keeps this cell from hanging indefinitely if the server
# stops responding; requests applies no timeout unless one is given.
response = requests.get(story_url, timeout=30)
print(f"Detected encoding: {response.encoding}")
story_encoding = response.encoding

# Load the page again through the project's downloader, forcing the encoding
# that was just reported.
story_content = downloaders.load_page(story_url, encoding=story_encoding)

# NOTE(review): presumably builds the function that extracts the story text from
# the downloaded page, starting at the heading `start_heading_text` — confirm in
# lcats.gatherers.lovecraft.gutenberg.
story_callback = lovecraft.create_download_callback(
    story_name=story_title,
    url=story_url,
    start_heading_text=story_title,
    description=story_title
)

# NOTE(review): presumably the first argument names the corpus/output location
# used by the gatherer — confirm in downloaders.DataGatherer.
gatherer = downloaders.DataGatherer(
    "lovecraft",
    description="H.P. Lovecraft stories",
    license="Public Domain",
)

In [None]:
# Hand the selected story to the gatherer together with its extraction callback.
# NOTE(review): presumably this fetches the URL and stores whatever the callback
# returns — confirm in downloaders.DataGatherer.download.
gatherer.download(story_title, story_url, story_callback)

In [None]:
# Element ids that bracket the region to extract; they are passed as the
# start/end ids to extract_tags_between_ids in this cell.
# NOTE(review): presumably the markers Project Gutenberg places around the book
# text in its HTML editions — confirm against a downloaded page.
START_SEPARATOR = 'pg-start-separator'
END_SEPARATOR = 'pg-end-separator'
# Tag names whose elements are kept as story content: paragraphs and headings.
CONTENT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

def extract_tags_between_ids(soup, start_id, end_id, content_tags):
    """Collect the content elements located between two id-marked elements.

    The document is walked in parse order, starting with the element that
    follows the one whose ``id`` is ``start_id`` and stopping when the element
    whose ``id`` is ``end_id`` is reached. Neither marker is included.

    Args:
        soup: Parsed document. It must provide ``find(id=...)`` and return
            elements exposing ``name`` and ``find_next()`` (for example a
            ``BeautifulSoup`` object).
        start_id: ``id`` attribute of the element that opens the region.
        end_id: ``id`` attribute of the element that closes the region. If no
            such element exists, the walk continues to the end of the document.
        content_tags: Collection of tag names to keep (e.g. ``['p', 'h1']``).

    Returns:
        A list of the matching elements, in document order.

    Raises:
        ValueError: If the document has no element with ``id == start_id``.
    """
    start_tag = soup.find(id=start_id)
    if start_tag is None:
        # Fail with a message that names the missing marker instead of an
        # AttributeError on None.
        raise ValueError(f"No element with id {start_id!r} found in document")
    end_tag = soup.find(id=end_id)

    matching_tags = []
    current_tag = start_tag.find_next()
    # Compare by identity: element equality is structural (same name, attributes
    # and contents), so `!=` could end the walk early at an element that merely
    # looks like the end marker.
    while current_tag is not None and current_tag is not end_tag:
        if current_tag.name in content_tags:
            matching_tags.append(current_tag)
        current_tag = current_tag.find_next()

    return matching_tags


def extract_text_from_tags(tags, separator="\n\n"):
    """Join the visible text of the given elements into one string.

    Each element's text is taken with its inner pieces stripped and joined by
    a single space; elements whose text comes out empty are dropped.

    Args:
        tags: Iterable of elements providing ``get_text(separator, strip=...)``.
        separator: String placed between the texts of consecutive elements.

    Returns:
        The non-empty element texts joined by ``separator``.
    """
    stripped_texts = (element.get_text(" ", strip=True) for element in tags)
    return separator.join(text for text in stripped_texts if text)


# Parse the downloaded story and keep only the content tags located between
# the start and end separator ids, then join their text.
soup = BeautifulSoup(story_content, 'lxml')
matching_tags = extract_tags_between_ids(soup, START_SEPARATOR, END_SEPARATOR, CONTENT_TAGS)
matching_text = extract_text_from_tags(matching_tags)
print()
# Summary: number of tags kept and total length of the joined text.
print(len(matching_tags), len(matching_text))
print()
# First five tags, to eyeball where extraction starts.
print(matching_tags[:5])
print()
# NOTE(review): utils.sm presumably abbreviates the text to about 1000 characters
# for display — confirm in lcats.utils.
print(utils.sm(matching_text, 1000))



In [None]:
# Show the final 1000 characters of the extracted text to check where extraction stops.
print(matching_text[-1000:])

In [None]:
# Spot-check one extracted element chosen at random: print its tag name and its
# Python type. (random.choice raises IndexError if matching_tags is empty.)
example_tag = random.choice(matching_tags)
print(example_tag.name, type(example_tag))

In [None]:
# Experiment: does mutating a local name that was assigned from a module-level
# list also change the module-level list?
DEFAULT = [1, 2]


def deefault():
    """Append 3 to the list bound to ``DEFAULT`` and return that same list.

    Plain assignment does not copy, so the local name and ``DEFAULT`` refer to
    one list object and the append is visible through both.
    """
    alias = DEFAULT  # binds the same list object; no copy is made
    alias.append(3)
    return alias


# Both elements of the displayed tuple are the same, now longer, list.
deefault(), DEFAULT

In [38]:
import codecs

In [None]:
# Probe how the standard library reports an unknown encoding name: 'garbage' is
# not a standard codec name, so this lookup is expected to raise LookupError.
codecs.lookup('garbage')