In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# text file contains html collected from four pages of American Rhetoric containing links to speeches
f = open("/kaggle/input/american-rhetoric-html/american-rhetoric-html.txt", "r").read()

In [3]:
# create dictionary to be converted into DataFrame
data = {'Link':[], 'Author':[], 'Date': [], 'Content':[]}

# search for links in HTML
links = []
for item in f.split('href="'):
    # if link begins with 'speeches' (i.e. americanrhetoric.com/speeches/something.html)
    if item[:8] == 'speeches':
        # add url
        links.append('https://www.americanrhetoric.com/'+item.split('">')[0])

In [4]:
# there are 1,692 total speeches
links

['https://www.americanrhetoric.com/speeches/1is2manycampusassaultpsa.htm',
 'https://www.americanrhetoric.com/speeches/aaronswartzf2cconference.htm',
 'https://www.americanrhetoric.com/speeches/gettysburgaddress.htm',
 'https://www.americanrhetoric.com/speeches/abbott&amp;costellowhosonfirst.htm',
 'https://www.americanrhetoric.com/speeches/gettysburgaddress.htm',
 'https://www.americanrhetoric.com/speeches/abdullatifbinrashidalzayaniabrahamaccordssigning.htm',
 'https://www.americanrhetoric.com/speeches/abdullatifbinrashidalzayaniflight973airportceremony.htm',
 'https://www.americanrhetoric.com/speeches/abdullahbinzayedalnahyanabrahamaccordssigning.htm',
 'https://www.americanrhetoric.com/speeches/abdullatifbinrashidalzayaniabrahamaccordssigning.htm',
 'https://www.americanrhetoric.com/speeches/abrahamlincolnmissouricompromiserepeal.htm',
 'https://www.americanrhetoric.com/speeches/abrahamlincolnhousedivided.htm',
 'https://www.americanrhetoric.com/speeches/abrahamlincolncooperunionad

In [5]:
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup

# collect raw text from the website
def get_content_from_website(url):
    try:
        fp = urllib.request.urlopen(url)
        mybytes = fp.read()

        mystr = mybytes.decode("utf-8", 'ignore').encode('utf-8')
        fp.close()

        html = mystr
        soup = BeautifulSoup(html, features="html.parser")
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()

        return text
    except: 
        return False

    
# custom function to extract information from the text from the website by selecting relevant material
# this custom function is highly specific to the American Rhetoric website
def extract_info(text):
    
    # if the output of the content retrieval is False (there was an error), return False
    if text == False: return False
    
    # remove extra whitespace and lines
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ' '.join(chunk for chunk in chunks if chunk).strip()
    
    # remove extra symbols
    text = text.replace('\n','')
    text = text.replace('\t','')
    text = text.replace('\xa0','')
    
    # retrieve date
    date = 'No Date'
    if 'delivered' in text:
        date = text.split('delivered')[1].split(',')[0].strip()
    elif 'Delivered' in text:
        date = text.split('Delivered')[1].split(',')[0].strip()
        
    # retrieve author
    if 'American Rhetoric: ' in text:
        text = text.split('American Rhetoric: ')[1]
    author = text.split('-')[0].strip()
    if ':' in author: author = author.split(':')[0].strip()
    
    # trim beginning of text
    for keyphrase in ['AUTHENTICITY CERTIFIED: ', 'from audio.] ', 'from audio] ', 'from audio. (2)]', 'support the audio element.']:
        if keyphrase in text:
            text = text.split(keyphrase)[1]
    
    # trim ending of text
    for keyphrase in ['Book/CDs by', 'Original Text Source:', 'Page Updated:', 'Also in this database:']:
        if keyphrase in text:
            text = text.split(keyphrase)[0]
    
    # do final trimming
    text = text.strip()
    
    return text, date, author

In [6]:
# loop through data, collect author, date, and content
for link in tqdm(links):
    unparsed_content = get_content_from_website(link)
    if unparsed_content == False: continue
    text, date, author = extract_info(unparsed_content)
    data['Link'].append(link)
    data['Author'].append(author)
    data['Date'].append(date)
    data['Content'].append(text)

  0%|          | 0/1692 [00:00<?, ?it/s]

In [7]:
print(len(data['Author']))
print(len(data['Date']))
print(len(data['Content']))
print(len(data['Link']))

1652
1652
1652
1652


In [8]:
# put in DataFrame and save
data = pd.DataFrame(data)
data.to_csv('american-rhetoric-speeches.csv')