# Load Packages

In [1]:
from urllib.parse import urljoin
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Class SpeechScraping: operations for retrieving the speech texts

In [2]:
class SpeechScraping(object):
    """Scrape speech metadata and transcripts from americanrhetoric.com.

    The instance is created with the URL of an index page. ``get_info`` downloads
    that page, extracts one (date, title, link) record per listed speech, then
    downloads each linked page and extracts its text.
    """

    # Site root that the speech links found on the index page are resolved against.
    WEBSITE = 'https://www.americanrhetoric.com/'

    def __init__(self, url):
        """Store the URL of the index page that lists the speeches."""
        self.url = url

    def get_html(self, url):
        """Download ``url`` and return the raw response body as bytes.

        Errors raised by ``urlopen`` are propagated to the caller.
        """
        # The context manager closes the connection even when read() raises;
        # without it every downloaded page leaves a response object open.
        with urlopen(url=url) as response:
            return response.read()

    def get_links(self, html):
        """Extract the date, title and hyperlink of every speech on the index page.

        Parameters
        ----------
        html : bytes or str
            Markup of the index page, as returned by ``get_html``.

        Returns
        -------
        pandas.DataFrame
            Columns ``dates``, ``titles`` and ``links``, one row per table row that
            contains both a date cell and a hyperlink.

        Raises
        ------
        ValueError
            If the page has no ``<table id="AutoNumber1">``.
        """
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id='AutoNumber1')
        if table is None:
            raise ValueError("speech index table (id='AutoNumber1') not found in the page")

        records = []
        for row in table.find_all('tr'):
            # Date cells are the ones rendered with <font face="Tahoma" size="4">.
            date_cell = row.find('font', attrs={'face': 'Tahoma', 'size': '4'})
            link = row.find('a', href=True)
            # Collect the three fields together so the columns can never get out
            # of step; a row lacking either the date or the link is skipped.
            if date_cell is None or link is None:
                continue
            title_tag = link.find('font')
            # Fall back to the anchor's own text when the title is not wrapped in <font>.
            title = (title_tag if title_tag is not None else link).get_text()
            records.append((date_cell.get_text(), title, link['href']))

        return pd.DataFrame(records, columns=['dates', 'titles', 'links'])

    def get_speech(self, url):
        """Download one speech page and return its text.

        Parameters
        ----------
        url : str
            Link taken from the index page; it is resolved against ``WEBSITE``.

        Returns
        -------
        str or float
            The text of every ``<font face="Verdana" size="2">`` element joined by
            single spaces, or ``numpy.nan`` when no such text is found.
        """
        # urljoin gives the same result as concatenation for relative links and
        # also copes with root-relative or already-absolute hrefs.
        page_url = urljoin(self.WEBSITE, url)
        soup = BeautifulSoup(self.get_html(url=page_url), 'lxml')

        fragments = []
        for tag in soup.find_all('font', attrs={'face': 'Verdana', 'size': '2'}):
            # Collapse every run of whitespace to one space. Deleting line breaks
            # outright would fuse the words on either side of a wrapped source line.
            text = ' '.join(tag.get_text().split())
            # Fragments starting with 'Book/CDs' are excluded from the speech text.
            if text and not text.startswith('Book/CDs'):
                fragments.append(text)

        return ' '.join(fragments) if fragments else np.nan

    def get_info(self):
        """Collect date, title, link and text for every speech on the index page.

        Prints a progress line every 50 speeches.

        Returns
        -------
        pandas.DataFrame
            Columns ``dates``, ``titles``, ``links`` and ``speeches``; rows whose
            speech text could not be extracted are dropped.
        """
        links_df = self.get_links(self.get_html(self.url))

        speeches = []
        for count, link in enumerate(links_df['links']):
            if count % 50 == 0:
                print(str(count) + ' speeches have been accessed')
            speeches.append(self.get_speech(link))

        # get_speech returns NaN when a page yields no text; drop those rows.
        return links_df.assign(speeches=speeches).dropna(subset=['speeches'])


# Scrape the speech texts

In [3]:
# Index page that lists Barack Obama's speeches on americanrhetoric.com
obama_url = 'https://www.americanrhetoric.com/barackobamaspeeches.htm'
ss = SpeechScraping(url=obama_url)
# Download the index page and every linked speech page. The result has one row per
# speech (dates, titles, links, speeches); speeches whose text was not found are
# left out. A progress line is printed every 50 speeches.
obama = ss.get_info()

0 speeches have been accessed
50 speeches have been accessed
100 speeches have been accessed
150 speeches have been accessed
200 speeches have been accessed
250 speeches have been accessed
300 speeches have been accessed
350 speeches have been accessed
400 speeches have been accessed
450 speeches have been accessed


# Export speeches

In [4]:
# Write the scraped speeches to obama_speech.csv as UTF-8, leaving out the DataFrame index
obama.to_csv('obama_speech.csv', encoding='utf-8',index=False)

In [5]:
# Display the scraped DataFrame (dates, titles, links, speeches)
obama

Unnamed: 0,dates,titles,links,speeches
0,02 Oct 2002,Federal Plaza Address Opposing the War in Iraq,speeches/barackobama/barackobamairaqwarspeechf...,Good afternoon. Let begin by saying that altho...
1,27 Jul 2004,Democratic National Convention Keynote Speech,speeches/convention2004/barackobama2004dnc.htm,"On behalf of the great state of Illinois, cros..."
2,06 Jan 2005,Senate Speech on Ohio Electoral Vote,speeches/barackobama/barackobamasenatespeechon...,"Thank you very much, Mr. President; Ladies and..."
3,04 Jun 2005,Knox College Commencement Speech,speeches/barackobamaknoxcollege.htm,"Good morning President Taylor, Board of Truste..."
4,25 Oct 2005,Senate Speech Honoring the Life of Rosa Parks,speeches/barackobama/barackobamasenatespeechro...,"Mr. President, today the nation mourns a genui..."
...,...,...,...,...
472,12 Jan 2017,Joe Biden Medal of Freedom Presentation Address,speeches/barackobama/barackobamajoebidenmedalo...,"Welcome to the White House, everybody. As I h..."
473,14 Jan 2017,Final Presidential Weekly Address,speeches/barackobama/barackobamafinalweeklyadd...,"This week, I traveled to Chicago to deliver my..."
474,16 Jan 2017,MLB World Series Champion Chicago Cubs WH Visit,speeches/barackobama/barackobamachicagocubsMLB...,They said this day would never come. Here is s...
475,18 Jan 2017,Final Presidential Press Conference,speeches/barackobama/barackobamafinalpressconf...,"Good afternoon, everybody. Let me start out b..."
