In [8]:
def create_speech_df(host, annual_htm_list):
    '''
    Builds a dataframe containing information and links to all of the
    Federal Reserve speeches. This dataframe is later called to scrape the
    actual speeches

    INPUTS:
        host                the host (for the Federal Reserve 'www.federalreserve.gov)
        annual_htm_list     which contains the path to the web sites containing
                             speeches for each year

    OUTPUT:
        df    a dateframe containing the following columns
            ['date]         date of speech
            ['speaker']     speaker
            ['title']       title of speech
            ['link']        link to website with speech text to be scraped
            ['text']        empty column to be populated later with text

    NOTES:
        1. There are two items from 2006 to present that are on the Federal Reserve
            website that are not speeches but reports. These items are removed in this
            function by idenfitying dataframe rows where the speaker is blank

    '''
    all_dates = []
    all_speakers = []
    all_titles = []
    all_links = []
    for item in annual_htm_list:
        date_lst, speaker_lst, title_lst, link_lst =find_speeches_by_year(host,
                                                    item, print_test=False)
        all_dates = all_dates + date_lst
        all_speakers = all_speakers + speaker_lst
        all_titles = all_titles + title_lst
        all_links = all_links + link_lst

    dict1 = {'date': all_dates, 'speaker':all_speakers,
            'title': all_titles, 'link':all_links}
    df = pd.DataFrame.from_dict(dict1)
    #Cleaning up some of the dateframe elemenst to remove brackets
    df['date']=df['date'].str[0]
    #df['date'] = pd.to_datetime(df['date'])
    df['speaker']=df['speaker'].str[0]
    df['title']=df['title'].str[0]
    # creating empty column for documents
    doc = np.zeros_like(df['date'])
    df['text'] = doc

    # removing items that are not speeches. These contain a link that starts with '/pubs/feds'
    delete_these = df[df['link'].str.match('/pubs/feds')].index
    df = df.drop(delete_these)

    # now we need to sort the dataframe so that the most recent period is first
    df.sort_values(by=['date'], ascending = False, inplace = True)
    df.reset_index(drop=True, inplace=True)


In [71]:
def find_speeches_by_year(host, this_url, print_test=False):
    '''
    Takes the host and a url for a given year
    and returns infromation about the speeches and links to the web site
    containing the text of the speeches. This function is used to create
    the list of all web sites that contain the individual speeches that
    need to be scraped.

    INPUTS:
        host        the host (for the Federal Reserve 'www.federalreserve.gov)
        this_url         the path to the speeches for a given year
        print_test  an optional field that will print out summary statistics

    OUTPUT:
        date_lst    list of speech dates
        speaker_lst list of speaker names
        title_lst   list containing titles of speeches
        link_lst    list of htm links to the actual speeches

    NOTES:
        1. There are video links on some of the urls that we need to removed.
            These videos are represented by the 'watchLive' class.

    '''
    conn = HTTPSConnection(host = host)
    conn.request(method='GET', url = this_url)
    resp = conn.getresponse()
    body = resp.read()
    # check that we received the correct response code
    if resp.status != 200:
        print('Error from Web Site! Response code: ', resp.status)
    else:
        soup=BeautifulSoup(body, 'html.parser')
        event_list = soup.find('div', id='article')
        # creating the list of dates, titles, speakers and html articles from web page
        month_lst =[]
        date_lst = []
        link_lst = []

        for row in event_list.find_all('div', class_='row fomc-meeting'):
            tmp_month = [x.text for x in row.find_all('fomc-meeting__month')]
            month_lst.append(tmp_month)
            
            tmp_date= [x.text for x in row.find_all('fomc-meeting__date')]
            date_lst.append(tmp_date)

            #  tmp_speaker = [x.text for x in row.find_all('p', class_='news__speaker')]
            #  speaker_lst.append(tmp_speaker)

            tmp_link = [x.text for x in row.find_all('href')]
            link_lst.append(tmp_link)

            # some of the links include video with the transcript. We are deleteing these here
            #for link in event_list.find_all('a', href=True, class_ = lambda x: x != 'watchLive'):
            #    link_lst.append(link['href'])

        if print_test:
            print('length of months: ', len(month_lst))
            print('length of dates: ', len(date_lst))
            print('length of href: ', len(link_lst))

        return month_lst, date_lst, link_lst


In [151]:
link_list = []
#for row in event_list.find_all('div', class_='row fomc-meeting'):
    #print(row)

for link in event_list.findAll('a', href=True):
    link_list.append(link.get('href'))
    
    
    
    

In [152]:
link_list

['https://www.federalreserve.gov/monetarypolicy/materials/',
 '//www.fedsearch.org/fomc-docs/?advanced_search=true',
 '/foia/fomc/annualreports.htm',
 '/foia/fomc/servicecenter.htm',
 '#18178',
 '#14659',
 '#7744',
 '#7589',
 '#7562',
 '#7537',
 '/monetarypolicy/files/monetary20190130a1.pdf',
 '/newsevents/pressreleases/monetary20190130a.htm',
 '/newsevents/pressreleases/monetary20190130a1.htm',
 '/monetarypolicy/fomcpresconf20190130.htm',
 '/newsevents/pressreleases/monetary20190130b.htm',
 '/newsevents/pressreleases/monetary20190130c.htm',
 '/monetarypolicy/files/fomcminutes20190130.pdf',
 '/monetarypolicy/fomcminutes20190130.htm',
 '/monetarypolicy/files/monetary20190320a1.pdf',
 '/newsevents/pressreleases/monetary20190320a.htm',
 '/newsevents/pressreleases/monetary20190320a1.htm',
 '/monetarypolicy/fomcpresconf20190320.htm',
 '/monetarypolicy/files/fomcprojtabl20190320.pdf',
 '/monetarypolicy/fomcprojtabl20190320.htm',
 '/newsevents/pressreleases/monetary20190320c.htm',
 '/monetary

In [154]:
# clean this link to only include newsevents/pressreleases/monetarypolicy/
print(len(link_list))

259


In [167]:
keep_these = []
for i in range(len(link_list)):
    this_href = link_list[i]
    #print(this_href)
    #print(type(this_href))
    if 'newsevents/pressreleases/' in this_href: 
        keep_these.append(i)

In [171]:
final_links = []
for item in keep_these:
    final_links.append(link_list[item])
print(len(link_list))
print(len(keep_these))
print(len(final_links))

259
76
76


<div class="panel panel-default"><div class="panel-heading"><h4><a id="7537">2014 FOMC Meetings </a></h4></div>
<div class="row fomc-meeting">
<div class="fomc-meeting__month col-xs-5 col-sm-3 col-md-2"><strong>January</strong></div>
<div class="fomc-meeting__date col-xs-4 col-sm-9 col-md-10 col-lg-1">28-29</div>
<div class="col-xs-12 col-md-4 col-lg-2">
<a href="/newsevents/pressreleases/monetary20140129a.htm">Statement</a><br/>
</div>
<div class="col-xs-12 col-md-4 col-lg-3">
</div>
<div class="col-xs-12 col-md-4 col-lg-4 fomc-meeting__minutes">
<strong>Minutes:</strong><br/>
<a href="/monetarypolicy/files/fomcminutes20140129.pdf">PDF</a> | <a href="/monetarypolicy/fomcminutes20140129.htm">HTML</a>
<br/> (Released February 19, 2014)
                    	
                    	
                    	</div>
</div>
<div class="fomc-meeting--shaded row fomc-meeting" style="border-bottom: none;">
<div class="fomc-meeting__month col-xs-5 col-sm-3 col-md-2"><strong>March </strong></div>
<div 

In [81]:
event_list

<div class="col-xs-12 col-sm-8 col-md-9" id="article">
<h3>Meeting calendars, statements, and minutes (2014-2019)</h3>
<!-- Article Area -->
<div class="col-xs-12 col-sm-12 col-md-5 pull-right" id="floatRightRail">
<div class="panel panel-related">
<div class="panel-heading"><h5 class="panel-title text-capitalize">FOMC Search</h5></div>
<div class="panel-body" style="padding-bottom:5px;">
<ul class="panel__list list-unstyled">
<li class="panel__listItem">Search all FOMC materials</li>
<li class="panel__listItem" style="padding-bottom: 20px;">
<form action="//www.fedsearch.org/fomc-docs/search" class="form-inline ng-pristine ng-valid" method="GET">
<div class="input-group">
<input class="form-control" id="fomcsearchbox" maxlength="90" name="text" type="text"/>
<span class="input-group-btn">
<button class="btn" type="submit">
<span class="icon icon--centered icon__sm icon-next"></span>
</button>
</span>
</div>
</form>
</li>
<li class="panel__listItem"><a class="noIcon" href="https://www.

# testing new FOMC statement scraping

In [55]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from http.client import HTTPSConnection
import pickle
from urllib.request import urlopen
import requests
import os

    

In [72]:
l_month, l_date, l_link = find_speeches_by_year(host, prefix, print_test=False)


In [73]:
l_link

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [62]:
soup=BeautifulSoup(body, 'html.parser')
event_list = soup.find('div', id='article')
event_list


<div class="col-xs-12 col-sm-8 col-md-9" id="article">
<h3>Meeting calendars, statements, and minutes (2014-2019)</h3>
<!-- Article Area -->
<div class="col-xs-12 col-sm-12 col-md-5 pull-right" id="floatRightRail">
<div class="panel panel-related">
<div class="panel-heading"><h5 class="panel-title text-capitalize">FOMC Search</h5></div>
<div class="panel-body" style="padding-bottom:5px;">
<ul class="panel__list list-unstyled">
<li class="panel__listItem">Search all FOMC materials</li>
<li class="panel__listItem" style="padding-bottom: 20px;">
<form action="//www.fedsearch.org/fomc-docs/search" class="form-inline ng-pristine ng-valid" method="GET">
<div class="input-group">
<input class="form-control" id="fomcsearchbox" maxlength="90" name="text" type="text"/>
<span class="input-group-btn">
<button class="btn" type="submit">
<span class="icon icon--centered icon__sm icon-next"></span>
</button>
</span>
</div>
</form>
</li>
<li class="panel__listItem"><a class="noIcon" href="https://www.

In [45]:
#https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm
host = 'www.federalreserve.gov'
prefix = '/monetarypolicy/fomccalendars.htm'
this_url = host + prefix
print(this_url)

www.federalreserve.gov/monetarypolicy/fomccalendars.htm


In [31]:
conn = HTTPSConnection(host = host)
conn.request(method='GET', url = prefix)
resp = conn.getresponse()
body = resp.read()
body

b'\xef\xbb\xbf<!doctype html>\n<html lang="en" class="no-js">\n    <head>\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge"/>\n        <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1"/>\n        <meta name="keywords" content="Board of Governors of the Federal Reserve System, Federal Reserve Board of Governors, Federal Reserve Board, Federal Reserve" />\n        <meta name="description" content="The Federal Reserve Board of Governors in Washington DC." />\n        <meta property="og:site_name" content="Board of Governors of the Federal Reserve System"/>\n        <meta property="og:type" content="article" /> \n        <meta property="og:image"  content="" /> \n        <meta name="twitter:card" content="summary" />\n        <meta name="twitter:image" content="" />\n        \r\n\r\n\r\n \n        <title>The Fed - Meeting calendars and information</title>\n        \n    <li

In [33]:
resp.status

200

In [34]:
soup=BeautifulSoup(body, 'html.parser')
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible">
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1" name="viewport"/>
<meta content="Board of Governors of the Federal Reserve System, Federal Reserve Board of Governors, Federal Reserve Board, Federal Reserve" name="keywords"/>
<meta content="The Federal Reserve Board of Governors in Washington DC." name="description"/>
<meta content="Board of Governors of the Federal Reserve System" property="og:site_name"/>
<meta content="article" property="og:type"/>
<meta content="" property="og:image"/>
<meta content="summary" name="twitter:card"/>
<meta content="" name="twitter:image"/>
<title>The Fed - Meeting calendars and information</title>
<link href="/css/bootstrap.css" rel="stylesheet" type="text/css"/>
<link href="/css/bluesteel-theme.css" rel="stylesheet" type="text/css"/>
<script src="/js/mod

In [27]:
resp.status

400

In [22]:
body

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">\r\n<HTML><HEAD><TITLE>Bad Request</TITLE>\r\n<META HTTP-EQUIV="Content-Type" Content="text/html; charset=us-ascii"><script type="text/javascript" >\n  (function(i,s,o,g,r,a,m){i[\'GoogleAnalyticsObject\']=r;i[r]=i[r]||function(){\n  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),\n  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)\n  })(window,document,\'script\',\'//www.google-analytics.com/analytics.js\',\'ga\');\n  ga(\'create\', \'UA-35121701-1\', \'federalreserve.gov\', {\'cookieExpires\': 0});\n  ga(\'set\', \'anonymizeIp\', true);\n  ga(\'send\', \'pageview\');\n</script></head>\r\n<BODY><h2>Bad Request - Invalid URL</h2>\r\n<hr><p>HTTP Error 400. The request URL is invalid.</p>\r\n<script type="text/javascript" src="/resources/track_downloads.js"></script>\n</body></HTML>\r\n'