# Notebook 01 - Scrape Federalist Papers

### Import requirements

In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('chained_assignment',None)

### Global variables

In [2]:
# Page that is downloaded and parsed for both the index table and the paper texts.
url = 'https://www.congress.gov/resources/display/content/The+Federalist+Papers'

# Authors for whom main() writes a separate CSV file.
author_names = [
    'Madison',
    'Hamilton',
    'Jay',
]

# Directory the CSV files are written to.
raw_data_path = '../raw_data/'

### Helper functions

In [3]:
def get_soup_from_url(url, timeout=30):
    """
    Query a given url and return the page content as BeautifulSoup object.
    Parameters:
        url: the url to be scraped
        timeout: seconds to wait for the server before giving up (default 30)
    Returns:
        soup: a BeautifulSoup object
    Raises:
        requests.exceptions.RequestException: if the connection fails, the
            request times out, or the server answers with a 4xx/5xx status
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    page = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error rather than handing an error page to the parsers.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

def parse_federalist_text(soup):
    """
    Extract the text information based on known structure of Federalist papers.

    The text of the first "wiki-content" div is cut at every
    '|| Federalist No.' heading (whatever precedes the first heading is
    discarded). From each piece, the text after the first
    'To the People of the State of New York:' salutation is kept, truncated
    at the first '↑ Back to Top' link or 'PUBLIUS' signature, whichever
    comes earlier.
    Parameters:
        soup: a BeautifulSoup object
    Returns:
        all_texts: a list of the scraped texts, one entry per heading found
    Raises:
        IndexError: if the page has no "wiki-content" div, or if a piece
            does not contain the salutation line
    """
    text_block = soup.find_all("div", class_="wiki-content")[0].get_text()
    all_texts = []
    for section in text_block.split('|| Federalist No.')[1:]:
        # Replace non-breaking spaces with plain spaces before the marker
        # strings below are searched for.
        section = section.replace(u'\xa0', u' ')
        # Keep what follows the salutation; [1] raises IndexError if it is absent.
        body = section.split('To the People of the State of New York:')[1]
        # Drop the trailing navigation link and the signature.
        body = body.split('↑ Back to Top')[0]
        body = body.split('PUBLIUS')[0]
        all_texts.append(body)
    return all_texts

def extract_federalist_table(soup, columns_per_row=5):
    """
    Extract table information and return as list object.

    The text of every <td class="confluenceTd"> cell is collected in document
    order and grouped into consecutive rows of `columns_per_row` cells.
    Cells left over after the last complete row are dropped.
    Parameters:
        soup: a BeautifulSoup object
        columns_per_row: number of cells that make up one table row (default 5)
    Returns:
        all_rows: a list of lists containing table information
    Raises:
        ValueError: if columns_per_row is smaller than 1
    """
    if columns_per_row < 1:
        raise ValueError("columns_per_row must be at least 1")
    cell_texts = [cell.get_text() for cell in soup.find_all("td", class_="confluenceTd")]
    # Only iterate over complete rows; a trailing partial row is ignored.
    complete_cells = len(cell_texts) - len(cell_texts) % columns_per_row
    all_rows = [
        cell_texts[start:start + columns_per_row]
        for start in range(0, complete_cells, columns_per_row)
    ]
    return all_rows

def federalist_data_to_dataframe(all_rows, all_texts):
    """
    Combine the scraped table rows and paper texts into one pandas dataframe.

    Adds a 'Text' column holding the texts and a 'Length' column holding
    len() of each text. Any 'Author' value that contains 'Madison' is
    replaced by plain 'Madison'.
    Parameters:
        all_rows: a list of lists containing table information
        all_texts: a list of the scraped texts
    Returns:
        federalist_dataframe: a pandas dataframe combining the two data sources
    """
    def _collapse_to_madison(author):
        # Every author entry that mentions Madison is reduced to 'Madison'.
        if 'Madison' in author:
            return 'Madison'
        return author

    column_names = ['No.', 'Title', 'Author', 'Publication', 'Date']
    frame = pd.DataFrame(all_rows)
    frame.columns = column_names
    frame['Text'] = all_texts
    frame['Length'] = frame['Text'].apply(len)
    frame['Author'] = frame['Author'].apply(_collapse_to_madison)
    return frame

def select_papers_by_author(federalist_dataframe, author_name):
    """
    Return the rows of the dataframe whose 'Author' equals the given name.
    Parameters:
        federalist_dataframe: a pandas dataframe with an 'Author' column
        author_name: the author to select (exact match)
    Returns:
        author_dataframe: a new dataframe containing only that author's rows,
            re-indexed from 0
    """
    is_author = federalist_dataframe['Author'] == author_name
    # drop=True discards the old index directly, so this works for a named
    # index and never touches a data column that happens to be called 'index'.
    author_dataframe = federalist_dataframe[is_author].reset_index(drop=True)
    return author_dataframe

def save_to_path(author_dataframe, filename, raw_data_path):
    """
    Write a dataframe to "<raw_data_path>/<filename>.csv" without its index.
    Parameters:
        author_dataframe: the pandas dataframe to save
        filename: file name without the ".csv" extension
        raw_data_path: directory to write into, with or without a trailing
            path separator
    """
    # Join with pathlib rather than string concatenation so that a directory
    # given without a trailing separator still yields a file inside it.
    save_path = Path(raw_data_path) / (filename + ".csv")
    author_dataframe.to_csv(save_path, index=False)

### Main function

In [4]:
def main(url, author_names, raw_data_path):
    """
    Scrape the papers, then save the combined table and one CSV per author.
    Parameters:
        url: the page to scrape
        author_names: authors for which a separate CSV file is written
        raw_data_path: location the CSV files are saved to
    """
    page_soup = get_soup_from_url(url)
    paper_texts = parse_federalist_text(page_soup)
    table_rows = extract_federalist_table(page_soup)
    combined = federalist_data_to_dataframe(table_rows, paper_texts)
    save_to_path(combined, "federalist", raw_data_path)

    # One file per author of interest, with the paper number stored as an integer.
    for name in author_names:
        subset = select_papers_by_author(combined, name)
        subset["No."] = pd.to_numeric(subset["No."], downcast='integer')
        print(name, subset.shape)
        save_to_path(subset, name, raw_data_path)

    print("Complete")

### Output

In [5]:
# Run the scrape; main() saves "federalist.csv" and one CSV per author.
main(url, author_names, raw_data_path)

# Read the per-author files back from the directory that was passed to main(),
# so the paths stay in sync if raw_data_path is changed.
df_mad = pd.read_csv(Path(raw_data_path) / "Madison.csv")
df_ham = pd.read_csv(Path(raw_data_path) / "Hamilton.csv")
df_jay = pd.read_csv(Path(raw_data_path) / "Jay.csv")

# Show the first rows of each author's table in the notebook.
display(df_mad.head())
display(df_ham.head())
display(df_jay.head())

Madison (29, 7)
Hamilton (51, 7)
Jay (5, 7)
Complete


Unnamed: 0,No.,Title,Author,Publication,Date,Text,Length
0,10,The Same Subject Continued: The Union as a Saf...,Madison,From the New York Packet,"Friday, November 23, 1787",AMONG the numerous advantages promised by a we...,17835
1,14,Objections to the Proposed Constitution from E...,Madison,From the New York Packet,"Friday, November 30, 1787","WE HAVE seen the necessity of the Union, as ou...",12641
2,18,The Same Subject Continued: The Insufficiency ...,Madison,For the Independent Journal,- -,"AMONG the confederacies of antiquity, the most...",12831
3,19,The Same Subject Continued: The Insufficiency ...,Madison,For the Independent Journal,- -,"THE examples of ancient confederacies, cited i...",12487
4,20,The Same Subject Continued: The Insufficiency ...,Madison,From the New York Packet,"Tuesday, December 11, 1787",THE United Netherlands are a confederacy of re...,9570


Unnamed: 0,No.,Title,Author,Publication,Date,Text,Length
0,1,General Introduction,Hamilton,For the Independent Journal,- -,AFTER an unequivocal experience of the ineffic...,9280
1,6,Concerning Dangers from Dissensions Between th...,Hamilton,For the Independent Journal,- -,THE three last numbers of this paper have been...,11822
2,7,The Same Subject Continued: Concerning Dangers...,Hamilton,For the Independent Journal,- -,"IT IS sometimes asked, with an air of seeming ...",13735
3,8,The Consequences of Hostilities Between the St...,Hamilton,From the New York Packet,"Tuesday, November 20, 1787",ASSUMING it therefore as an established truth ...,12065
4,9,The Union as a Safeguard Against Domestic Fact...,Hamilton,For the Independent Journal,- -,A FIRM Union will be of the utmost moment to t...,11925


Unnamed: 0,No.,Title,Author,Publication,Date,Text,Length
0,2,Concerning Dangers from Foreign Force and Infl...,Jay,For the Independent Journal,- -,WHEN the people of America reflect that they a...,10004
1,3,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,IT IS not a new observation that the people of...,8651
2,4,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,MY LAST paper assigned several reasons why the...,9621
3,5,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,"QUEEN ANNE, in her letter of the 1st July, 170...",8176
4,64,The Powers of the Senate,Jay,From the New York Packet,"Friday, March 7, 1788","IT IS a just and not a new observation, that e...",13466
