In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Turn off pandas' chained-assignment check (the option is set to None).
pd.options.mode.chained_assignment = None

In [2]:
def get_soup_from_url(url, timeout=30):
    """
    Query a given url and return the page content as BeautifulSoup object.
    Parameters:
        url: the url to be scraped
        timeout: seconds to wait for the server before giving up (default 30)
    Returns:
        soup: a BeautifulSoup object
    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the requested document.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

### Extract the text at paragraph level

In [3]:
url = "https://www.congress.gov/resources/display/content/The+Federalist+Papers"
soup = get_soup_from_url(url)

# All <p> elements live inside the "wiki-content" container; fail with a clear
# message if the page layout changed and the container is gone.
content_div = soup.find("div", {"class": "wiki-content"})
if content_div is None:
    raise ValueError('No <div class="wiki-content"> found at ' + url)
elements = content_div.find_all('p')

# `start` / `stop` hold the positions (within `elements`) of the paragraphs
# containing the salutation and the "Back to Top" marker, respectively.
start, stop = [], []
text_collector = []
for position, element in enumerate(elements):
    # Concatenate every text node under the paragraph, nested tags included.
    text = ''.join(element.find_all(string=True))
    text_collector.append(text)
    if 'To the People of the State of New York:' in text:
        start.append(position)
    elif 'Back to Top' in text:
        stop.append(position)

# One row per <p>; the row position equals the index used in start/stop.
df_text = pd.DataFrame({'Text': text_collector})

# Leading characters that mark a paragraph as a note (presumably numbered
# footnotes -- verify against the page). '0' is deliberately not included.
NOTE_LEADING_DIGITS = list('123456789')


def _split_paper_paragraphs(paper_df):
    """
    Reorder the paragraphs of one paper as body, 'PUBLIUS.' line, then notes.
    Parameters:
        paper_df: slice of df_text (column 'Text') covering a single paper
    Returns:
        para_df: DataFrame with a single 'Text' column and a fresh 0..n-1 index
    """
    # First whitespace-separated token of each paragraph; NaN when the
    # paragraph is empty or whitespace-only (indexing split()[0] directly
    # would raise IndexError on those).
    first_token = paper_df['Text'].str.split().str[0]
    has_text = first_token.notna()
    is_signature = first_token.eq('PUBLIUS.')
    is_note = first_token.str[0].isin(NOTE_LEADING_DIGITS)

    body_df = paper_df[has_text & ~is_signature & ~is_note]
    notes_df = paper_df[is_note]
    # The scraped signature paragraph is dropped above and replaced by one
    # normalized 'PUBLIUS.' row placed between body and notes.
    publius_df = pd.DataFrame({'Text': ['PUBLIUS.']})
    return pd.concat([body_df, publius_df, notes_df], ignore_index=True)


# start[k] and stop[k] are paired positionally; rows from start (inclusive)
# to stop (exclusive) are treated as one paper.
all_para_dfs = [
    _split_paper_paragraphs(df_text.iloc[first:last, :])
    for first, last in zip(start, stop)
]

### Import federalist df

In [4]:
# Read the per-paper table and store the "No." column with the smallest
# integer dtype that can hold its values.
df_fed = pd.read_csv('../raw_data/federalist.csv').assign(
    **{"No.": lambda frame: pd.to_numeric(frame["No."], downcast='integer')}
)
df_fed.head()

Unnamed: 0,No.,Title,Author,Publication,Date,Text,Length
0,1,General Introduction,Hamilton,For the Independent Journal,- -,AFTER an unequivocal experience of the ineffic...,9280
1,2,Concerning Dangers from Foreign Force and Infl...,Jay,For the Independent Journal,- -,WHEN the people of America reflect that they a...,10004
2,3,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,IT IS not a new observation that the people of...,8651
3,4,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,MY LAST paper assigned several reasons why the...,9621
4,5,The Same Subject Continued: Concerning Dangers...,Jay,For the Independent Journal,- -,"QUEEN ANNE, in her letter of the 1st July, 170...",8176


In [5]:
# Column-wise collectors for the paragraph-level table.
no_list, title_list, author_list, paragaph_list = [], [], [], []

# The k-th frame in all_para_dfs is matched with the row labelled k in df_fed,
# so both must be in the same paper order.
for paper_idx, para_df in enumerate(all_para_dfs):
    n_rows = len(para_df)
    # Repeat the paper-level metadata once per paragraph of that paper.
    for column, collector in (('No.', no_list),
                              ('Title', title_list),
                              ('Author', author_list)):
        collector.extend([df_fed[column][paper_idx]] * n_rows)
    paragaph_list.extend(list(para_df['Text']))

df_fed_paragaph = pd.DataFrame({
    'No.': no_list,
    'Title': title_list,
    'Author': author_list,
    'ParaText': paragaph_list,
})
df_fed_paragaph.head()

### Save the paragraph-level dataframe
# Write the paragraph-level table to disk without the row index.
output_csv_path = "../processed_data/fed_paragraphs.csv"
df_fed_paragaph.to_csv(output_csv_path, index=False)