### Import requirements

In [1]:
%matplotlib inline

import pandas as pd
pd.set_option('chained_assignment',None)
import numpy as np
from collections import Counter
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

### Scrape text with BeautifulSoup

In [2]:
# queries the main url
url = 'https://www.congress.gov/resources/display/content/The+Federalist+Papers'
# a timeout keeps the cell from hanging forever on an unresponsive server
page = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# parses the text information
# the first "wiki-content" div holds every essay; each essay starts with a
# '|| Federalist No.' marker, so the chunk before the first marker is skipped
text_block = soup.find_all("div", class_="wiki-content")[0].get_text()
all_texts = []
for string in text_block.split('|| Federalist No.')[1:]:
    # normalise non-breaking spaces to plain spaces
    string = string.replace(u'\xa0', u' ')
    # keep only the essay body: everything after the salutation ...
    string = string.split('To the People of the State of New York:')[1]
    # ... and before the navigation link / the closing signature
    string = string.split('↑ Back to Top')[0]
    string = string.split('PUBLIUS')[0]
    all_texts.append(string)

# extracts table information
# the table cells arrive as one flat sequence; every fifth cell closes a row
all_rows, row, count = [], [], 0
for cell in soup.find_all("td", class_="confluenceTd"):
    count += 1
    row.append(cell.get_text())
    if count % 5 == 0:
        # row is complete: store it and start collecting the next one
        all_rows.append(row)
        row = []

### Store info in dataframe

In [3]:
# every table row must pair with exactly one scraped essay; fail early with a
# clear message if the two scraping passes disagree
if len(all_rows) != len(all_texts):
    raise ValueError(
        'scraped %d table rows but %d essay texts' % (len(all_rows), len(all_texts))
    )

df = pd.DataFrame(all_rows, columns=['No.','Title','Author','Publication','Date'])

# add text and text length columns
df['Text'] = all_texts
# .copy() makes the column subset an independent frame, so adding 'Length'
# below is not a write into a slice of the wider frame
df = df[['No.','Title','Author','Text']].copy()
# length is measured in characters of the essay text
df['Length'] = df['Text'].apply(len)

display(df.head())

Unnamed: 0,No.,Title,Author,Text,Length
0,1,General Introduction,Hamilton,AFTER an unequivocal experience of the ineffic...,9280
1,2,Concerning Dangers from Foreign Force and Infl...,Jay,WHEN the people of America reflect that they a...,10004
2,3,The Same Subject Continued: Concerning Dangers...,Jay,IT IS not a new observation that the people of...,8651
3,4,The Same Subject Continued: Concerning Dangers...,Jay,MY LAST paper assigned several reasons why the...,9621
4,5,The Same Subject Continued: Concerning Dangers...,Jay,"QUEEN ANNE, in her letter of the 1st July, 170...",8176


### Train/test split
Many of the Federalist Papers are multi-part installments or continuations of earlier subjects. This could cause problems for train/test splitting, because we want our classifier to identify authorship rather than similar thematic content. Accordingly, I selected split indices that ensure continuations of the same subject are not assigned to both the training and testing sets.

In [4]:
print(Counter(df.Author))

# select according to author
# only exact matches are kept; rows with any other Author value
# (e.g. 'Hamilton or Madison') are not selected
df_ham = df[df['Author'] == 'Hamilton'].reset_index(drop=True)
df_mad = df[df['Author'] == 'Madison'].reset_index(drop=True)
df_jay = df[df['Author'] == 'Jay'].reset_index(drop=True)

print(df_ham.shape, df_mad.shape, df_jay.shape)

# assign first 30 papers from Hamilton to training, the rest to testing
df_ham_train = df_ham.iloc[:30].reset_index(drop=True)
df_ham_test = df_ham.iloc[30:].reset_index(drop=True)

# assign first 10 papers from Madison to training, the rest to testing
df_mad_train = df_mad.iloc[:10].reset_index(drop=True)
df_mad_test = df_mad.iloc[10:].reset_index(drop=True)

# assign first 4 papers from Jay to training, the rest to testing
df_jay_train = df_jay.iloc[:4].reset_index(drop=True)
df_jay_test = df_jay.iloc[4:].reset_index(drop=True)

Counter({'Hamilton': 51, 'Madison': 15, 'Hamilton or Madison': 11, 'Jay': 5, 'Hamilton and Madison': 3})
(51, 5) (15, 5) (5, 5)


### Expand dataset into sentences

In [5]:
def split_sentence(df):
    """Break every text in ``df['Text']`` into period-delimited fragments.

    Each text is split on '.', each fragment is stripped of surrounding
    whitespace, and only fragments longer than 10 characters are kept.

    Returns a flat list of the kept fragments, in document order.
    """
    stripped = (
        piece.strip()
        for text in df['Text']
        for piece in text.split('.')
    )
    return [fragment for fragment in stripped if len(fragment) > 10]

def expand_df(df):
    """Build a sentence-level frame from a paper-level frame.

    Parameters
    ----------
    df : DataFrame
        Must have 'Text' and 'Author' columns. Every output row is labelled
        with the author of the FIRST row, so callers should pass papers by
        a single author.

    Returns
    -------
    DataFrame
        Columns 'sentence' and 'author', one row per fragment returned by
        ``split_sentence``. Empty (with both columns) when ``df`` has no rows.
    """
    sentences = split_sentence(df)
    if len(df) == 0:
        # no papers means no author to read; return an empty, well-formed frame
        return pd.DataFrame(columns=['sentence', 'author'])
    # positional lookup: works whatever the index labels are
    author = df['Author'].iloc[0]
    # construct the frame in one go; the scalar author is broadcast to all rows
    return pd.DataFrame({'sentence': sentences, 'author': author})

# expand dataframe to sentence-level
# training papers first, then testing papers, one frame per author
(df_ham_train_passages,
 df_mad_train_passages,
 df_jay_train_passages) = map(expand_df, (df_ham_train, df_mad_train, df_jay_train))
(df_ham_test_passages,
 df_mad_test_passages,
 df_jay_test_passages) = map(expand_df, (df_ham_test, df_mad_test, df_jay_test))

### Combine and save

In [6]:
# concatenate into training and testing sets
# ignore_index=True renumbers the rows 0..n-1; without it each author's frame
# keeps its own 0-based labels and the combined index contains duplicates
df_train = pd.concat(
    [df_ham_train_passages, df_mad_train_passages, df_jay_train_passages],
    ignore_index=True,
)
df_test = pd.concat(
    [df_ham_test_passages, df_mad_test_passages, df_jay_test_passages],
    ignore_index=True,
)

# sentence counts per author in each set
print(Counter(df_train['author']))
print(Counter(df_test['author']))

# index=False: only the 'sentence' and 'author' columns are written
df_train.to_csv('../data/train.csv', index=False)
df_test.to_csv('../data/test.csv', index=False)

Counter({'Hamilton': 1597, 'Madison': 787, 'Jay': 145})
Counter({'Hamilton': 1384, 'Madison': 326, 'Jay': 55})


### Balanced data

In [21]:
# draw equally sized Hamilton and Madison samples (Jay is not included here);
# the fixed random_state makes the draw repeatable
df_ham_train_750 = df_ham_train_passages.sample(n=750, random_state=1).reset_index(drop=True)
df_mad_train_750 = df_mad_train_passages.sample(n=750, random_state=1).reset_index(drop=True)
df_ham_test_300 = df_ham_test_passages.sample(n=300, random_state=1).reset_index(drop=True)
df_mad_test_300 = df_mad_test_passages.sample(n=300, random_state=1).reset_index(drop=True)

# ignore_index=True gives the combined frames a unique 0..n-1 index instead of
# two overlapping 0-based ranges
df_train_balanced = pd.concat([df_ham_train_750, df_mad_train_750], ignore_index=True)
df_test_balanced = pd.concat([df_ham_test_300, df_mad_test_300], ignore_index=True)

print(Counter(df_train_balanced['author']))
print(Counter(df_test_balanced['author']))

# index=False: only the 'sentence' and 'author' columns are written
df_train_balanced.to_csv('../data/train_balanced.csv', index=False)
df_test_balanced.to_csv('../data/test_balanced.csv', index=False)

Counter({'Hamilton': 750, 'Madison': 750})
Counter({'Hamilton': 300, 'Madison': 300})
