# Notebook 03 - Create Datasets

### Import requirements

In [14]:
from nltk import tokenize
import pandas as pd
pd.set_option('chained_assignment',None)

### Read in raw data

In [10]:
df_mad = pd.read_csv("../raw_data/Madison.csv")
df_ham = pd.read_csv("../raw_data/Hamilton.csv")
df_jay = pd.read_csv("../raw_data/Jay.csv")

### Train/Test split
Many of the Federalist Papers are multi-part installments or continuations of earlier subjects. This could cause problems when it comes to train/test splitting because we want our classifier to identify authorship and not similar thematic content. Accordingly, selected the indices for splitting the datasets that ensure that continuations of the same subject were not assigned to both training and testing sets.

This approach is only to be used for the first version of the model. Later approaches will use additional writing samples scraped from the National Archives in order ot make authorship predictions on the Federalist Papers.

In [40]:
# assign first 20 papers from Madison to training
df_mad_train = df_mad.iloc[:20,:].reset_index().drop(['index'], axis=1)
df_mad_test = df_mad.iloc[20:,:].reset_index().drop(['index'], axis=1)

# assign first 32 papers from Hamilton to training
df_ham_train = df_ham.iloc[:32,:].reset_index().drop(['index'], axis=1)
df_ham_test = df_ham.iloc[32:,:].reset_index().drop(['index'], axis=1)

# assign first 3 papers from Jay to training
df_jay_train = df_jay.iloc[:3,:].reset_index().drop(['index'], axis=1)
df_jay_test = df_jay.iloc[3:,:].reset_index().drop(['index'], axis=1)

train_df_list = [df_mad_train, df_ham_train, df_jay_train]
test_df_list = [df_mad_test, df_ham_test, df_jay_test]
df_train = pd.concat(train_df_list, axis=0).reset_index().drop(['index'], axis=1)
df_test = pd.concat(test_df_list, axis=0).reset_index().drop(['index'], axis=1)
print(df_train.shape)
print(df_test.shape)

(55, 7)
(30, 7)


In [42]:
df_train.head()

Unnamed: 0,No.,Title,Author,Publication,Date,Text,Length
0,10,The Same Subject Continued: The Union as a Saf...,Madison,From the New York Packet,"Friday, November 23, 1787",AMONG the numerous advantages promised by a we...,17835
1,14,Objections to the Proposed Constitution from E...,Madison,From the New York Packet,"Friday, November 30, 1787","WE HAVE seen the necessity of the Union, as ou...",12641
2,18,The Same Subject Continued: The Insufficiency ...,Madison,For the Independent Journal,- -,"AMONG the confederacies of antiquity, the most...",12831
3,19,The Same Subject Continued: The Insufficiency ...,Madison,For the Independent Journal,- -,"THE examples of ancient confederacies, cited i...",12487
4,20,The Same Subject Continued: The Insufficiency ...,Madison,From the New York Packet,"Tuesday, December 11, 1787",THE United Netherlands are a confederacy of re...,9570


### Convert to sentence blocks

In [86]:
print(i)

54


In [84]:
df_train.iloc[i,:]

No.                                                            4
Title          The Same Subject Continued: Concerning Dangers...
Author                                                       Jay
Publication                          For the Independent Journal
Date                                                         - -
Text           MY LAST paper assigned several reasons why the...
Length                                                      9621
Name: 54, dtype: object

In [80]:
#iterate through all documents
for i in range(df_train.shape[0]):   
    
    sentence_list = tokenize.sent_tokenize(df_train['Text'][i])
    sent_count = len(sentence_list)
    
    print(df_train['Author'][i], sent_count)
    for j in range(0, sent_count, 5):
        idx1 = j
        idx2 = j + 5
        if idx2 < sent_count:
            document_chunk_idx = [n for n in range(idx1,idx2)]
            print(document_chunk_idx)

Madison 66
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
[40, 41, 42, 43, 44]
[45, 46, 47, 48, 49]
[50, 51, 52, 53, 54]
[55, 56, 57, 58, 59]
[60, 61, 62, 63, 64]
Madison 45
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
Madison 69
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
[40, 41, 42, 43, 44]
[45, 46, 47, 48, 49]
[50, 51, 52, 53, 54]
[55, 56, 57, 58, 59]
[60, 61, 62, 63, 64]
Madison 48
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
[40, 41, 42, 43, 44]
Madison 33
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21,

Hamilton 54
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
[40, 41, 42, 43, 44]
[45, 46, 47, 48, 49]
Hamilton 31
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
Hamilton 54
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
[40, 41, 42, 43, 44]
[45, 46, 47, 48, 49]
Hamilton 44
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
[25, 26, 27, 28, 29]
[30, 31, 32, 33, 34]
[35, 36, 37, 38, 39]
Jay 30
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]
[20, 21, 22, 23, 24]
Jay 20
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
Jay 25
[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13, 14]
[15, 16, 17, 18, 19]


### Testing scripts/functions

In [29]:
from nltk import tokenize

sentences_per_document = []
sentence_lengths_per_document = []
for i in range(df_mad.shape[0]):  
    sentence_list = tokenize.sent_tokenize(df_mad['Text'][i])
    sentences_per_document.append(sentence_list)
    sentence_lengths_per_document.append(len(sentence_list))
    
print(min(sentence_lengths_per_document), max(sentence_lengths_per_document))
print(sentence_lengths_per_document)

33 129
[66, 45, 69, 48, 33, 59, 102, 69, 73, 129, 69, 114, 70, 51, 69, 80, 51, 55, 37, 61, 58, 67, 62, 54, 45, 71, 63, 65, 65]
