# Notebook 03 - Create Datasets

### Import requirements

In [1]:
from nltk import tokenize
from collections import Counter
import pandas as pd
# Setting the chained-assignment option to None turns off pandas'
# chained-assignment (SettingWithCopy) check for the rest of the notebook.
pd.set_option('chained_assignment',None)

### Read in raw data

In [2]:
# Load one document-level CSV per author (Madison, Hamilton, Jay),
# in that order, and bind each resulting dataframe to its own name.
df_mad, df_ham, df_jay = map(
    pd.read_csv,
    (
        "../raw_data/Madison.csv",
        "../raw_data/Hamilton.csv",
        "../raw_data/Jay.csv",
    ),
)

### Helper functions

In [3]:
def convert_to_text_blocks(dataframe, block_size):
    """
    Splits documents into blocks of sentences and returns as a dataframe.

    Each document's text is sentence-tokenized and cut into consecutive,
    non-overlapping runs of exactly `block_size` sentences. Sentences left
    over at the end of a document that do not fill a whole block are
    dropped, so a document with fewer than `block_size` sentences
    contributes no rows.

    Parameters:
        dataframe: a dataframe containing document-level text; it must
        have the columns 'No.', 'Author' and 'Text'
        block_size: the number of consecutive sentences per block
        (must be at least 1)
    Returns:
        dataframe_text_blocks: a new dataframe containing text at
        the level of blocks of text rather than document-level, with
        the columns 'No.', 'Author' and 'Text Block'
    Raises:
        ValueError: if block_size is smaller than 1
    """
    # reject sizes that cannot produce blocks instead of silently
    # returning an empty result (negative) or failing mid-loop (zero)
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    # initialize output data dictionary
    block_dict = {'No.': [], 'Author': [], 'Text Block': []}

    # iterate over the column values themselves rather than looking rows up
    # by a 0..n-1 label, so the result does not depend on the frame's index
    documents = zip(dataframe['No.'], dataframe['Author'], dataframe['Text'])
    for no, author, text in documents:
        sentence_list = tokenize.sent_tokenize(text)
        # one past the last start position that still leaves room for a
        # full block; any trailing partial block is skipped
        stop = len(sentence_list) - block_size + 1
        for start in range(0, stop, block_size):
            doc_block = ' '.join(sentence_list[start:start + block_size])
            # collect information
            block_dict['No.'].append(no)
            block_dict['Author'].append(author)
            block_dict['Text Block'].append(doc_block)

    # create output dataframe
    dataframe_text_blocks = pd.DataFrame(block_dict)
    return dataframe_text_blocks

### Train/Test split
Many of the Federalist Papers are multi-part installments or continuations of earlier subjects. This could cause problems when it comes to train/test splitting because we want our classifier to identify authorship and not similar thematic content. Accordingly, we selected split indices that ensure continuations of the same subject are not assigned to both the training and testing sets.

This approach is only to be used for the first version of the model. Later approaches will use additional writing samples scraped from the National Archives in order to make authorship predictions on the Federalist Papers.

### Split data at document level

In [4]:
# Each author's frame is cut at a fixed row position; the leading rows go to
# training and the remainder to testing. reset_index(drop=True) renumbers the
# rows from 0 without creating a temporary 'index' column.

# assign first 20 papers from Madison to training
df_mad_train = df_mad.iloc[:20].reset_index(drop=True)
df_mad_test = df_mad.iloc[20:].reset_index(drop=True)

# assign first 32 papers from Hamilton to training
df_ham_train = df_ham.iloc[:32].reset_index(drop=True)
df_ham_test = df_ham.iloc[32:].reset_index(drop=True)

# assign first 3 papers from Jay to training
df_jay_train = df_jay.iloc[:3].reset_index(drop=True)
df_jay_test = df_jay.iloc[3:].reset_index(drop=True)

# concatenate into training and testing datasets
# (ignore_index=True gives the combined frames a fresh 0..n-1 index)
train_df_list = [df_mad_train, df_ham_train, df_jay_train]
test_df_list = [df_mad_test, df_ham_test, df_jay_test]
df_train_docs = pd.concat(train_df_list, axis=0, ignore_index=True)
df_test_docs = pd.concat(test_df_list, axis=0, ignore_index=True)
print("Training Set Documents:", df_train_docs.shape[0])
print("Testing Set Documents:", df_test_docs.shape[0])

Training Set Documents: 55
Testing Set Documents: 30


### Convert to sentence blocks

In [5]:
# Break every document into blocks of 5 consecutive sentences.
df_train_blocks = convert_to_text_blocks(df_train_docs, 5)
df_test_blocks = convert_to_text_blocks(df_test_docs, 5)

# Summarize the training data first, then the testing data: overall shape,
# number of blocks per author, and a preview of the first rows.
for block_frame in (df_train_blocks, df_test_blocks):
    print(block_frame.shape)
    print(Counter(block_frame['Author']))
    display(block_frame.head())

(578, 3)
Counter({'Hamilton': 302, 'Madison': 261, 'Jay': 15})


Unnamed: 0,No.,Author,Text Block
0,10,Madison,AMONG the numerous advantages promised by a we...
1,10,Madison,Complaints are everywhere heard from our most ...
2,10,Madison,But it could not be less folly to abolish libe...
3,10,Madison,From the protection of different and unequal f...
4,10,Madison,"Those who are creditors, and those who are deb..."


(336, 3)
Counter({'Hamilton': 216, 'Madison': 107, 'Jay': 13})


Unnamed: 0,No.,Author,Text Block
0,52,Madison,FROM the more general inquiries pursued in the...
1,52,Madison,To have left it open for the occasional regula...
2,52,Madison,"The qualifications of the elected, being less ..."
3,52,Madison,First. As it is essential to liberty that the ...
4,52,Madison,"The scheme of representation, as a substitute ..."


### Save dataframes to files

In [6]:
# Write both block-level datasets to CSV, leaving the row index out of the files.
for block_frame, csv_path in (
    (df_train_blocks, "../processed_data/train_blocks.csv"),
    (df_test_blocks, "../processed_data/test_blocks.csv"),
):
    block_frame.to_csv(csv_path, index=False)