Import Libraries

In [3]:
import pandas as pd
import string
import requests
import re
from sklearn.neighbors import KNeighborsClassifier


Define readability functions

In [4]:
def remove_punctuation(s):
    return ''.join(c for c in s if c not in string.punctuation)

def syllablesPerWord(word):
    vowels = 'aeiouy'
    diphthongs = ["oi", "oy", "ou", "ow", "ai", "au", "ay", "aw", "oo", "ie", "ea", "ee"]
    word = word.lower()
    syllable_count = 0
    for i, char in enumerate(word):
        if char in vowels:
            if i > 0 and word[i-1:i+1] in diphthongs:
                continue
            syllable_count += 1
    if word.endswith('e') and len(word) > 1 and word[-2] not in vowels:
        syllable_count -= 1
    if len(word) > 2 and word.endswith("le") and word[-3] not in vowels:
        syllable_count += 1
    return max(1, syllable_count)

def getNumberOfTotalWords(s): return len(remove_punctuation(s).split())
def getNumberOfTotalSentences(s): return sum(s.count(c) for c in '.!?')
def getNumberOfTotalSyllables(s): return sum(syllablesPerWord(w) for w in remove_punctuation(s).split())

def flesch_reading_ease(text):
    words = getNumberOfTotalWords(text)
    sentences = getNumberOfTotalSentences(text)
    syllables = getNumberOfTotalSyllables(text)
    return 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

def flesch_kincaid_grade_level(text):
    words = getNumberOfTotalWords(text)
    sentences = getNumberOfTotalSentences(text)
    syllables = getNumberOfTotalSyllables(text)
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59



Download all of the federalist papers from a outside URL and parse the texts

In [5]:
url = "https://www.gutenberg.org/cache/epub/1404/pg1404.txt"
text = requests.get(url).text



chunks = re.split(r"\nFEDERALIST No\. (\d+)", text)[1:]
essay_data = [(int(chunks[i]), chunks[i+1]) for i in range(0, len(chunks), 2)]
essays_df = pd.DataFrame(essay_data, columns=["Essay_Number", "Text"])

Compute the readability metrics and assign authorship labels

In [6]:
essays_df["Flesch_Reading_Ease"] = essays_df["Text"].apply(flesch_reading_ease)
essays_df["Flesch_Kincaid_Grade"] = essays_df["Text"].apply(flesch_kincaid_grade_level)

def assign_author(n):
    if n in [1,6,7,8,9,11,12,13,15,16,17,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,59,60,61,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85]:
        return 'Hamilton'
    elif n in [10,14,18,19,20,37,38,39,40,41,42,43,44,45,46,47,48]:
        return 'Madison'
    elif n in [2,3,4,5]:
        return 'Jay'
    elif n in range(49,59) or n in [62,63]:
        return 'Disputed'
    else:
        return 'Unknown'

essays_df["Author"] = essays_df["Essay_Number"].apply(assign_author)

Load the 1-gram word frequencies, get the top 20 most frequent words, and merge the text metrics with frequency data

In [7]:

onegram_df = pd.read_csv("lexos_1gram_inALL_prop.csv")
onegram_df = onegram_df[~onegram_df["Unnamed: 0"].isin(["Total", "Average"])].copy()
onegram_df.rename(columns={"Unnamed: 0": "doc_id"}, inplace=True)
onegram_df["Essay_Number"] = onegram_df["doc_id"].str.extract(r"FED_(\d+)_")[0].astype(int)
onegram_df.set_index("Essay_Number", inplace=True)


top_20_words = onegram_df.drop(columns=["doc_id"]).mean().sort_values(ascending=False).head(20).index.tolist()
freqs_df = onegram_df[top_20_words]


full_df = essays_df.merge(freqs_df, left_on="Essay_Number", right_index=True, how="inner")


KNN clasifyer of known authors, and predict authorship

In [8]:
features = ["Flesch_Reading_Ease", "Flesch_Kincaid_Grade"] + top_20_words
train = full_df[full_df["Author"].isin(["Hamilton", "Madison", "Jay"])]
X_train = train[features]
y_train = train["Author"]


disputed = full_df[full_df["Essay_Number"].isin([49,50,51,52,53,54,55,56,57,58,62,63])]
X_test = disputed[features]

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
disputed = disputed.copy()
disputed["Predicted_Author"] = knn.predict(X_test)
disputed.set_index("Essay_Number", inplace=True)


Display the final results

In [9]:
cols = ["Predicted_Author", "Flesch_Reading_Ease", "Flesch_Kincaid_Grade"] + top_20_words
display(disputed[cols].sort_index())

Unnamed: 0_level_0,Predicted_Author,Flesch_Reading_Ease,Flesch_Kincaid_Grade,the,of,to,and,in,a,be,...,which,as,by,this,would,will,or,for,have,not
Essay_Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49,Hamilton,28.430984,16.147942,0.1077,0.0611,0.0351,0.0254,0.0206,0.02,0.0284,...,0.0121,0.0091,0.0091,0.003,0.0133,0.0006,0.006,0.0067,0.003,0.0079
50,Hamilton,27.776467,15.748209,0.0911,0.0541,0.0252,0.0298,0.0252,0.0126,0.028,...,0.0081,0.0099,0.0099,0.0063,0.0099,0.0063,0.0072,0.0054,0.0108,0.009
51,Hamilton,25.16919,17.294727,0.1052,0.0588,0.026,0.0208,0.026,0.0239,0.0328,...,0.0073,0.0146,0.012,0.0068,0.0047,0.012,0.0036,0.0068,0.0016,0.0047
52,Hamilton,25.972167,17.232108,0.1005,0.0546,0.0389,0.02,0.0178,0.0189,0.0249,...,0.0113,0.0086,0.0119,0.0081,0.0043,0.0054,0.0038,0.007,0.0103,0.0059
53,Hamilton,24.573824,17.480552,0.0894,0.059,0.0336,0.0286,0.0207,0.0221,0.0235,...,0.0106,0.0101,0.0143,0.0069,0.0028,0.0115,0.0055,0.0097,0.0032,0.0088
54,Hamilton,29.733111,16.60207,0.1017,0.0584,0.0304,0.019,0.0324,0.0175,0.02,...,0.0125,0.018,0.013,0.0105,0.003,0.0075,0.0045,0.005,0.008,0.009
55,Hamilton,27.964034,17.48999,0.0891,0.0597,0.0382,0.0235,0.0147,0.0235,0.025,...,0.0142,0.0059,0.0069,0.0059,0.0049,0.0098,0.0098,0.0083,0.0029,0.0069
56,Hamilton,26.771506,16.955895,0.0871,0.0712,0.0248,0.0337,0.0197,0.0305,0.0229,...,0.0121,0.0064,0.0064,0.0083,0.0025,0.0184,0.0019,0.0045,0.0051,0.007
57,Hamilton,33.681784,15.443767,0.0976,0.0682,0.0334,0.0244,0.0181,0.0167,0.0194,...,0.0136,0.0108,0.0113,0.0059,0.0027,0.0108,0.0099,0.0086,0.005,0.0068
58,Madison,24.075758,17.873519,0.1019,0.056,0.0292,0.0225,0.0278,0.0244,0.0249,...,0.0129,0.01,0.0105,0.0067,0.0057,0.0177,0.0053,0.0062,0.0086,0.0067


Looks like most of the disputed papers lean toward Hamilton based on this method, with only Paper 58 siding with Madison. That said, this analysis only used readability scores and the top 20 most frequent words — nothing super complex. Still, it’s interesting how even these simple features hint at authorship patterns. It’s definitely not the full story, nor does it agree too strongly with the more widely accepted authorship predictions, but it shows that writing style can be pretty telling, even with just surface-level stats.