In [1]:
import pandas as pd

In [2]:
# Location of the Federalist Papers corpus, relative to the notebook.
DATA_PATH = "federalist.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# 1. Store the author label as a pandas categorical, then preview the frame.
df = df.astype({'author': 'category'})
df.head(n=5)

Unnamed: 0,author,text
0,HAMILTON,FEDERALIST. No. 1 General Introduction For the...
1,JAY,FEDERALIST No. 2 Concerning Dangers from Forei...
2,JAY,FEDERALIST No. 3 The Same Subject Continued (C...
3,JAY,FEDERALIST No. 4 The Same Subject Continued (C...
4,JAY,FEDERALIST No. 5 The Same Subject Continued (C...


In [4]:
# Number of papers attributed to each author label (most frequent first).
author_counts = df.author.value_counts()
author_counts

HAMILTON                49
MADISON                 15
HAMILTON OR MADISON     11
JAY                      5
HAMILTON AND MADISON     3
Name: author, dtype: int64

In [5]:
# 2. Features are the raw paper text; the target is the author label.
x, y = df['text'], df['author']

In [6]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=1234,
)

# Report the size of every partition, in the same order as before.
split_shapes = {
    'Shape of x_train : ': x_train.shape,
    'Shape of y_train : ': y_train.shape,
    'Shape of x_test : ': x_test.shape,
    'Shape of y_test : ': y_test.shape,
}
for caption, shape in split_shapes.items():
    print(caption + str(shape))

Shape of x_train : (66,)
Shape of y_train : (66,)
Shape of x_test : (17,)
Shape of y_test : (17,)


In [7]:
#3
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = set(stopwords.words('english'))

In [8]:
def preprocess(text):
    """Replace digit runs, punctuation runs and all-caps words with placeholder tokens.

    Three regex substitutions are applied, in this order, to every string
    element of ``text``:

    1. two or more consecutive digits               -> ``' num '``
    2. two or more consecutive chars from ``!@#*``  -> ``' punct '``
    3. two or more consecutive uppercase ``A-Z``    -> ``' caps '``

    Parameters
    ----------
    text : pandas.Series
        Documents to normalise. The input object is left unmodified.

    Returns
    -------
    pandas.Series
        A new Series with the same index and the substitutions applied.
    """
    # Raw strings stop '\d' from being parsed as an (invalid) string escape.
    # Chaining non-inplace replace() calls yields a fresh Series, so the
    # caller's data is never mutated and re-running a cell that calls this
    # function has no hidden side effects.
    return (
        text.replace(r'[\d][\d]+', ' num ', regex=True)
        .replace(r'[!@#*][!@#*]+', ' punct ', regex=True)
        .replace(r'[A-Z][A-Z]+', ' caps ', regex=True)
    )

In [9]:
# Normalise both partitions with the same placeholder substitutions.
x_train, x_test = (preprocess(partition) for partition in (x_train, x_test))

In [10]:
# Learn the TF-IDF vocabulary on the training documents only, then reuse the
# fitted vectorizer for the held-out documents so both share one feature space.
vectorizer = TfidfVectorizer(stop_words=stopwords)
x_train_vec = vectorizer.fit_transform(raw_documents=x_train)
x_test_vec = vectorizer.transform(raw_documents=x_test)

In [11]:
# (documents, vocabulary size) for each vectorized partition.
vectorized_shapes = {
    'Shape of x_train : ': x_train_vec.shape,
    'Shape of x_test : ': x_test_vec.shape,
}
for caption, shape in vectorized_shapes.items():
    print(caption + str(shape))

Shape of x_train : (66, 7757)
Shape of x_test : (17, 7757)


In [12]:
# 4. Baseline classifier: Bernoulli Naive Bayes on the TF-IDF features.
from sklearn.naive_bayes import BernoulliNB

naive_bayes = BernoulliNB()
naive_bayes.fit(X=x_train_vec, y=y_train)

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the baseline model on the held-out documents.
pred = naive_bayes.predict(x_test_vec)
nb_accuracy = accuracy_score(y_test, pred)
print('Accuracy for Bernoulli Naive Bayes: ', nb_accuracy)

Accuracy for Bernoulli Naive Bayes:  0.5882352941176471


In [14]:
# 5. Second attempt: cap the vocabulary at 1000 features and add bigrams.
tfidf_options = dict(stop_words=stopwords, max_features=1000, ngram_range=(1, 2))
vectorizer = TfidfVectorizer(**tfidf_options)
x_train_vec = vectorizer.fit_transform(raw_documents=x_train)
x_test_vec = vectorizer.transform(raw_documents=x_test)

In [18]:
# Refit Naive Bayes on the restricted unigram+bigram features and re-score.
naive_bayes = BernoulliNB().fit(x_train_vec, y_train)
pred = naive_bayes.predict(x_test_vec)
nb_ngram_accuracy = accuracy_score(y_test, pred)
print('Accuracy for Bernoulli Naive Bayes with max features and ngrams: ', nb_ngram_accuracy)

Accuracy for Bernoulli Naive Bayes with max features and ngrams:  0.9411764705882353


In [16]:
# 6. Logistic regression on default TF-IDF features, wrapped in a pipeline.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

# Author's note: adding the solver and the balanced class weights took the
# accuracy from .58 to 1.
logreg_clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    class_weight='balanced',
)
pipe1 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('logreg', logreg_clf),
])
pipe1.fit(x_train, y_train)

# Accuracy = fraction of held-out papers whose predicted author matches.
pred = pipe1.predict(x_test)
logreg_hits = pred==y_test
print("Accuracy for Logistic regression: ", np.mean(logreg_hits))

Accuracy for Logistic regression:  1.0


In [17]:
# 7. Small multi-layer perceptron on default TF-IDF features.
from sklearn.neural_network import MLPClassifier

# hidden_layer_sizes=(10, 5) with a fixed seed, same settings as before.
mlp_clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(10, 5),
    random_state=1234,
)
pipe1 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('neuralnet', mlp_clf),
])
pipe1.fit(x_train, y_train)

# Accuracy = fraction of held-out papers whose predicted author matches.
pred = pipe1.predict(x_test)
mlp_hits = pred==y_test
print("Accuracy for Neural Network regression: ", np.mean(mlp_hits))

Accuracy for Neural Network regression:  0.8823529411764706
