### Import requirements

In [6]:
%matplotlib inline

import pandas as pd
pd.set_option('chained_assignment',None)
import numpy as np
import itertools
from collections import Counter
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

import re
import string
import codecs
from nltk.corpus import stopwords

In [None]:
# model prep and evaluation
#from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 
#from sklearn.metrics import confusion_matrix, classification_report

# machine learning algorithms
#from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import GradientBoostingClassifier

### Helper functions

In [2]:
def standardize_text(df, text_field):
    """Clean the text column ``text_field`` of ``df`` and return ``df``.

    The cleaning steps are applied in this order:

    1. remove URL-like tokens (``http`` followed by non-whitespace characters),
    2. remove any remaining bare ``http``,
    3. remove ``@`` followed by non-whitespace characters,
    4. replace every character outside the allowed set (ASCII letters, digits,
       ``( ) , ! ? @ ' ` " _`` and newline) with a single space,
    5. rewrite any ``@`` that is still present as ``at``,
    6. lower-case the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``text_field`` cleaned.
    """
    # regex=True is passed explicitly: since pandas 2.0 Series.str.replace
    # defaults to literal matching, which would turn the patterns below into
    # no-ops instead of regular expressions.
    text = df[text_field]
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=True)
    df[text_field] = text.str.lower()
    return df

def plot_confusion_matrix(cm, classes, normalize=False, cmap=plt.cm.Blues):
    """
    Draw a confusion matrix on the current matplotlib figure.

    The matrix is shown as a colour-mapped image with a colour bar, the entries
    of `classes` as tick labels on both axes, and each cell's value written at
    its centre.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When True, every row is divided by its row sum and cell values are
        formatted with two decimals ('.2f'); otherwise they are formatted
        with 'd'.
    cmap : matplotlib colormap
        Colour map handed to `plt.imshow`.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells above half of the largest value get white text so the label stays
    # readable against the colour map; the rest get black text.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

### Import data

In [4]:
# Read the Federalist CSV and keep only the two columns used below:
# the text column and its author column.
df_fed = pd.read_csv("../raw_data/federalist.csv").loc[:, ["Text", "Author"]]
df_fed.head()

Unnamed: 0,Text,Author
0,AFTER an unequivocal experience of the ineffic...,Hamilton
1,WHEN the people of America reflect that they a...,Jay
2,IT IS not a new observation that the people of...,Jay
3,MY LAST paper assigned several reasons why the...,Jay
4,"QUEEN ANNE, in her letter of the 1st July, 170...",Jay


### Pre-processing

In [5]:
# standardize_text writes into the frame it is given, so hand it a copy:
# df_fed keeps the raw text (and its earlier .head() output stays accurate),
# while df_fed_standardized is a separate, cleaned frame.
df_fed_standardized = standardize_text(df_fed.copy(), "Text")
df_fed_standardized.head()

Unnamed: 0,Text,Author
0,after an unequivocal experience of the ineffic...,Hamilton
1,when the people of america reflect that they a...,Jay
2,it is not a new observation that the people of...,Jay
3,my last paper assigned several reasons why the...,Jay
4,"queen anne, in her letter of the 1st july, 170...",Jay


### Bag of words cross-validation

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [23]:
# Text and author columns used for cross-validation.
X = df_fed_standardized['Text']
y = df_fed_standardized['Author']

skf = StratifiedKFold(n_splits=5)
print(skf)

for train_index, test_index in skf.split(X, y):
    # split() yields positional row numbers, so select with .iloc; plain
    # X[train_index] is a label lookup and only matches when the index is
    # exactly 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(Counter(y_train))

    # bag of words vectorization: the vocabulary is fitted on the training
    # fold only and then applied unchanged to the test fold
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_test_counts = count_vect.transform(X_test)
    print(X_train_counts.shape, X_test_counts.shape)
    

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
Counter({'Hamilton': 40, 'Madison': 23, 'Jay': 4})
(67, 7493) (18, 7493)
Counter({'Hamilton': 41, 'Madison': 23, 'Jay': 4})
(68, 7742) (17, 7742)
Counter({'Hamilton': 41, 'Madison': 23, 'Jay': 4})
(68, 7981) (17, 7981)
Counter({'Hamilton': 41, 'Madison': 23, 'Jay': 4})
(68, 8067) (17, 8067)
Counter({'Hamilton': 41, 'Madison': 24, 'Jay': 4})
(69, 7995) (16, 7995)


In [24]:
import imblearn

ModuleNotFoundError: No module named 'imblearn'