In [97]:
from nltk.corpus import stopwords
import codecs
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC
import time

In [45]:
def read_it(item):
    f=codecs.open(item ,'r')
    f=f.read()
    f=f.replace("\n", " ")
    sent_tokens = sent_tokenize(f)
    tokens = [sent for sent in map(word_tokenize, sent_tokens)]
    list(enumerate(tokens))
    others = '“,”,’,—'
    stopwords_ = set(stopwords.words('english'))
    tokens_lower = [[word.lower() for word in sent] for sent in tokens]
    punctuation_ = set(string.punctuation)
    clean_token= [[word.replace(str(punctuation_),'') for word in sent] for sent in tokens_lower]

    def filter_tokens(sent):
        return([w for w in sent if not w in stopwords_ and not w in punctuation_ and not w in others])

    tokens_filtered = list(map(filter_tokens, clean_token))



    return tokens_filtered

In [159]:
def chunk(x, y = 30):
    l=len(x)
    v=l//y
    remainder=y
    all_series = []
    for loops in range(v+1):
        new_elem = []
        if loops > (v - 1):
            remainder = l%y
        if remainder > 0:
            for i in range(remainder):
                temp = (y * (loops)) + i
                new_elem.extend(x[temp])
            all_series.append(new_elem)
    return pd.Series(all_series)

In [41]:
def make_it_df(item,title,author):
    df = pd.DataFrame()
    df['txt']= chunk(read_it(item))
    df['title']= title
    df['author'] = author
    return df

In [169]:
bell = make_it_df("/Users/andrewargaez/Author_Classifier/WBT.txt",'For Whom the Bell Tolls','Ernest Hemmingway')
kar = make_it_df("/Users/andrewargaez/Author_Classifier/karmaz.txt", "The Brothers Karmazov", "Fyodor Dostoevsky")
gg = make_it_df("/Users/andrewargaez/Author_Classifier/GG.txt", "The Great Gatsby", "F. Scott Fitzgerald")
pp = make_it_df("/Users/andrewargaez/Author_Classifier/GG.txt", "Pride and Prejudice", "Jane Austen")
cp = make_it_df("/Users/andrewargaez/Author_Classifier/CP.txt", "Crime and Punishment", "Fyodor Dostoevsky")
fta = make_it_df("/Users/andrewargaez/Author_Classifier/FTA.txt", "A Farewell to Arms", "Ernest Hemingway")
em = make_it_df("/Users/andrewargaez/Author_Classifier/emma.txt", "Emma", "Jane Austen")
al= make_it_df("/Users/andrewargaez/Author_Classifier/alice.txt", "Alice in Woderland", "Lewis Carrol")
drac= make_it_df("/Users/andrewargaez/Author_Classifier/drac.txt", "Dracula", "Bram Stoker")



In [171]:
new_df =pd.concat([bell, kar,cp,fta, gg, pp,em,al,drac],ignore_index=True)

In [162]:
def squeaky_clean(new_df):
    others = '“,”,’,—,_,.,——'
    vals = list(new_df['txt'].values)
    arr=[]

    for _sent in vals:
        sent=[]
        for word in _sent:
            for char in others:
                word = word.replace(char,'')
            sent.append(word)      
        arr.append(str(sent))
    return pd.Series(arr)

In [172]:
new_df['txt']= squeaky_clean(new_df)

In [164]:
def get_data(item):
    data=item.txt
    labels = item.author
    le = LabelEncoder()
    y = le.fit_transform(labels)
    return np.array(data), np.array(y)

In [165]:
def tune_naive_bayes(data, y):
    print("tuning naive bayes...")
    kfold = KFold(5)
    alphas = np.concatenate((np.arange(0, 0.1, 0.02), np.arange(.1, 1.3, 0.1)))
    scores = defaultdict(list)
    for train_index, test_index in kfold.split(data):
        data_train, data_test = data[train_index], data[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(data_train)
        print(X_train.shape)
        X_test = tfidf.transform(data_test)
        for alpha in alphas:
            nb = MultinomialNB(alpha=alpha)
            nb.fit(X_train, y_train)
            scores[alpha].append(nb.score(X_test, y_test))

    print("alpha  score")
    for alpha in alphas:
        print(" %.2f  %f" % (alpha, np.average(scores[alpha])))

In [166]:
def run_models(data, y,hold_x,hold_y):
    data_train, data_test, y_train, y_test = train_test_split(data, y)
    

    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(data_train).toarray()
    X_test = tfidf.transform(data_test).toarray()


    print("running models...")
    models = [("Random Forest", RandomForestClassifier()),
              ("Decision Tree", DecisionTreeClassifier()),
              ("kNN", KNeighborsClassifier()),  
              ("Naive Bayes", MultinomialNB()),
              ("SVM", OneVsRestClassifier(SVC())),
              ("Logistic", OneVsRestClassifier(LogisticRegression()))]

    print("%20s %7s %9s %9s" % ("Name", "Score", "TrainTime", "TestTime"))

    for name, model in models:
        start = time.time()
        model.fit(X_train, y_train)
        trained = time.time()
        score = model.score(X_test, y_test)
        tested = time.time()

        # Silly stuff to make it print nicely
        print("%20s   %.3f %9s %9s" % (name, score,
                                       str(round(trained - start, 2)),
                                       str(round(tested - trained, 2))))

In [167]:
datat, yt = get_data(test_df)

In [173]:
data, y = get_data(new_df)
tune_naive_bayes(data, y)
run_models(data, y,datat,yt)

tuning naive bayes...
(1880, 24443)




(1881, 25048)




(1881, 25199)




(1881, 24297)




(1881, 22034)
alpha  score
 0.00  0.522483
 0.02  0.528440
 0.04  0.528015
 0.06  0.526313
 0.08  0.523759
 0.10  0.521632
 0.20  0.493121
 0.30  0.472270
 0.40  0.458653
 0.50  0.454398
 0.60  0.450568
 0.70  0.447589
 0.80  0.447164
 0.90  0.446738
 1.00  0.446313
 1.10  0.446313
 1.20  0.446313




running models...
                Name   Score TrainTime  TestTime
       Random Forest   0.889      7.95      0.08
       Decision Tree   0.854      8.43      0.03
                 kNN   0.942      0.02      0.28
         Naive Bayes   0.633      0.06      0.01
                 SVM   0.939    156.56     97.91
            Logistic   0.920      5.95      0.09
