In [91]:
import json
import pandas as pd
import nltk
from collections import Counter 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report as report 
from sklearn.model_selection import train_test_split as split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder 


In [3]:
raw_train = pd.read_csv("train.csv")
raw_train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [14]:
from nltk.corpus import stopwords, wordnet
lemmatizer = WordNetLemmatizer()
def wordnet_get(tagged_tokens):
    """Helper function for normalizing wordnet labels
    """
    out_tokens = []
    for token in tagged_tokens:
        if token[1].startswith("J"):
            out_token = (token[0], wordnet.ADJ)
        elif token[1].startswith("V"):
            out_token = (token[0], wordnet.VERB)
        elif token[1].startswith("R"):
            out_token = (token[0], wordnet.ADV)
        else:
            out_token = (token[0], wordnet.NOUN)
        out_tokens.append(out_token)
    return out_tokens

def clean_text(string, lemmatizer=lemmatizer):
    """Cleans the text by tokenizing, performing POS tagging, and lemmatizing it 
    """
    tokens = nltk.word_tokenize(string)
    pos_tagged = wordnet_get(nltk.pos_tag(tokens))
    lemmas = [lemmatizer.lemmatize(token[0], pos=token[1]).lower() for token in pos_tagged]
    return lemmas


min_clean = [" ".join(clean_text(x)) for x in raw_train["text"]]
print(min_clean[0])

this process , however , afford me no mean of ascertain the dimension of my dungeon ; a i might make it circuit , and return to the point whence i set out , without be aware of the fact ; so perfectly uniform seem the wall .


In [68]:
# try a simple Naive Bayes with just the minimal cleaning 
nb_params = { 
          "count_vec__ngram_range": [(1, 1), (1, 2), (1, 3)], 
          "count_vec__max_df": [1.0, .85, .5],
          "count_vec__stop_words": [None, "english"],
          "count_vec__binary": [True, False],
          "count_vec__norm": ["l1", "l2", None],
          "count_vec__use_idf": [False, True]} 

nb_estimators = [("count_vec", TfidfVectorizer()), 
              ("NB", MultinomialNB())]
nb_model = Pipeline(nb_estimators)
nb_grid = GridSearchCV(estimator=nb_model, param_grid = nb_params)
nb_grid.fit(min_clean, raw_train["author"]) 
print(nb_grid.best_params_)

{'count_vec__binary': False, 'count_vec__max_df': 1.0, 'count_vec__ngram_range': (1, 2), 'count_vec__norm': None, 'count_vec__stop_words': None, 'count_vec__use_idf': True}


In [71]:
nb_estimators = [("count_vectorizer", TfidfVectorizer(max_df=1.0, ngram_range=(1, 2), norm=None, stop_words=None)), 
                  ("NB", MultinomialNB())]
clf1 = Pipeline(nb_estimators)
scores = cross_val_score(clf1, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), "Naive Bayes"))

Accuracy: 0.84 (+/- 0.00) [Naive Bayes]


In [73]:
lr_params = { 
          "count_vec__ngram_range": [(1, 1), (1, 2)],
          "count_vec__binary": [True, False],
          "count_vec__norm": [None],
          "count_vec__use_idf": [False, True]} 

lr_estimators = [("count_vec", TfidfVectorizer()), 
              ("LR", LogisticRegression())]
lr_model = Pipeline(lr_estimators)
lr_grid = GridSearchCV(estimator=lr_model, param_grid = lr_params)
lr_grid.fit(min_clean, raw_train["author"]) 
print(lr_grid.best_params_)

{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'count_vec__norm': None, 'count_vec__use_idf': True}


In [74]:
lr_estimators1 = [("count_vectorizer", TfidfVectorizer(max_df=1.0, ngram_range=(1, 2), stop_words=None,
                                                      binary=True, use_idf=True, norm=None)), 
                  ("LR", LogisticRegression())]
clf2 = Pipeline(lr_estimators1)
scores2 = cross_val_score(clf2, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores2.mean(), scores2.std(), "Logistic Regression"))

Accuracy: 0.82 (+/- 0.00) [Logistic Regression]


In [75]:
# Try an SVM with a Tfidf Matrix 
svm_params = { 
          "count_vec__ngram_range": [(1, 1), (1, 2)],
          "count_vec__binary": [True, False],
          "count_vec__norm": [None],
          "count_vec__use_idf": [False, True]}

svm_estimators = [("count_vec", TfidfVectorizer()), 
              ("SVM", LinearSVC())]
svm_model = Pipeline(svm_estimators)
svm_grid = GridSearchCV(estimator=svm_model, param_grid = svm_params)
svm_grid.fit(min_clean, raw_train["author"]) 
print(svm_grid.best_params_)svm_estimators1 = [("count_vectorizer", TfidfVectorizer(max_df=1.0, ngram_range=(1, 2), stop_words=None)), 
                  ("SVM", LinearSVC())]
clf3 = Pipeline(svm_estimators1)
scores3 = cross_val_score(clf3, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores3.mean(), scores3.std(), "SVM"))

{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'count_vec__norm': None, 'count_vec__use_idf': True}


In [81]:
svm_estimators1 = [("count_vectorizer", TfidfVectorizer(max_df=1.0, ngram_range=(1, 2), stop_words=None)), 
                  ("SVM", LinearSVC())]
clf3 = Pipeline(svm_estimators1 )
scores3 = cross_val_score(clf3, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores3.mean(), scores3.std(), "SVM"))

Accuracy: 0.83 (+/- 0.01) [SVM]


In [130]:
# try adding in the pos tags
def get_pos(string):
    tokens = nltk.word_tokenize(string)
    pos_tagged = nltk.pos_tag(tokens)
    return [x[1] for x in pos_tagged]


pos_only = [" ".join(get_pos(x)) for x in raw_train["text"]]
print(pos_only[0])


DT NN , RB , VBD PRP DT NNS IN VBG DT NNS IN PRP$ NN : IN PRP MD VB PRP$ NN , CC NN TO DT NN NN PRP VBP RP , IN VBG JJ IN DT NN : RB RB JJ VBD DT NN .


In [49]:
# pos_plus_words = [" ".join(x[1], x[2]) for x in zip(min_clean, pos_only)]
pos_plus_words = [" ".join(x) for x in zip(min_clean, pos_only)]
print(pos_plus_words[0])

this process , however , afford me no mean of ascertain the dimension of my dungeon ; a i might make it circuit , and return to the point whence i set out , without be aware of the fact ; so perfectly uniform seem the wall . DT NN , RB , VBD PRP DT NNS IN VBG DT NNS IN PRP$ NN : IN PRP MD VB PRP$ NN , CC NN TO DT NN NN PRP VBP RP , IN VBG JJ IN DT NN : RB RB JJ VBD DT NN .


In [50]:
nb_params2 = {"count_vec__min_df": [1, 3, 10],
              "count_vec__use_idf":[True, False]
          "count_vec__ngram_range": [(1, 1), (1, 2), (1, 3), (3, 3)], 
          "count_vec__max_df": [1.0, .5]} 

nb_estimators2 = [("count_vec", CountVectorizer()), 
              ("NB", MultinomialNB())]
nb_model2 = Pipeline(nb_estimators2)
nb_grid2 = GridSearchCV(estimator=nb_model2, param_grid = nb_params2)
nb_grid2.fit(pos_plus_words, raw_train["author"]) 
print(nb_grid2.best_params_)
          "count_vec__ngram_range": [(1, 1), (1, 2), (1, 3), (3, 3)], 
          "count_vec__max_df": [1.0, .5]} 

nb_estimators2 = [("count_vec", CountVectorizer()), 
              ("NB", MultinomialNB())]
nb_model2 = Pipeline(nb_estimators2)
nb_grid2 = GridSearchCV(estimator=nb_model2, param_grid = nb_params2)
nb_grid2.fit(pos_plus_words, raw_train["author"]) 
print(nb_grid2.best_params_)

{'count_vec__max_df': 0.5, 'count_vec__min_df': 1, 'count_vec__ngram_range': (1, 1)}


In [51]:
# nb_2
nb_estimators2 = [("count_vectorizer", CountVectorizer(max_df=.5, ngram_range=(1, 1), stop_words=None)), 
                  ("NB", MultinomialNB())]
clf4 = Pipeline(nb_estimators2)
scores4 = cross_val_score(clf4, pos_plus_words, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores4.mean(), scores4.std(), "Naive Bayes"))

Accuracy: 0.83 (+/- 0.00) [Naive Bayes]


In [86]:
svm_estimators2 = [("count_vectorizer", TfidfVectorizer(max_df=1.0, ngram_range=(1, 2), stop_words=None)), 
                  ("SVM", LinearSVC())]
clf5 = Pipeline(svm_estimators2)
scores5 = cross_val_score(clf5, pos_plus_words, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores5.mean(), scores5.std(), "SVM"))

Accuracy: 0.83 (+/- 0.00) [SVM]


In [56]:
lr_params2 = {"count_vec__min_df": [1, 3, 10], 
          "count_vec__ngram_range": [(1, 1), (1, 2), (1, 3), (3, 3)], 
          "count_vec__max_df": [1.0, .75, .5]} 

lr_estimators2 = [("count_vec", CountVectorizer()), 
              ("LR", LogisticRegression())]
lr_model2 = Pipeline(lr_estimators2)
lr_grid2 = GridSearchCV(estimator=lr_model2, param_grid = lr_params2)
lr_grid2.fit(pos_plus_words, raw_train["author"]) 
print(lr_grid2.best_params_)

{'count_vec__max_df': 1.0, 'count_vec__min_df': 1, 'count_vec__ngram_range': (1, 1)}


In [58]:
lr_estimators2 = [("count_vectorizer", CountVectorizer(max_df=1, ngram_range=(1, 1), stop_words=None)), 
                  ("LR", LogisticRegression())]
clf6 = Pipeline(lr_estimators2)
scores6 = cross_val_score(clf6, pos_plus_words, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores6.mean(), scores6.std(), "LR"))

Accuracy: 0.46 (+/- 0.00) [LR]


In [92]:
# Adaboost
ada_params = { 
          "count_vec__ngram_range": [(1, 1), (1, 2)],
          "count_vec__binary": [True, False]} 

ada_estimators = [("count_vec", CountVectorizer()), 
              ("Ada", AdaBoostClassifier())]
ada_model = Pipeline(ada_estimators)
ada_grid = GridSearchCV(estimator=ada_model, param_grid = ada_params)
ada_grid.fit(pos_plus_words, raw_train["author"]) 
print(ada_grid.best_params_)

{'count_vec__binary': False, 'count_vec__ngram_range': (1, 2)}


In [100]:
ada_estimators = [("count_vectorizer", CountVectorizer(max_df=1.0, ngram_range=(1, 2), stop_words=None)), 
                  ("Ada", AdaBoostClassifier())]
clf7 = Pipeline(ada_estimators)
scores7 = cross_val_score(clf7, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores7.mean(), scores7.std(), "Ada"))

Accuracy: 0.60 (+/- 0.00) [Ada]


In [96]:
# Random Forest
rf_params = { 
          "count_vec__ngram_range": [(1, 1), (1, 2)],
          "count_vec__binary": [True, False]} 

rf_estimators = [("count_vec", CountVectorizer()), 
              ("RF", RandomForestClassifier())]
rf_model = Pipeline(rf_estimators)
rf_grid = GridSearchCV(estimator=rf_model, param_grid = rf_params)
rf_grid.fit(pos_plus_words, raw_train["author"]) 
print(rf_grid.best_params_)

{'count_vec__binary': True, 'count_vec__ngram_range': (1, 1)}


In [120]:
rf_estimators = [("count_vectorizer", CountVectorizer(max_df=1.0, ngram_range=(1, 1), binary=True, stop_words=None)), 
                  ("RF", RandomForestClassifier())]
clf8 = Pipeline(rf_estimators)
scores8 = cross_val_score(clf8, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores8.mean(), scores8.std(), "RF"))

Accuracy: 0.62 (+/- 0.01) [RF]


In [129]:
clf4 = Pipeline(nb_estimators2)
# training
clf_batch_1 = [clf1, clf2, clf7, clf8, clf4]
clf_batch_2 = [clf1, clf2, clf3]
eclf = EnsembleVoteClassifier(clfs=clf_batch_1, weights=[1, 1, .5, .5, 1], voting="soft")
ensemble_scores = cross_val_score(eclf, min_clean, raw_train["author"], 
                                              cv=3, 
                                              scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (ensemble_scores.mean(), ensemble_scores.std(), "Ensemble"))

Accuracy: 0.85 (+/- 0.00) [Ensemble]


In [133]:
eclf = EnsembleVoteClassifier(clfs=clf_batch_1, weights=[1, 1, 1, .25, .25, 1])
eclf.fit(min_clean, raw_train["author"])

EnsembleVoteClassifier(clfs=[Pipeline(steps=[('count_vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=None, preprocesso...nizer=None, vocabulary=None)), ('NB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])],
            refit=True, verbose=0, voting='hard',
            weights=[1, 1, 1, 0.25, 0.25])

In [112]:
# read in test data
raw_test = pd.read_csv("test.csv")
raw_test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [116]:
min_clean_test = [" ".join(clean_text(x)) for x in raw_test["text"]]
print(min_clean_test[0])

still , a i urge our leave ireland with such inquietude and impatience , my father think it best to yield .


In [134]:
predictions = eclf.predict(min_clean_test)
print(predictions[0])

MWS


In [135]:
output_pres = eclf.predict_proba(min_clean_test)
print(output_pres[0])

[ 0.11357251  0.09375334  0.79267416]
