### Spooky Author Identification
**Inspiration**: dataschool.io, Kevin Markham

**Aim**: get the gist of this dataset using his method

### preliminaries
Import the libraries, modules, and data.

In [2]:
### Import libraries and modules
import pandas as pd # dataframes
import numpy as np
from sklearn import metrics

### Read in the data

# read our data into a dataframe
texts = pd.read_csv("train.csv")

In [3]:
### Examine the file
# First records
texts.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Shape texts
texts.shape

(19579, 3)

In [5]:
# Value Counts
texts.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

### feature extraction
Define the training set, cross validate by splitting the data first.

In [6]:
### Define X and y, train and test
X = texts.text
y = texts.author

In [7]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [9]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
# examine the fitted vocabulary
vect.get_feature_names()

['aaem',
 'ab',
 'aback',
 'abaft',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abaout',
 'abasement',
 'abashed',
 'abashment',
 'abate',
 'abated',
 'abatement',
 'abating',
 'abbey',
 'abbreviation',
 'abbé',
 'abdicated',
 'abdication',
 'abdications',
 'abdul',
 'abernethy',
 'aberrancy',
 'aberrant',
 'aberration',
 'aberrations',
 'abeyance',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abigail',
 'abilities',
 'ability',
 'abject',
 'abjure',
 'ablaze',
 'able',
 'ably',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'aboard',
 'abode',
 'abodes',
 'abolished',
 'abominable',
 'abomination',
 'abominations',
 'aboriginal',
 'abortion',
 'abortions',
 'abortive',
 'aboundingly',
 'abounds',
 'about',
 'above',
 'abra',
 'abreast',
 'abroad',
 'abrupt',
 'abruptly',
 'abruptness',
 'absence',
 'absences',
 'absense',
 'absent',
 'absolute',
 'absolutely',
 'absolved',
 'absorb',
 'absorbed',
 'absorbing',
 'absorption',
 'abstaining',
 'abs

In [11]:
X_train_dtm = vect.transform(X_train)

In [12]:
X_test_dtm = vect.transform(X_test)

In [13]:
X_train_dtm.shape, X_test_dtm.shape

((14684, 22566), (4895, 22566))

### machine learning / process
#### naive bayes classifier

In [81]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [82]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 42 ms, sys: 2.69 ms, total: 44.7 ms
Wall time: 43.5 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [83]:
# predict with the test case
y_pred_class = nb.predict(X_test_dtm)

In [84]:
# metrics review
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.83186925434116443

In [85]:
# examine the class distribution of the testing set (using a Pandas Series method)
y_test.value_counts()

EAP    1994
MWS    1532
HPL    1369
Name: author, dtype: int64

In [86]:
# null accuracy: the classification accuracy where it predicts the most populated class
nacc = y_test.value_counts().head(1)/len(y_test)
nacc

EAP    0.407354
Name: author, dtype: float64

In [87]:
# confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

[[1653  115  226]
 [ 174 1102   93]
 [ 149   66 1317]]


In [88]:
## print classification report
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.84      0.83      0.83      1994
        HPL       0.86      0.80      0.83      1369
        MWS       0.81      0.86      0.83      1532

avg / total       0.83      0.83      0.83      4895



#### logistics regression
Inspired by Naive Bayes mini-project. 

In [89]:
# import and instantiate
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [90]:
# train the model using X_train_dtm
# Naive Bayes can't take a negative number; logreg can!
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 3.66 s, sys: 51.6 ms, total: 3.72 s
Wall time: 955 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [91]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [92]:
# calculate predicted probabilities for X_test_dtm (well-calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:,1]
y_pred_prob

array([ 0.14369324,  0.05195704,  0.00575021, ...,  0.00537358,
        0.29081815,  0.01282771])

In [93]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.82410623084780388

In [94]:
# confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

[[1736  113  145]
 [ 227 1062   80]
 [ 214   82 1236]]


In [95]:
## print classification report
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.80      0.87      0.83      1994
        HPL       0.84      0.78      0.81      1369
        MWS       0.85      0.81      0.83      1532

avg / total       0.83      0.82      0.82      4895



#### random forest classifier 
Recommended by mentor, AJ Sanchez, to check performance of model prediction via Random Forest.

In [96]:
from sklearn.ensemble import RandomForestClassifier
rfst = RandomForestClassifier()

In [97]:
%time rfst.fit(X_train_dtm, y_train)

CPU times: user 2.91 s, sys: 11.9 ms, total: 2.92 s
Wall time: 2.92 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [98]:
# make class predictions for X_test_dtm
y_pred_class = rfst.predict(X_test_dtm)

In [99]:
# calculate predicted probabilities for X_test_dtm (well-calibrated)
y_pred_prob = rfst.predict_proba(X_test_dtm)[:,1]

In [100]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.62022471910112364

In [101]:
# confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

[[1678  144  172]
 [ 640  580  149]
 [ 615  139  778]]


In [102]:
## print classification report
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.57      0.84      0.68      1994
        HPL       0.67      0.42      0.52      1369
        MWS       0.71      0.51      0.59      1532

avg / total       0.64      0.62      0.61      4895



#### TF-IDF Weighting for Term Importance
The following are inspired by CS109 Naive Bayes Mini-Project.

In [103]:
# instantiate estimator
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer()

In [104]:
Xtfidf_train=tfidfvectorizer.fit_transform(X_train)

In [105]:
Xtfidf_test=tfidfvectorizer.transform(X_test)

In [106]:
Xtfidf_train.shape, Xtfidf_test.shape

((14684, 22566), (4895, 22566))

In [107]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [108]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(Xtfidf_train, y_train)

CPU times: user 42.1 ms, sys: 2.75 ms, total: 44.8 ms
Wall time: 43.2 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [109]:
# predict with the test case
y_pred_class = nb.predict(Xtfidf_test)

In [110]:
metrics.accuracy_score(y_test, y_pred_class)

0.81470888661899898

In [111]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
confusion

array([[1841,   44,  109],
       [ 373,  922,   74],
       [ 276,   31, 1225]])

In [112]:
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.74      0.92      0.82      1994
        HPL       0.92      0.67      0.78      1369
        MWS       0.87      0.80      0.83      1532

avg / total       0.83      0.81      0.81      4895



#### Latent Dirichlet Allocation (LDA)

In [113]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=20, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(Xtfidf_train)

In [114]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx), " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [115]:
no_top_words = 10
tf_feature_names = tfidfvectorizer.get_feature_names()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0: est snake malitia vetus ta faire vous histoire tickle digne
Topic 1: dumas unlock etienne arched bawling corroborated throttled monstrum unprincipled horrendum
Topic 2: assumption assemblage herein impassioned distempered accusation hazarded coaches olathoë clump
Topic 3: selected scale du barrière roule accepted heaps nostrils improved defend
Topic 4: archaic squire moonlit vanity outfit gables disputing draft chairman esteemed
Topic 5: particulars unquestionably african picking calculating thunders rescued drift elementary pregnant
Topic 6: heh apparatus personally elastic rattled kepler matured referring exceed vapour
Topic 7: torture non principal mockery create martense yog sothoth ef disturbance
Topic 8: abruptly fierce goosetherumfoodle drum murders hum waning rowdy dow beds
Topic 9: the of and to in was my that it he
Topic 10: diddler sang il dante bedside heroic belong prayed qui keeper
Topic 11: wal aspects wretchedly overspread hez hearse spallanzani watson landaff 

In [116]:
print(lda.doc_topic_prior_)

0.05


In [117]:
lda.fit(Xtfidf_train)
lda.components_

array([[ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003],
       [ 0.05000003,  0.05000002,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003],
       [ 0.05000002,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000002],
       ..., 
       [ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003],
       [ 0.05000003,  0.05000003,  0.05000007, ...,  0.05000003,
         0.05000002,  0.05000003],
       [ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003]])

In [118]:
data_train = lda.transform(Xtfidf_train)
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
data_test = lda.transform(Xtfidf_test)

In [119]:
clf = MultinomialNB().fit(data_train, y_train)

#your turn. Print the accuracy on the test and training dataset
training_accuracy = clf.score(data_train, y_train)
test_accuracy = clf.score(data_test, y_test)

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))

Accuracy on training data: 0.402206
Accuracy on test data:     0.407354
