### Spooky Author Identification
**Inspiration**: dataschool.io, Kevin Markham

**Aim**: get the gist of this dataset using his method

In [70]:
### Import libraries and modules
import pandas as pd # dataframes
import numpy as np
from sklearn import metrics

### Read in the data

# read our data into a dataframe
texts = pd.read_csv("train.csv")

In [2]:
### Examine the file
# First records
texts.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
# Shape texts
texts.shape

(19579, 3)

In [4]:
# Value Counts
texts.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [6]:
### Define X and y, train and test
X = texts.text
y = texts.author

In [9]:
type(X)
type(y)

pandas.core.series.Series

In [10]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(14684,)
(4895,)
(14684,)
(4895,)




In [11]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [12]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
# examine the fitted vocabulary
vect.get_feature_names()

['aaem',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abaout',
 'abashed',
 'abate',
 'abated',
 'abatement',
 'abbey',
 'abbreviation',
 'abbé',
 'abdicated',
 'abdication',
 'abdul',
 'abernethy',
 'aberrancy',
 'aberrant',
 'aberration',
 'aberrations',
 'abeyance',
 'abhor',
 'abhorred',
 'abhorrence',
 'abhorrent',
 'abide',
 'abigail',
 'abijah',
 'abilities',
 'ability',
 'abject',
 'ablaze',
 'able',
 'ably',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormally',
 'aboard',
 'abode',
 'abodes',
 'abolished',
 'abominable',
 'abomination',
 'abominations',
 'aboriginal',
 'abortion',
 'abortions',
 'abortive',
 'about',
 'above',
 'abra',
 'abreast',
 'abroad',
 'abrupt',
 'abruptly',
 'abruptness',
 'absconded',
 'absence',
 'absences',
 'absense',
 'absent',
 'absolute',
 'absolutely',
 'absolved',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbingly',
 'absorption',
 'abstained',
 'abstaining',
 'abstemious',
 'abstract',
 'abstracted',
 '

In [14]:
X_train_dtm = vect.transform(X_train)

In [15]:
X_test_dtm = vect.transform(X_test)

In [16]:
X_train_dtm.shape, X_test_dtm.shape

((14684, 22328), (4895, 22328))

In [19]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [21]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 42.1 ms, sys: 2.63 ms, total: 44.7 ms
Wall time: 42.9 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
# predict with the test case
y_pred_class = nb.predict(X_test_dtm)

In [23]:
# metrics review
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.83575076608784471

In [24]:
# examine the class distribution of the testing set (using a Pandas Series method)
y_test.value_counts()

EAP    1976
MWS    1491
HPL    1428
Name: author, dtype: int64

In [31]:
# null accuracy: the classification accuracy where it predicts the most populated class
nacc = y_test.value_counts().head(1)/len(y_test)
nacc

EAP    0.403677
Name: author, dtype: float64

In [32]:
# confusion matrix
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

[[1651  110  215]
 [ 183 1143  102]
 [ 139   55 1297]]


In [33]:
## Print the classification report
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.84      0.84      0.84      1976
        HPL       0.87      0.80      0.84      1428
        MWS       0.80      0.87      0.84      1491

avg / total       0.84      0.84      0.84      4895



### TF-IDF Weighting for Term Importance
The following are inspired by CS109 Naive Bayes Mini-Project.

In [34]:
# instantiate estimator
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer = TfidfVectorizer()

In [41]:
Xtfidf_train=tfidfvectorizer.fit_transform(X_train)

In [42]:
Xtfidf_test=tfidfvectorizer.transform(X_test)

In [43]:
Xtfidf_train.shape, Xtfidf_test.shape

((14684, 22328), (4895, 22328))

In [44]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [45]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(Xtfidf_train, y_train)

CPU times: user 42 ms, sys: 3.01 ms, total: 45 ms
Wall time: 43.4 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
# predict with the test case
y_pred_class = nb.predict(Xtfidf_test)

In [47]:
metrics.accuracy_score(y_test, y_pred_class)

0.80163432073544438

In [51]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
confusion

array([[1819,   28,  129],
       [ 420,  913,   95],
       [ 277,   22, 1192]])

In [54]:
crpt = metrics.classification_report(y_test, y_pred_class)
print(crpt)

             precision    recall  f1-score   support

        EAP       0.72      0.92      0.81      1976
        HPL       0.95      0.64      0.76      1428
        MWS       0.84      0.80      0.82      1491

avg / total       0.82      0.80      0.80      4895



### Latent Dirichlet Allocation (LDA)

In [56]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=20, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(Xtfidf_train)

In [57]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx), " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [58]:
no_top_words = 10
tf_feature_names = tfidfvectorizer.get_feature_names()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0: seamen shouldn weedy surrender timber cuttings conquering osborne growled caked
Topic 1: crumpled abbey horrendum monstrum messages delightfully soiled gutters wordsworth unprincipled
Topic 2: messrs religion piously silva ricci turkey comeagain voluble averse ninny
Topic 3: inaccessible referring profundity vastness entities cryptograph dipped venus flows gaieties
Topic 4: personage zadok tumult requires possessor sledge dismay defeat preceded await
Topic 5: gather unquestionably oversight scant vase italians bawling delphinus management bust
Topic 6: yxu independence drowsiness unfavourable constrained consultations sx jxhn dxn gx
Topic 7: sonnets shakespeare shakspeare sham jovial ut magus pugnacious procuring occupant
Topic 8: july obeyed judgment retreated sympathize rings merry filling crucifix limitless
Topic 9: dismiss drunk bowers perfumes everlastingly britannica encyclopædia pundita needn queerer
Topic 10: appearances infirmity kings acknowledge working overboard oc

In [59]:
print(lda.doc_topic_prior_)

0.05


In [62]:
lda.fit(Xtfidf_train)
lda.components_

array([[ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003],
       [ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000002,  0.05000003],
       [ 0.05000003,  0.05000002,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003],
       ..., 
       [ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000002,  0.05000002],
       [ 0.05000004,  0.05000003,  0.05000003, ...,  0.05000002,
         0.05000003,  0.05000003],
       [ 0.05000003,  0.05000003,  0.05000003, ...,  0.05000003,
         0.05000003,  0.05000003]])

In [71]:
data_train = lda.transform(Xtfidf_train)
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
data_test = lda.transform(Xtfidf_test)

In [73]:
clf = MultinomialNB().fit(data_train, y_train)

#your turn. Print the accuracy on the test and training dataset
training_accuracy = clf.score(data_train, y_train)
test_accuracy = clf.score(data_test, y_test)

print("Accuracy on training data: {:2f}".format(training_accuracy))
print("Accuracy on test data:     {:2f}".format(test_accuracy))

Accuracy on training data: 0.403432
Accuracy on test data:     0.403677
