# sai: spooky author identification
## analysis 4: feature importance (mini_test)

## strategy
This notebook rebuilds the dataset using the work done in the first two analyses.  This time, however, the focus is on feature importance.  Feature importance asks: "What are the most important and predictive features for each author?"  In other words, which words resonate with which author?

## code
### preliminaries
This is the standard setup: it loads the libraries and modules needed to perform the analysis.  Afterwards, it reads a simple CSV file into a DataFrame called `texts`.

In [1]:
# NOTE(review): disabled reference snippet, kept commented out. It ranks words
# by the log-probability a classifier assigns to one-word documents (the rows
# of an identity matrix). The names `vectorizer`, `clf` and `xtest` are not
# defined in the visible part of this notebook, and the "fresh" wording looks
# like it comes from a different task -- adapt before enabling.
# words = np.array(vectorizer.get_feature_names())

# x = np.eye(xtest.shape[1])                    # identity matrix
# probs = clf.predict_log_proba(x)[:, 0]        # logistics regression probability
# ind = np.argsort(probs)                       # organizes words with probabilities

# good_words = words[ind[:10]]
# bad_words = words[ind[-10:]]

# good_prob = probs[ind[:10]]
# bad_prob = probs[ind[-10:]]

# print("Good words\t     P(fresh | word)")
# for w, p in zip(good_words, good_prob):
#     print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))
    
# print("Bad words\t     P(fresh | word)")
# for w, p in zip(bad_words, bad_prob):
#     print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

In [2]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# sklearn
from sklearn.cross_validation import train_test_split             # cross-validation
from sklearn.feature_extraction.text import CountVectorizer       # vectorizer
from sklearn.naive_bayes import MultinomialNB                     # classifier
from sklearn.linear_model import LogisticRegression               # classifier
from sklearn.model_selection import GridSearchCV                  # parameter tuning
from sklearn.pipeline import Pipeline                             # pipeline
from sklearn import metrics                                       # metrics


# other modules
from stop_words import get_stop_words
from pprint import pprint

# Read training texts: texts
texts = pd.read_csv('train.csv')



In [67]:
# English stop-word list from the `stop_words` package; it is handed to the
# CountVectorizer further down so that these common words are not used as features
stop_words = get_stop_words('english')

In [30]:
# features are the raw text column, the target is the author label column
X, y = texts['text'], texts['author']

In [31]:
# hold-out split (a single train/test partition, not k-fold cross-validation).
# stratify=y keeps the author proportions the same in both parts.
# The split is seeded so that Restart & Run All reproduces the same partition,
# and therefore the same vocabulary and token counts, on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=SPLIT_SEED)

In [68]:
# Experiment 1
# vectorizer: binary presence/absence features over unigrams and bigrams, with
# the stop-word list applied and max_df=0.5 as the document-frequency cap
vect_A1 = CountVectorizer(
    binary=True,
    ngram_range=(1, 2),
    stop_words=stop_words,
    max_df=0.5,
)

# classifiers
nb_A1 = MultinomialNB(alpha=0.1)
log_reg = LogisticRegression(C=2.0)

In [69]:
# build both document-term matrices: the vocabulary is learned from the
# training split only and then reused, unchanged, for the test split
X_train_dtm = vect_A1.fit_transform(X_train)
X_test_dtm = vect_A1.transform(X_test)

# fit Naive Bayes on the training matrix (fit returns the estimator itself)
# and predict on the training split, then on the test split
y_pred_train = nb_A1.fit(X_train_dtm, y_train).predict(X_train_dtm)
y_pred_test = nb_A1.predict(X_test_dtm)

In [70]:
# fit logistic regression on the training document-term matrix
log_reg.fit(X_train_dtm, y_train)

# predict on both splits.
# X_test_dtm is reused from the Naive Bayes cell: the vectorizer has not been
# refit since then, so transforming X_test a second time would only rebuild an
# identical matrix.
# NOTE: y_pred_train / y_pred_test now hold the logistic-regression predictions
# and replace the Naive Bayes predictions stored under the same names.
y_pred_train = log_reg.predict(X_train_dtm)
y_pred_test = log_reg.predict(X_test_dtm)

### Examine for Further Insight

In [71]:
# store vocabulary of X_train as a plain Python list of strings.
# Newer scikit-learn releases expose get_feature_names_out() (which returns an
# array, hence .tolist()), while the older get_feature_names() has been
# deprecated and then removed; use whichever accessor this installation has.
if hasattr(vect_A1, 'get_feature_names_out'):
    X_train_tokens = vect_A1.get_feature_names_out().tolist()
else:
    X_train_tokens = vect_A1.get_feature_names()
len(X_train_tokens)

182939

In [72]:
# show the first 50 vocabulary entries (a mix of unigrams and bigrams)
print(X_train_tokens[:50])

['aaem', 'ab', 'ab te', 'aback', 'aback breeze', 'aback reading', 'abaft', 'abaft engine', 'abandon', 'abandon considered', 'abandon expedition', 'abandon four', 'abandon hope', 'abandon idea', 'abandon purpose', 'abandon quest', 'abandon search', 'abandon strangers', 'abandoned', 'abandoned abortion', 'abandoned ah', 'abandoned associates', 'abandoned attempts', 'abandoned barn', 'abandoned bitterness', 'abandoned characters', 'abandoned city', 'abandoned dilapidated', 'abandoned gold', 'abandoned granary', 'abandoned halls', 'abandoned house', 'abandoned implicitly', 'abandoned owners', 'abandoned party', 'abandoned pursuit', 'abandoned railway', 'abandoned uncomplaining', 'abandoned utterly', 'abandoned wharves', 'abandoned without', 'abandoning', 'abandoning legitimate', 'abandoning route', 'abandoning terror', 'abandonment', 'abandonment driven', 'abandonment forbidden', 'abandonment impulses', 'abaout']


In [73]:
# slice last 50 tokens
print(X_train_tokens[-50:])

['zigzagged drunkenly', 'zigzagging', 'zigzagging along', 'zigzagging line', 'zimmer', 'zimmer bent', 'zimmer curious', 'zimmer undoubtedly', 'zit', 'zit still', 'zit zide', 'zobna', 'zobna advance', 'zodiac', 'zodiac question', 'zodiacal', 'zodiacal light', 'zokkar', 'zokkar olden', 'zone', 'zone former', 'zone running', 'zorry', 'zuro', 'zuro sate', 'ångstrom', 'ædile', 'ædile mother', 'ægyptus', 'ægyptus cryptic', 'æmilianus', 'æmilianus adds', 'æmilianus cornelius', 'æneid', 'æneid translation', 'ærial', 'ærial navigation', 'æronaut', 'æronaut appear', 'æronaut overhead', 'æronauts', 'æronauts mr', 'ærostation', 'æschylus', 'æschylus fifty', 'élite', 'élite city', 'οἶδα', 'οἶδα know', 'οἶδα οἶδα']


In [74]:
# per-class token counts accumulated by Naive Bayes during fit; because the
# vectorizer was created with binary=True, each entry is the number of training
# documents of that class that contain the token
nb_A1.feature_count_

array([[ 1.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [75]:
# rows represent classes (one per author), columns represent vocabulary tokens
nb_A1.feature_count_.shape

(3, 182939)

In [76]:
# per-token counts taken from the first class row, treated as EAP
# (assumes nb_A1.classes_ is in sorted label order -- check nb_A1.classes_)
EAP_token_count = nb_A1.feature_count_[0]

In [77]:
# per-token counts taken from the second class row, treated as HPL
# (assumes nb_A1.classes_ is in sorted label order -- check nb_A1.classes_)
HPL_token_count = nb_A1.feature_count_[1]

In [78]:
# per-token counts taken from the third class row, treated as MWS
# (assumes nb_A1.classes_ is in sorted label order -- check nb_A1.classes_)
MWS_token_count = nb_A1.feature_count_[2]

In [79]:
# DataFrame indexed by token, with one count column per author
tokens = pd.DataFrame(
    {'EAP': EAP_token_count, 'HPL': HPL_token_count, 'MWS': MWS_token_count},
    index=pd.Index(X_train_tokens, name='token'),
)
tokens.head()

Unnamed: 0_level_0,EAP,HPL,MWS
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaem,1.0,0.0,0.0
ab,1.0,0.0,0.0
ab te,1.0,0.0,0.0
aback,2.0,0.0,0.0
aback breeze,1.0,0.0,0.0


In [80]:
# look at 5 randomly chosen rows (fixed seed, so the same rows are shown each run)
tokens.sample(n=5, random_state=6)

Unnamed: 0_level_0,EAP,HPL,MWS
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
saying sleep,1.0,0.0,0.0
tortured can,0.0,0.0,1.0
felt witnessed,0.0,1.0,0.0
thus pampered,0.0,0.0,1.0
forget tale,0.0,0.0,1.0


In [81]:
# Naive Bayes counts the number of training observations (documents) per class,
# in the same class order as the rows of feature_count_
nb_A1.class_count_

array([ 5925.,  4226.,  4533.])

In [82]:
# add-one smoothing: add 1 to every author's count so that no token is left
# with a zero count (this avoids dividing by 0 if ratios between authors are
# computed later).
# The columns are rebuilt from the raw count arrays rather than from the
# current contents of `tokens`, so re-running this cell does not add 1 again.
tokens['EAP'] = EAP_token_count + 1
tokens['HPL'] = HPL_token_count + 1
tokens['MWS'] = MWS_token_count + 1
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,EAP,HPL,MWS
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
saying sleep,2.0,1.0,1.0
tortured can,1.0,1.0,2.0
felt witnessed,1.0,2.0,1.0
thus pampered,1.0,1.0,2.0
forget tale,1.0,1.0,2.0


In [83]:
# convert the add-one-smoothed counts into per-author frequencies by dividing
# by the number of training documents of each author.
# The values are computed from the raw count arrays (count + 1) instead of
# dividing the current `tokens` columns in place, so re-running this cell
# cannot divide the same column twice.
tokens['EAP'] = (EAP_token_count + 1) / nb_A1.class_count_[0]
tokens['HPL'] = (HPL_token_count + 1) / nb_A1.class_count_[1]
tokens['MWS'] = (MWS_token_count + 1) / nb_A1.class_count_[2]
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,EAP,HPL,MWS
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
saying sleep,0.000338,0.000237,0.000221
tortured can,0.000169,0.000237,0.000441
felt witnessed,0.000169,0.000473,0.000221
thus pampered,0.000169,0.000237,0.000441
forget tale,0.000169,0.000237,0.000441


In [85]:
# ten tokens with the highest EAP frequency, largest first
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.EAP.sort_values(ascending=False).iloc[:10]

token
upon       0.113586
one        0.072743
now        0.057046
will       0.047089
said       0.043882
little     0.033418
even       0.033080
say        0.032743
well       0.032574
however    0.032574
Name: EAP, dtype: float64