In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import statsmodels.api as sm
from sklearn import metrics as skmetrics

2017-07-07 23:39:23,751 : INFO : 'pattern' package found; tag filters are available for English


### Verify model with personality

In [2]:
# Load the personality table from the user's home directory as UTF-8 text.
df = pd.read_csv(
    '~/personality-normalized-word2vec-norm.csv',
    encoding='utf-8',
)

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1039, 186)

In [4]:
# Class balance of the `extraversion_m` column.
df['extraversion_m'].value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train_w2v_data, test_w2v_data = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

### Evaluation code

In [6]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, target_names=None):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to display; rows are labelled as true classes and columns as
        predicted classes.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    target_names : sequence of str, optional
        Tick labels, one per class, in matrix order. Defaults to
        ``['no', 'yes']`` (the original binary labelling), so existing callers
        are unaffected. Pass explicit names when plotting a multi-class matrix.
    """
    if target_names is None:
        target_names = ['no', 'yes']

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class name instead of a hard-coded two.
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [7]:
def evaluate_prediction(predictions, target, title="Confusion matrix"):
    """Print accuracy and the raw confusion matrix, then plot the row-normalised matrix.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    target : array-like
        Expected (true) labels, aligned with ``predictions``.
    title : str
        Base title for the plot; ``' Normalized'`` is appended.
    """
    print('accuracy %s' % accuracy_score(target, predictions))
    cm = confusion_matrix(target, predictions)
    print('confusion matrix\n %s' % cm)
    print('(row=expected, col=predicted)')

    row_totals = cm.sum(axis=1)[:, np.newaxis]
    # A label that appears only in `predictions` produces an all-zero row.
    # Dividing by 1 there keeps the row at 0 instead of emitting NaN and a
    # divide-by-zero RuntimeWarning.
    row_totals[row_totals == 0] = 1
    cm_normalized = cm.astype('float') / row_totals
    plot_confusion_matrix(cm_normalized, title + ' Normalized')

In [8]:
def predict(vectorizer, classifier, data):
    """Vectorise the text in ``data``, classify it, and report the evaluation.

    Parameters
    ----------
    vectorizer : object with a ``transform`` method
        Applied to ``data['formatted_text']`` to build the feature matrix.
    classifier : object with a ``predict`` method
        Fitted model that receives the transformed features.
    data : mapping of columns
        Must provide ``'formatted_text'`` and ``'extraversion'``; presumably a
        pandas DataFrame such as the train/test splits -- confirm against callers.
    """
    data_features = vectorizer.transform(data['formatted_text'])
    predictions = classifier.predict(data_features)
    # Cast element-wise: int() on a whole Series raises TypeError as soon as it
    # holds more than one row.
    target = data['extraversion'].astype(int)
    evaluate_prediction(predictions, target)

## Doc2Vec

In [9]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [10]:
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is sentence-tokenised and then word-tokenised with NLTK; tokens
    shorter than two characters are discarded.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and any float NaN count as missing.

    Returns
    -------
    list of str
        Lower-cased tokens in document order; empty for missing input.
    """
    # Detect missing values by value, not identity: `text is np.nan` is False
    # for float('nan') or np.float64('nan'), which would then be handed to NLTK.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []
    return [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if len(word) >= 2
    ]

In [11]:
# Wrap every training row as a TaggedDocument: tokens from the text column,
# tagged with the row's integer extraversion value.
train_tagged = train_w2v_data.apply(
    lambda row: TaggedDocument(
        words=tokenize_text(row['formatted_text']),
        tags=[int(row['extraversion'])],
    ),
    axis=1,
)

In [12]:
# Wrap every held-out row as a TaggedDocument: tokens from the text column,
# tagged with the row's integer extraversion value.
test_tagged = test_w2v_data.apply(
    lambda row: TaggedDocument(
        words=tokenize_text(row['formatted_text']),
        tags=[int(row['extraversion'])],
    ),
    axis=1,
)

In [13]:
%%time
# Plain arrays of TaggedDocument objects for gensim.
trainsent = train_tagged.values
testsent = test_tagged.values

# Train a 5-dimensional Doc2Vec model (dm=1) on the training documents only.
doc2vec_model = Doc2Vec(trainsent, workers=1, size=5, iter=20, dm=1)

# Pair each training document's first tag with a freshly inferred vector,
# then unzip the pairs into parallel tuples of targets and feature vectors.
train_targets, train_regressors = zip(*(
    (tagged_doc.tags[0], doc2vec_model.infer_vector(tagged_doc.words, steps=20))
    for tagged_doc in trainsent
))

2017-07-07 23:40:09,457 : INFO : collecting all words and their counts
2017-07-07 23:40:09,458 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-07-07 23:40:09,805 : INFO : collected 43446 word types and 6 unique tags from a corpus of 727 examples and 1472087 words
2017-07-07 23:40:09,806 : INFO : Loading a fresh vocabulary
2017-07-07 23:40:10,029 : INFO : min_count=5 retains 13634 unique words (31% of original 43446, drops 29812)
2017-07-07 23:40:10,030 : INFO : min_count=5 leaves 1421890 word corpus (96% of original 1472087, drops 50197)
2017-07-07 23:40:10,065 : INFO : deleting the raw counts dictionary of 43446 items
2017-07-07 23:40:10,067 : INFO : sample=0.001 downsamples 46 most-common words
2017-07-07 23:40:10,068 : INFO : downsampling leaves estimated 1039507 word corpus (73.1% of prior 1421890)
2017-07-07 23:40:10,069 : INFO : estimated required memory for 13634 words and 5 dimensions: 7362480 bytes
2017-07-07 23:40:10,113 : INFO : resetting

CPU times: user 55.3 s, sys: 188 ms, total: 55.5 s
Wall time: 55.5 s


In [14]:
# Give the model a fixed seed and a fresh RandomState before the held-out
# vectors are inferred.
seed = 1422
doc2vec_model.seed = seed
doc2vec_model.random = random.RandomState(seed)

# Pair each test document's first tag with its inferred vector, then unzip
# the pairs into parallel tuples of targets and feature vectors.
test_targets, test_regressors = zip(*(
    (tagged_doc.tags[0], doc2vec_model.infer_vector(tagged_doc.words, steps=20))
    for tagged_doc in testsent
))

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

# NOTE(review): `force` is not read by any cell visible here.
force = False

# Every hyper-parameter list holds exactly one value, so the single sampled
# candidate (n_iter=1) is always the same configuration; the search is used
# for its 10-fold cross-validation and automatic refit.
model_trainer = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=dict(
        criterion=["gini"],
        n_estimators=[1000],
        max_features=["log2"],
        max_depth=[None],
        bootstrap=[True],
        oob_score=[True],
        class_weight=["balanced"],
        random_state=[42],
    ),
    n_iter=1,
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=True,
)

In [18]:
%%time
# Run the cross-validated search and keep the refitted best estimator
# (fit() returns the search object itself, so the attribute can be chained).
model = model_trainer.fit(train_regressors, train_targets).best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   18.5s finished


CPU times: user 3.62 s, sys: 102 ms, total: 3.72 s
Wall time: 21.9 s


In [19]:
# Ground-truth tags and model predictions for the held-out documents.
yt = test_targets
yp = model.predict(test_regressors)

In [21]:
# Derive the class labels from the data instead of hard-coding [1..5]: if a
# different split puts another label into yt or yp, the matrix grows and a
# fixed 5x5 index/columns pair would raise a shape-mismatch error.
cm_labels = np.union1d(yt, yp)
pd.DataFrame(
    index=pd.Index(cm_labels, name="y_true"),
    columns=pd.Index(cm_labels, name="y_pred"),
    # Passing `labels` pins the row/column order to the index built above.
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp, labels=cm_labels)
)

y_pred,1,2,3,4,5
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,16,7,1,0
2,3,36,38,8,0
3,5,47,66,13,0
4,1,24,37,7,0
5,0,1,1,0,0


In [22]:
# Use the call form of print, as the evaluation helpers in this notebook do;
# it behaves the same on Python 2 and also parses on Python 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

             precision    recall  f1-score   support

          1       0.10      0.04      0.06        25
          2       0.29      0.42      0.34        85
          3       0.44      0.50      0.47       131
          4       0.24      0.10      0.14        69
          5       0.00      0.00      0.00         2

avg / total       0.33      0.35      0.33       312



  'precision', 'predicted', average, warn_for)


In [None]:
# Overall fraction of held-out documents whose predicted tag equals the true tag.
skmetrics.accuracy_score(y_true=yt, y_pred=yp)