In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import statsmodels.api as sm
from sklearn import metrics as skmetrics

2017-07-07 23:57:40,464 : INFO : 'pattern' package found; tag filters are available for English


### Verify the model on the personality dataset

In [2]:
# Load the personality dataset from a UTF-8 encoded CSV file.
# NOTE(review): the '~/' location is machine-specific; consider a configurable data directory.
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')

In [3]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
df.shape

(1039, 186)

In [4]:
# Count how many rows fall under each distinct value of the `extraversion_m` column.
df.extraversion_m.value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [5]:
# Hold out 30% of the rows as a test set; the fixed random_state pins the shuffle
# so the same split is produced on every run.
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

### Evaluation code

In [6]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, target_names=None):
    """Draw a confusion matrix as a colour-mapped image on the current pyplot figure.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix handed to ``plt.imshow``. The y axis is labelled "True label"
        and the x axis "Predicted label".
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colour map used for the cells.
    target_names : sequence of str, optional
        Tick labels, one per class, in matrix order. Defaults to
        ``['no', 'yes']`` (the previous hard-coded binary labels).

    Returns
    -------
    None. The figure is modified through the pyplot state machine.
    """
    if target_names is None:
        target_names = ['no', 'yes']
    # One tick per class label instead of a fixed count of two.
    tick_marks = np.arange(len(target_names))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so that the axis labels set above are taken into account
    # when the layout is computed.
    plt.tight_layout()

In [7]:
def evaluate_prediction(predictions, target, title="Confusion matrix"):
    """Print accuracy and the raw confusion matrix, then plot the row-normalised matrix.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    target : array-like
        Expected (true) labels, aligned with ``predictions``.
    title : str
        Base title for the plot; ``' Normalized'`` is appended.

    Returns
    -------
    None. Output is printed and drawn via ``plot_confusion_matrix``.
    """
    print('accuracy %s' % accuracy_score(target, predictions))
    cm = confusion_matrix(target, predictions)
    print('confusion matrix\n %s' % cm)
    print('(row=expected, col=predicted)')

    # Normalise each row by its total. A row sums to zero when its label never
    # occurs in `target`; leave such rows at 0.0 rather than dividing by zero
    # (which would produce NaN cells and a RuntimeWarning).
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype='float'),
        where=row_totals != 0)
    plot_confusion_matrix(cm_normalized, title + ' Normalized')

In [8]:
def predict(vectorizer, classifier, data):
    """Vectorise the text of ``data``, classify it, and report the results.

    Parameters
    ----------
    vectorizer : object with ``transform``
        Applied to ``data['formatted_text']`` to obtain the feature matrix.
    classifier : object with ``predict``
        Called on the transformed features.
    data : mapping / DataFrame
        Must provide the ``'formatted_text'`` and ``'extraversion'`` columns.

    Returns
    -------
    None. Results are printed and plotted by ``evaluate_prediction``.
    """
    data_features = vectorizer.transform(data['formatted_text'])
    predictions = classifier.predict(data_features)
    # Cast element-wise: calling int() on a whole multi-row column raises
    # TypeError instead of producing one integer label per row.
    # NOTE(review): confirm 'extraversion' (not 'extraversion_m') is the
    # intended label column for this classification report.
    target = np.asarray(data['extraversion']).astype(int)
    evaluate_prediction(predictions, target)

## Doc2Vec

In [9]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [10]:
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens, dropping one-character tokens.

    The text is first split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str or missing value
        Raw document. ``None`` and any floating-point NaN are treated as
        "no text".

    Returns
    -------
    list of str
        Tokens in document order; an empty list for missing input.
    """
    # Detect missing values by value, not identity: `text is np.nan` only
    # matches the np.nan singleton and misses float('nan') / np.float64 NaNs.
    if text is None:
        return []
    if isinstance(text, (float, np.floating)) and np.isnan(text):
        return []

    return [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if len(word) >= 2
    ]

In [11]:
# Turn each training row into a TaggedDocument: the tokenised text becomes the
# words and the row's `extraversion` value becomes its single tag.
train_tagged = train_w2v_data.apply(
    lambda row: TaggedDocument(
        words=tokenize_text(row['formatted_text']),
        tags=[row['extraversion']]),
    axis=1)

In [12]:
# Turn each held-out row into a TaggedDocument: the tokenised text becomes the
# words and the row's `extraversion` value becomes its single tag.
test_tagged = test_w2v_data.apply(
    lambda row: TaggedDocument(
        words=tokenize_text(row['formatted_text']),
        tags=[row['extraversion']]),
    axis=1)

In [13]:
%%time
# Pull the TaggedDocument objects out of the pandas containers as plain arrays.
trainsent = train_tagged.values
testsent = test_tagged.values

# Train a Doc2Vec model on the training documents only:
# 5-dimensional vectors (size=5), 20 training passes (iter=20), dm=1, one worker.
# NOTE(review): `size` / `iter` (and `steps` below) are the keyword names of the
# gensim release used here; newer releases may spell them differently - confirm.
doc2vec_model = Doc2Vec(trainsent, workers=1, size=5, iter=20, dm=1)

# For every training document collect (target, feature vector): the target is the
# document's first tag and the features are a vector inferred from its words.
# zip(*...) then splits the pairs into two parallel sequences.
train_targets, train_regressors = zip(
    *[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in trainsent])

2017-07-07 23:58:25,414 : INFO : collecting all words and their counts
2017-07-07 23:58:25,415 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-07-07 23:58:25,813 : INFO : collected 43446 word types and 31 unique tags from a corpus of 727 examples and 1472087 words
2017-07-07 23:58:25,814 : INFO : Loading a fresh vocabulary
2017-07-07 23:58:26,046 : INFO : min_count=5 retains 13634 unique words (31% of original 43446, drops 29812)
2017-07-07 23:58:26,046 : INFO : min_count=5 leaves 1421890 word corpus (96% of original 1472087, drops 50197)
2017-07-07 23:58:26,095 : INFO : deleting the raw counts dictionary of 43446 items
2017-07-07 23:58:26,097 : INFO : sample=0.001 downsamples 46 most-common words
2017-07-07 23:58:26,099 : INFO : downsampling leaves estimated 1039507 word corpus (73.1% of prior 1421890)
2017-07-07 23:58:26,100 : INFO : estimated required memory for 13634 words and 5 dimensions: 7369180 bytes
2017-07-07 23:58:26,156 : INFO : resettin

CPU times: user 59.9 s, sys: 236 ms, total: 1min
Wall time: 1min


In [14]:
seed = 1422

# Reset the model's seed and replace its random state with a freshly seeded
# numpy RandomState before inferring the test vectors.
# NOTE(review): presumably this makes infer_vector repeatable across re-runs -
# confirm that the installed gensim reads `model.random` during inference.
doc2vec_model.seed = seed
doc2vec_model.random = random.RandomState(seed)


# Same extraction as for the training set: (first tag, inferred vector) per
# held-out document, split into parallel target / feature sequences.
test_targets, test_regressors = zip(
    *[(doc.tags[0], doc2vec_model.infer_vector(doc.words, steps=20)) for doc in testsent])

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.externals import joblib

# NOTE(review): `force` is not referenced anywhere in the visible cells - confirm
# whether it is still needed.
force = False
# Cross-validated search wrapper around a decision-tree regressor. With n_iter=1
# and a single candidate value per parameter, this amounts to a 10-fold
# cross-validation of one fixed configuration followed by a refit (refit=True).
model_trainer = RandomizedSearchCV(
    n_iter=1, 
    estimator=DecisionTreeRegressor(),
    param_distributions={
        "max_features": ["log2"],
        "random_state": [42],  # fixed so the fitted tree is repeatable
        "criterion":['mse']
    },
    verbose=True,
    refit=True,
    cv=10,
    n_jobs=-1
)

In [16]:
%%time
# Run the cross-validated search on the inferred training vectors and their
# targets, then keep the best estimator found by the search.
model_trainer.fit(train_regressors, train_targets)
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.2s finished


CPU times: user 316 ms, sys: 59.6 ms, total: 375 ms
Wall time: 481 ms


In [17]:
# yp: model predictions for the held-out document vectors.
# yt: the matching true targets collected alongside those vectors.
yp = model.predict(test_regressors)
yt = test_targets

In [18]:
# Mean squared error of the predictions on the held-out set.
skmetrics.mean_squared_error(yt, yp)

1.2166769764957264

In [22]:
# Root mean squared error: the square root of the MSE, in the target's own units.
skmetrics.mean_squared_error(yt, yp)**0.5

1.1030308139375464

In [19]:
# Coefficient of determination (R^2) on the held-out set. A negative value means
# the predictions fit worse than always predicting the mean of the targets.
skmetrics.r2_score(yt, yp)

-0.69296344231725038