In [50]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics

### Verify model with personality

In [51]:
# Load the personality dataset used for the bag-of-words features.
df = pd.read_csv(
    '~/personality-normalized-word2vec-lema.csv',
    encoding='utf_8',
)

In [52]:
# Class balance of the binary target column.
df['extraversion_m'].value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [53]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_w2v_data, test_w2v_data = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

### Evaluation code

In [54]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues,
                          target_names=None):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix to display (rows are true labels, columns are
        predicted labels, matching the axis labels set below).
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colour map passed to ``plt.imshow``.
    target_names : sequence of str, optional
        Tick labels, one per class. Defaults to ``['no', 'yes']``, the
        two-class labelling this function previously hard-coded.
    """
    if target_names is None:
        target_names = ['no', 'yes']

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class instead of a fixed two.
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout only accounts for artists that already exist, so it has to
    # run after the axis labels are set or they may be clipped.
    plt.tight_layout()

In [55]:
def evaluate_prediction(predictions, target, title="Confusion matrix"):
    """Print accuracy and the raw confusion matrix, then plot the normalized one.

    ``predictions`` are the predicted labels and ``target`` the expected
    labels. The plotted matrix is divided row-wise by the number of samples
    of each expected class, and its title is ``title`` plus ' Normalized'.
    """
    score = accuracy_score(target, predictions)
    print('accuracy %s' % score)

    counts = confusion_matrix(target, predictions)
    print('confusion matrix\n %s' % counts)
    print('(row=expected, col=predicted)')

    # Column vector of per-row totals so the division broadcasts across each row.
    row_totals = counts.sum(axis=1).reshape(-1, 1)
    plot_confusion_matrix(counts.astype('float') / row_totals, title + ' Normalized')

In [56]:
def predict(vectorizer, classifier, data):
    """Vectorize ``data['formatted_text']``, classify it and report the results.

    The predictions are compared against ``data['extraversion_m']`` through
    ``evaluate_prediction``; nothing is returned.
    """
    features = vectorizer.transform(data['formatted_text'])
    evaluate_prediction(classifier.predict(features), data['extraversion_m'])

In [57]:
def merge_attributes(conj1, conj2):
    """Join two feature matrices column-wise, row by row.

    Parameters
    ----------
    conj1, conj2 : 2-D array-like
        Feature matrices with one row per sample. Row ``i`` of the result is
        row ``i`` of ``conj1`` followed by row ``i`` of ``conj2``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, conj1_columns + conj2_columns)``.

    Raises
    ------
    ValueError
        If either input is not 2-D, or if the inputs have different numbers
        of rows. The previous row-by-row loop silently ignored surplus rows
        of ``conj2`` in that situation.
    """
    left = np.asarray(conj1)
    right = np.asarray(conj2)

    if left.ndim != 2 or right.ndim != 2:
        raise ValueError(
            'merge_attributes expects two 2-D arrays, got %d-D and %d-D'
            % (left.ndim, right.ndim))
    if left.shape[0] != right.shape[0]:
        raise ValueError(
            'row count mismatch: %d vs %d' % (left.shape[0], right.shape[0]))

    # A single vectorized concatenation replaces the per-row Python loop and
    # also handles zero-row inputs.
    return np.concatenate((left, right), axis=1)

## Bag of words

In [58]:
# Replace missing texts with empty strings so the vectorizer only ever sees
# strings. Building new frames with assign/fillna (instead of writing through
# .loc into the frames returned by train_test_split) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
train_w2v_data = train_w2v_data.assign(
    formatted_text=train_w2v_data['formatted_text'].fillna(''))
test_w2v_data = test_w2v_data.assign(
    formatted_text=test_w2v_data['formatted_text'].fillna(''))

In [59]:
print("Creating the bag of words...\n")

# CountVectorizer (imported in the first cell) is scikit-learn's bag-of-words
# tool. All options are spelled out even though they only restate "no custom
# tokenizer/preprocessor, no stop-word removal, no vocabulary cap".
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=None)

# fit_transform() does two things: it learns the vocabulary from the training
# texts and then turns those texts into feature vectors. The input should be
# an iterable of strings.
train_data_features = vectorizer.fit_transform(train_w2v_data.formatted_text)

# Convert the sparse result to a dense numpy array, which is easier to combine
# with other numpy feature arrays later on.
train_data_features = train_data_features.toarray()
print("Done! \\o/")

Creating the bag of words...

Done! \o/


In [60]:
# (n_training_documents, vocabulary_size); call-form print works on Python 2 and 3.
print(train_data_features.shape)

(727, 19969)


In [61]:
# Bag of words for the test set as a dense numpy array. Only transform() is
# called here, so the vocabulary fitted on the training texts is reused.
test_data_features = vectorizer.transform(test_w2v_data.formatted_text).toarray()

In [62]:
# Second load of the same CSV; this copy supplies the numeric attribute columns.
personalities = pd.read_csv(
    '~/personality-normalized-word2vec-lema.csv',
    encoding='utf-8',
)

In [63]:
# DataFrame.convert_objects is deprecated (and absent from later pandas
# releases), so coerce the object-typed columns explicitly instead.
# Unparseable entries become NaN. A column whose conversion would be entirely
# NaN (e.g. free text) is left as-is, which is intended to mirror the old
# convert_numeric=True behaviour -- TODO confirm against the pandas version in use.
personalities = personalities.copy()
for _column in personalities.select_dtypes(include=['object']).columns:
    _converted = pd.to_numeric(personalities[_column], errors='coerce')
    if not _converted.isnull().all():
        personalities[_column] = _converted

  if __name__ == '__main__':


In [64]:
# Same test_size and random_state as the split of `df`, applied to the second
# load of the same CSV.
train_data, test_data = train_test_split(
    personalities,
    random_state=42,
    test_size=0.3,
)

In [65]:
# Names of the 64 attribute columns (X1..X64) selected from the personality
# frame; presumably LIWC category scores -- confirm against the CSV schema.
liwc_attr = [
    'X1funct', 'X2pronoun', 'X3ppron', 'X4i', 'X5we', 'X6you', 'X7shehe', 'X8they',
    'X9ipron', 'X10article', 'X11verb', 'X12auxverb', 'X13past', 'X14present',
    'X15future', 'X16adverb', 'X17preps', 'X18conj', 'X19negate', 'X20quant',
    'X21number', 'X22swear', 'X23social', 'X24family', 'X25friend', 'X26humans',
    'X27affect', 'X28posemo', 'X29negemo', 'X30anx', 'X31anger', 'X32sad',
    'X33cogmech', 'X34insight', 'X35cause', 'X36discrep', 'X37tentat', 'X38certain',
    'X39inhib', 'X40incl', 'X41excl', 'X42percept', 'X43see', 'X44hear', 'X45feel',
    'X46bio', 'X47body', 'X48health', 'X49sexual', 'X50ingest', 'X51relativ',
    'X52motion', 'X53space', 'X54time', 'X55work', 'X56achieve', 'X57leisure',
    'X58home', 'X59money', 'X60relig', 'X61death', 'X62assent', 'X63nonfl',
    'X64filler',
]

In [66]:
def remove_nan(atributes):
    """Return the selected columns of the train and test frames with NaN set to 0.

    ``atributes`` is the column selection applied to the module-level
    ``train_data`` and ``test_data`` frames. The result is a
    ``(train_array, test_array)`` tuple of numpy arrays.
    """
    def _zero_filled(frame):
        # np.array() copies, so the source frame is not modified.
        values = np.array(frame[atributes])
        values[np.isnan(values)] = 0
        return values

    return (_zero_filled(train_data), _zero_filled(test_data))

In [67]:
# NaN-free numpy arrays of the liwc_attr columns for the train and test splits.
train, test = remove_nan(atributes=liwc_attr)

In [68]:
# Each merged row is the attribute values followed by the bag-of-words counts.
train_merged = merge_attributes(conj1=train, conj2=train_data_features)
test_merged = merge_attributes(conj1=test, conj2=test_data_features)

In [69]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

# Not referenced in the cells shown here.
force = False

# Every hyper-parameter lists a single candidate, so with n_iter=1 the search
# evaluates exactly one random-forest configuration under cross-validation.
rf_param_space = dict(
    criterion=["gini"],
    n_estimators=[1000],
    max_features=["log2"],
    max_depth=[None],
    bootstrap=[True],
    oob_score=[True],
    class_weight=["balanced"],
    random_state=[42],
)

model_trainer = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_param_space,
    n_iter=1,
    scoring="f1",
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=True,
)

In [70]:
%%time
# Run the cross-validated search on the merged training features against the
# extraversion target (the cell magic above must stay on the first line).
model_trainer.fit(train_merged, train_data['extraversion_m'])
# The search was built with refit=True, so best_estimator_ is available after fit().
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


CPU times: user 13.5 s, sys: 213 ms, total: 13.8 s
Wall time: 1min 32s


In [71]:
# Expected labels of the held-out rows ...
yt = test_data['extraversion_m']
# ... and the labels the fitted model predicts for them.
yp = model.predict(test_merged)

In [72]:
# Confusion matrix as a labelled table: rows are true classes, columns predicted.
class_labels = [0, 1]
confusion_counts = skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
pd.DataFrame(
    confusion_counts,
    index=pd.Index(class_labels, name="y_true"),
    columns=pd.Index(class_labels, name="y_pred"),
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,49
1,77,93


In [73]:
# Per-class precision / recall / F1 on the held-out set; call-form print
# works on Python 2 and 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

             precision    recall  f1-score   support

          0       0.55      0.65      0.60       142
          1       0.65      0.55      0.60       170

avg / total       0.61      0.60      0.60       312



In [74]:
# Overall fraction of held-out rows classified correctly.
accuracy_score(y_true=yt, y_pred=yp)

0.59615384615384615