In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics

2017-06-21 11:34:09,903 : INFO : 'pattern' package found; tag filters are available for English


In [2]:
model = gensim.models.Word2Vec.load('/home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model')
model.init_sims(replace=True)

2017-06-21 11:34:10,492 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model
2017-06-21 11:34:25,043 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model.wv.* with mmap=None
2017-06-21 11:34:25,043 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model.wv.syn0.npy with mmap=None
2017-06-21 11:34:28,138 : INFO : setting ignored attribute syn0norm to None
2017-06-21 11:34:28,139 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model.syn1neg.npy with mmap=None
2017-06-21 11:34:45,084 : INFO : setting ignored attribute cum_table to None
2017-06-21 11:34:45,114 : INFO : loaded /home/bahbbc/workspace/masters-big5/models/tweet150-100-skip.model
2017-06-21 11:34:55,414 : INFO : precomputing L2-norms of word weight vectors


In [3]:
num_features = model.wv.syn0.shape[1]

In [4]:
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
df.shape

(1039, 186)

In [5]:
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

In [6]:
def predict(vectorizer, classifier, data):
    data_features = vectorizer.transform(data['formatted_text'])
    predictions = classifier.predict(data_features)
    target = data['extraversion_m']
    evaluate_prediction(predictions, target)

In [7]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if (word in index2word_set): 
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    # 
    # Divide the result by the number of words to get the average
    if nwords < 0:
        print 'sorry, empty...'
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        #Print a status message every 1000th review
        if counter%100. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        #Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, model.wv.syn0.shape[1])
        #Increment the counter
        counter = counter + 1
    return reviewFeatureVecs

In [8]:
def w2v_tokenize_text(text):
    tokens = []
    if text is np.nan:
        return []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            word = word.lower()
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [9]:
test_tokenized = test_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values
train_tokenized = train_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values

In [10]:
trainDataVecs = getAvgFeatureVecs( train_tokenized, model, num_features )
testDataVecs = getAvgFeatureVecs( test_tokenized, model, num_features )

Review 0 of 727
Review 100 of 727
Review 200 of 727
Review 300 of 727
Review 400 of 727
Review 500 of 727
Review 600 of 727
Review 700 of 727
Review 0 of 312
Review 100 of 312
Review 200 of 312
Review 300 of 312


In [11]:
del model

In [112]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

force = False
model_trainer = RandomizedSearchCV(
    n_iter=1, 
    estimator=RandomForestClassifier(),
    param_distributions={
        "criterion": ["gini"],
        "n_estimators": [1000],
        "max_features": ["log2"],
        "max_depth": [None],
        "bootstrap": [True],
        "oob_score": [True],
        "class_weight": ["balanced"],
        "random_state": [42]
    },
    scoring="f1",
    verbose=True,
    refit=True,
    cv=10,
    n_jobs=-1
)

## Extraversion evaluation

In [65]:
df.extraversion_m.value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [113]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['extraversion_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   20.6s finished


CPU times: user 4.15 s, sys: 135 ms, total: 4.28 s
Wall time: 25 s


In [114]:
yp = model.predict(testDataVecs)
yt = test_w2v_data['extraversion_m']

In [115]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,94,48
1,78,92


In [116]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.55      0.66      0.60       142
          1       0.66      0.54      0.59       170

avg / total       0.61      0.60      0.60       312



In [118]:
accuracy_score(yt, yp)

0.59615384615384615

In [119]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [120]:
ytp = model.predict(trainDataVecs)
ytt = train_w2v_data['extraversion_m']

In [121]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       387
          1       1.00      1.00      1.00       340

avg / total       1.00      1.00      1.00       727



In [122]:
print accuracy_score(ytt, ytp)

0.998624484182


## Agreableness evaluation

In [123]:
df.agreeabeness_m.value_counts()

1    542
0    497
Name: agreeabeness_m, dtype: int64

In [124]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['agreeabeness_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   23.0s finished


CPU times: user 4.45 s, sys: 113 ms, total: 4.56 s
Wall time: 27.6 s


In [125]:
yp = model.predict(testDataVecs)
yt = test_w2v_data['agreeabeness_m']

In [126]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,74,67
1,72,99


In [127]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.51      0.52      0.52       141
          1       0.60      0.58      0.59       171

avg / total       0.56      0.55      0.56       312



In [128]:
accuracy_score(yt, yp)

0.55448717948717952

In [129]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [130]:
ytp = model.predict(trainDataVecs)
ytt = train_w2v_data['agreeabeness_m']

In [131]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       356
          1       1.00      1.00      1.00       371

avg / total       1.00      1.00      1.00       727



In [132]:
print accuracy_score(ytt, ytp)

1.0


## Conscientiousness evaluation

In [133]:
df.conscientiousness_m.value_counts()

0    523
1    516
Name: conscientiousness_m, dtype: int64

In [134]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['conscientiousness_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   24.2s finished


CPU times: user 4.56 s, sys: 122 ms, total: 4.68 s
Wall time: 28.8 s


In [135]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,74,67
1,72,99


In [136]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.51      0.52      0.52       141
          1       0.60      0.58      0.59       171

avg / total       0.56      0.55      0.56       312



In [137]:
accuracy_score(yt, yp)

0.55448717948717952

In [138]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [139]:
ytp = model.predict(trainDataVecs)
ytt = train_w2v_data['conscientiousness_m']

In [140]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       356
          1       0.99      1.00      1.00       371

avg / total       1.00      1.00      1.00       727



In [141]:
print accuracy_score(ytt, ytp)

0.997248968363


## Neuroticism evation

In [142]:
df.neuroticism_m.value_counts()

0    535
1    504
Name: neuroticism_m, dtype: int64

In [143]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['neuroticism_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.8s finished


CPU times: user 4.37 s, sys: 104 ms, total: 4.47 s
Wall time: 26.3 s


In [144]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,74,67
1,72,99


In [145]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.51      0.52      0.52       141
          1       0.60      0.58      0.59       171

avg / total       0.56      0.55      0.56       312



In [146]:
accuracy_score(yt, yp)

0.55448717948717952

In [147]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [148]:
ytp = model.predict(trainDataVecs)
ytt = train_w2v_data['neuroticism_m']

In [149]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       372
          1       0.99      1.00      1.00       355

avg / total       1.00      1.00      1.00       727



In [150]:
print accuracy_score(ytt, ytp)

0.997248968363


## Openess evaluation

In [151]:
df.openness_m.value_counts()

1    543
0    496
Name: openness_m, dtype: int64

In [152]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['openness_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   25.7s finished


CPU times: user 5.07 s, sys: 105 ms, total: 5.17 s
Wall time: 31 s


In [153]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,74,67
1,72,99


In [154]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.51      0.52      0.52       141
          1       0.60      0.58      0.59       171

avg / total       0.56      0.55      0.56       312



In [155]:
accuracy_score(yt, yp)

0.55448717948717952

In [156]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [157]:
ytp = model.predict(trainDataVecs)
ytt = train_w2v_data['openness_m']

In [158]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00       360
          1       1.00      0.99      1.00       367

avg / total       1.00      1.00      1.00       727



In [159]:
print accuracy_score(ytt, ytp)

0.995873452545
