In [144]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics

In [145]:
# Load the pre-trained tweet Word2Vec model from disk. `Word2Vec` is the class
# imported from gensim.models in the import cell.
model = Word2Vec.load('/home/bahbbc/workspace/masters-big5/models/tweet50.model')
# Precompute the L2-normalised word vectors (populates syn0norm, which
# word_averaging reads); replace=True asks gensim to keep only the
# normalised copy.
model.init_sims(replace=True)

2017-05-02 22:31:12,252 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/models/tweet50.model
2017-05-02 22:31:35,328 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/models/tweet50.model.wv.* with mmap=None
2017-05-02 22:31:35,330 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/models/tweet50.model.wv.syn0.npy with mmap=None
2017-05-02 22:31:39,318 : INFO : setting ignored attribute syn0norm to None
2017-05-02 22:31:39,318 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/models/tweet50.model.syn1neg.npy with mmap=None
2017-05-02 22:31:43,498 : INFO : setting ignored attribute cum_table to None
2017-05-02 22:31:43,499 : INFO : loaded /home/bahbbc/workspace/masters-big5/models/tweet50.model
2017-05-02 22:31:47,141 : INFO : precomputing L2-norms of word weight vectors


In [146]:
# Embedding sanity check: words closest to the combination of 'rss' and 'haha'.
print(model.most_similar(positive=['rss', 'haha']))

[(u'hehe', 0.9136868715286255), (u'hahah', 0.884761393070221), (u'hahaha', 0.8731921315193176), (u'rs', 0.8724417686462402), (u'hehehe', 0.8691849708557129), (u'rsrs', 0.8621789216995239), (u'kk', 0.8570213317871094), (u'kkk', 0.8407293558120728), (u'rsrsrs', 0.8283201456069946), (u'kkkk', 0.8231138586997986)]


In [147]:
# Embedding sanity check: words closest to the combination of the ':D' and ':/' emoticons.
print(model.most_similar(positive=[':D', ':/']))

[(u':(', 0.8234219551086426), (u'=/', 0.8158673048019409), (u':))', 0.8133627772331238), (u':p', 0.8122121095657349), (u'^^', 0.8028496503829956), (u'://', 0.8021093010902405), (u':P', 0.8014066219329834), (u';/', 0.7999032139778137), (u';D', 0.7966423034667969), (u':s', 0.7966138124465942)]


In [148]:
# Embedding sanity check: words closest to the combination of 'fofo' and 'cute'.
print(model.most_similar(positive=['fofo', 'cute']))

[(u'fofinho', 0.7709806561470032), (u'meigo', 0.7032607793807983), (u'lindinho', 0.6972194314002991), (u'fofo,', 0.6967242360115051), (u'fofo!', 0.6729739904403687), (u'kawaii', 0.6723635792732239), (u'fofis', 0.6504534482955933), (u'fofooo', 0.6468209028244019), (u'bonitinho', 0.646500825881958), (u'fofinho,', 0.643779456615448)]


In [149]:
# Embedding dimensionality assumed by this notebook.
# NOTE(review): confirm this equals the vector size of the loaded Word2Vec
# model (the model file is named "tweet50").
num_features = 300

In [150]:
# Load the dataset; the cells below read its 'formatted_text' column and the
# '*_m' trait label columns.
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
# Displayed as the cell output: (rows, columns).
df.shape

(1039, 186)

In [152]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

In [154]:
def word_averaging(wv, words):
    """Return the unit-length mean of the word vectors found for ``words``.

    Parameters
    ----------
    wv : gensim Word2Vec model
        Must expose ``wv.wv.vocab`` and a populated ``wv.wv.syn0norm``
        (i.e. ``init_sims`` has already been called on it).
    words : iterable
        Items that are ``np.ndarray`` are used directly as vectors; any other
        item is looked up in the vocabulary and skipped when absent.

    Returns
    -------
    np.ndarray
        1-D float32 vector. When no item yields a vector, a warning is logged
        and a zero vector with the same width as the embedding matrix is
        returned.
    """
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.wv.vocab:
            vectors.append(wv.wv.syn0norm[wv.wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Size (and type) the fallback from the embedding matrix itself rather
        # than from a notebook-level constant, so the row always lines up with
        # the real vectors when the caller stacks them.
        return np.zeros(wv.wv.syn0norm.shape[1], dtype=np.float32)

    mean = gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return mean

def word_averaging_list(wv, text_list):
    """Average every document in ``text_list`` and stack the results row-wise.

    Each element of ``text_list`` is passed to ``word_averaging`` together
    with ``wv``; the returned vectors become the rows of one 2-D array.
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

In [155]:
def w2v_tokenize_text(text):
    """Split Portuguese ``text`` into word tokens, dropping 1-character tokens.

    Parameters
    ----------
    text : str, None or float NaN
        Raw document. ``None`` and any float NaN are treated as "no text".

    Returns
    -------
    list
        Tokens of length >= 2, in document order; ``[]`` for missing text.
    """
    # An identity test against np.nan only matches that one singleton object;
    # float('nan'), np.float64 NaN and None would fall through to nltk and
    # raise. Test for "missing" by value instead.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []
    return [
        word
        for sent in nltk.sent_tokenize(text, language='portuguese')
        for word in nltk.word_tokenize(sent, language='portuguese')
        if len(word) >= 2
    ]

In [156]:
# Tokenise the 'formatted_text' column of each split. Applying the tokenizer
# to the column itself (Series.apply) avoids building a full row object per
# record just to read one field; the result is still an object array with one
# token list per row.
test_tokenized = test_w2v_data['formatted_text'].apply(w2v_tokenize_text).values
train_tokenized = train_w2v_data['formatted_text'].apply(w2v_tokenize_text).values

In [157]:
%%time
# Turn every tokenised document into a single averaged word vector, giving one
# feature row per document for the train and test splits.
X_train_word_average = word_averaging_list(model,train_tokenized)
X_test_word_average = word_averaging_list(model,test_tokenized)



CPU times: user 4.36 s, sys: 21.9 ms, total: 4.39 s
Wall time: 4.5 s


In [158]:
# Drop the reference to the Word2Vec model now that the feature matrices are
# built. The name `model` is reused further down for the fitted classifier.
del model

In [159]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

# NOTE(review): `force` is not read by any cell visible in this notebook.
force = False

# Every hyper-parameter list holds exactly one value and n_iter is 1, so the
# "search" evaluates a single random-forest configuration with 10-fold
# cross-validation (scored by F1) and then refits it on the data passed to
# fit(). oob_score=True is paired with bootstrap=True, which it requires.
model_trainer = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=dict(
        n_estimators=[1000],
        criterion=["gini"],
        max_features=["log2"],
        max_depth=[None],
        bootstrap=[True],
        oob_score=[True],
        class_weight=["balanced"],
        random_state=[42]
    ),
    n_iter=1,
    cv=10,
    scoring="f1",
    refit=True,
    n_jobs=-1,
    verbose=True
)

## Extraversion evaluation

In [151]:
# Class balance of the extraversion label over the whole dataset.
df['extraversion_m'].value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [160]:
%%time
# Cross-validate and refit the random forest on the extraversion labels, then
# keep the refitted estimator. From here on `model` is the classifier, not the
# Word2Vec model deleted earlier.
model_trainer.fit(X_train_word_average, train_w2v_data['extraversion_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   25.9s finished


CPU times: user 5.06 s, sys: 109 ms, total: 5.17 s
Wall time: 31.3 s


In [161]:
# Persist the fitted extraversion classifier to disk.
# NOTE(review): the destination is an absolute, user-specific path.
joblib.dump(model, '/home/bahbbc/workspace/masters-big5/extraversion-best-random-tree-model1.pkl')

['/home/bahbbc/workspace/masters-big5/extraversion-best-random-tree-model1.pkl']

In [162]:
# yp: predictions on the held-out test features; yt: the matching true labels.
yp = model.predict(X_test_word_average)
yt = test_w2v_data['extraversion_m']

In [163]:
# Test-set confusion matrix as a labelled table:
# rows are the true class, columns the predicted class.
pd.DataFrame(
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp),
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred")
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,98,44
1,79,91


In [164]:
# Per-class precision / recall / F1 on the test split. Written as a print()
# call (same output on Python 2, valid on Python 3) to match the earlier cells.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

             precision    recall  f1-score   support

          0       0.55      0.69      0.61       142
          1       0.67      0.54      0.60       170

avg / total       0.62      0.61      0.60       312



In [165]:
# Overall test-set accuracy for extraversion.
accuracy_score(y_true=yt, y_pred=yp)

0.60576923076923073

In [166]:
# Visual separator before the training-set metrics; print() call form works on
# both Python 2 and Python 3.
print('--------- TRAIN -----------------')

--------- TRAIN -----------------


In [167]:
# Same evaluation on the training split (the data the model was fitted on).
# ytp: predictions; ytt: true labels.
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['extraversion_m']

In [168]:
# Per-class precision / recall / F1 on the training split; print() call form
# works on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=ytt, y_pred=ytp))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       387
          1       1.00      1.00      1.00       340

avg / total       1.00      1.00      1.00       727



In [169]:
# Training-set accuracy; print() call form works on both Python 2 and Python 3.
print(accuracy_score(ytt, ytp))

0.998624484182


## Agreeableness evaluation

In [None]:
%%time
# Cross-validate and refit the random forest on the agreeableness labels and
# keep the refitted estimator (replaces the previous trait's `model`).
# NOTE(review): the column key is spelled 'agreeabeness_m' (no "l"); confirm it
# matches the CSV header before running.
model_trainer.fit(X_train_word_average, train_w2v_data['agreeabeness_m'])
model = model_trainer.best_estimator_

In [None]:
# yp: predictions on the held-out test features; yt: the matching true labels.
# NOTE(review): column key spelled 'agreeabeness_m' — confirm against the CSV header.
yp = model.predict(X_test_word_average)
yt = test_w2v_data['agreeabeness_m']

In [None]:
# Test-set confusion matrix as a labelled table:
# rows are the true class, columns the predicted class.
pd.DataFrame(
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp),
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred")
)

In [None]:
# Per-class precision / recall / F1 on the test split; print() call form works
# on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

In [None]:
# Overall test-set accuracy for agreeableness.
accuracy_score(y_true=yt, y_pred=yp)

In [None]:
# Visual separator before the training-set metrics. The string literal was
# missing its closing quote (a SyntaxError); it is now terminated and printed
# with the call form, which works on both Python 2 and Python 3.
print('--------- TRAIN -----------------')

In [None]:
# Same evaluation on the training split. ytp: predictions; ytt: true labels.
# NOTE(review): column key spelled 'agreeabeness_m' — confirm against the CSV header.
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['agreeabeness_m']

In [None]:
# Per-class precision / recall / F1 on the training split; print() call form
# works on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=ytt, y_pred=ytp))

In [None]:
# Training-set accuracy; print() call form works on both Python 2 and Python 3.
print(accuracy_score(ytt, ytp))

## Conscientiousness evaluation

In [None]:
%%time
# Cross-validate and refit the random forest on the conscientiousness labels
# and keep the refitted estimator (replaces the previous trait's `model`).
model_trainer.fit(X_train_word_average, train_w2v_data['conscientiousness_m'])
model = model_trainer.best_estimator_

In [None]:
# yp: predictions on the held-out test features; yt: the matching true labels.
yp = model.predict(X_test_word_average)
yt = test_w2v_data['conscientiousness_m']

In [None]:
# Test-set confusion matrix as a labelled table:
# rows are the true class, columns the predicted class.
pd.DataFrame(
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp),
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred")
)

In [None]:
# Per-class precision / recall / F1 on the test split; print() call form works
# on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

In [None]:
# Overall test-set accuracy for conscientiousness.
accuracy_score(y_true=yt, y_pred=yp)

In [None]:
# Visual separator before the training-set metrics; print() call form works on
# both Python 2 and Python 3.
print('--------- TRAIN -----------------')

In [None]:
# Same evaluation on the training split. ytp: predictions; ytt: true labels.
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['conscientiousness_m']

In [None]:
# Per-class precision / recall / F1 on the training split; print() call form
# works on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=ytt, y_pred=ytp))

In [None]:
# Training-set accuracy; print() call form works on both Python 2 and Python 3.
print(accuracy_score(ytt, ytp))

## Neuroticism evaluation

In [None]:
%%time
# Cross-validate and refit the random forest on the neuroticism labels and
# keep the refitted estimator (replaces the previous trait's `model`).
model_trainer.fit(X_train_word_average, train_w2v_data['neuroticism_m'])
model = model_trainer.best_estimator_

In [None]:
# yp: predictions on the held-out test features; yt: the matching true labels.
yp = model.predict(X_test_word_average)
yt = test_w2v_data['neuroticism_m']

In [None]:
# Test-set confusion matrix as a labelled table:
# rows are the true class, columns the predicted class.
pd.DataFrame(
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp),
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred")
)

In [None]:
# Per-class precision / recall / F1 on the test split; print() call form works
# on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

In [None]:
# Overall test-set accuracy for neuroticism.
accuracy_score(y_true=yt, y_pred=yp)

In [None]:
# Visual separator before the training-set metrics; print() call form works on
# both Python 2 and Python 3.
print('--------- TRAIN -----------------')

In [None]:
# Same evaluation on the training split. ytp: predictions; ytt: true labels.
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['neuroticism_m']

In [None]:
# Per-class precision / recall / F1 on the training split; print() call form
# works on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=ytt, y_pred=ytp))

In [None]:
# Training-set accuracy; print() call form works on both Python 2 and Python 3.
print(accuracy_score(ytt, ytp))

## Openness evaluation

In [None]:
# Fit a classifier for openness before predicting. Without this step `model`
# is still the estimator fitted on the previous trait's labels, so its
# predictions would be scored against the wrong target.
model_trainer.fit(X_train_word_average, train_w2v_data['openness_m'])
model = model_trainer.best_estimator_

# yp: predictions on the held-out test features; yt: the matching true labels.
yp = model.predict(X_test_word_average)
yt = test_w2v_data['openness_m']

In [None]:
# Test-set confusion matrix as a labelled table:
# rows are the true class, columns the predicted class.
pd.DataFrame(
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp),
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred")
)

In [None]:
# Per-class precision / recall / F1 on the test split; print() call form works
# on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=yt, y_pred=yp))

In [None]:
# Overall test-set accuracy for openness.
accuracy_score(y_true=yt, y_pred=yp)

In [None]:
# Visual separator before the training-set metrics; print() call form works on
# both Python 2 and Python 3.
print('--------- TRAIN -----------------')

In [None]:
# Same evaluation on the training split. ytp: predictions; ytt: true labels.
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['openness_m']

In [None]:
# Per-class precision / recall / F1 on the training split; print() call form
# works on both Python 2 and Python 3.
print(skmetrics.classification_report(y_true=ytt, y_pred=ytp))

In [None]:
# Training-set accuracy; print() call form works on both Python 2 and Python 3.
print(accuracy_score(ytt, ytp))