In [1]:
import gensim
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and later removed;
# model_selection (already used further down in this cell) exports the same name.
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Location of the pre-trained Word2Vec model (presumably Brazilian-Portuguese
# Wikipedia, judging by the file name). Set WORD2VEC_MODEL_PATH to run the
# notebook on another machine; the default keeps the original location.
MODEL_PATH = os.environ.get(
    "WORD2VEC_MODEL_PATH",
    "/home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model",
)
model = gensim.models.Word2Vec.load(MODEL_PATH)
# Precompute the L2-normalised vectors (syn0norm) that word_averaging reads.
# replace=True keeps only the normalised vectors, so the model cannot be
# trained further after this call.
model.init_sims(replace=True)

2017-08-27 11:36:53,573 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model
2017-08-27 11:36:55,709 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model.wv.* with mmap=None
2017-08-27 11:36:55,710 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model.wv.syn0.npy with mmap=None
2017-08-27 11:36:57,575 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model.syn1neg.npy with mmap=None
2017-08-27 11:36:59,405 : INFO : setting ignored attribute syn0norm to None
2017-08-27 11:36:59,406 : INFO : setting ignored attribute cum_table to None
2017-08-27 11:36:59,407 : INFO : loaded /home/bahbbc/workspace/masters-big5/wiki.pt-br.word2vec.model
2017-08-27 11:37:01,335 : INFO : precomputing L2-norms of word weight vectors


In [3]:
# Dimensionality of the document feature vectors.
# NOTE(review): presumably this must equal the vector size of the Word2Vec
# model loaded above -- confirm.
num_features = 400

### Verify model with personality

In [4]:
# Read the personality dataset; the trailing expression displays (rows, columns).
df = pd.read_csv(
    '~/personality-normalized-word2vec-norm.csv',
    encoding='utf-8',
)
df.shape

(1039, 186)

In [5]:
# Class balance of the `extraversion_m` label.
# NOTE(review): the experiments below use the `*_ober_2` columns as targets,
# not `extraversion_m` -- confirm this is the distribution worth inspecting.
df['extraversion_m'].value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

### Word2Vec document features (averaged pre-trained word vectors)

In [6]:
def word_averaging(wv, words):
    """Return the unit-length mean of the word vectors of one document.

    Parameters
    ----------
    wv : gensim Word2Vec model
        Must expose ``wv.wv.vocab`` and ``wv.wv.syn0norm``. The normalised
        matrix ``syn0norm`` has to be populated already (the notebook does
        this with ``init_sims``).
    words : iterable
        Tokens of the document. Entries that are already ``np.ndarray``
        vectors are used as-is; tokens missing from the vocabulary are
        skipped.

    Returns
    -------
    np.ndarray
        1-D ``float32`` vector. When no entry of ``words`` contributes a
        vector, a warning is logged and an all-zero vector as wide as the
        model's embeddings is returned.
    """
    keyed_vectors = wv.wv
    vocab = keyed_vectors.vocab

    vectors = []
    for word in words:
        # The ndarray check must come first: arrays are unhashable and would
        # raise on the vocabulary membership test.
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in vocab:
            vectors.append(keyed_vectors.syn0norm[vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Take the width from the model itself instead of a notebook-level
        # constant, and use float32 like the regular path so stacking the
        # rows does not silently upcast the whole matrix.
        return np.zeros(keyed_vectors.syn0norm.shape[1], dtype=np.float32)

    centroid = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(centroid).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every document into a 2-D array.

    Each element of ``text_list`` is passed to ``word_averaging`` together
    with ``wv``; the resulting 1-D vectors become the rows of the returned
    array, in input order.
    """
    document_vectors = []
    for review in text_list:
        document_vectors.append(word_averaging(wv, review))
    return np.vstack(document_vectors)

In [7]:
def w2v_tokenize_text(text):
    """Split a Portuguese text into word tokens of at least two characters.

    Parameters
    ----------
    text : str or missing value
        Raw document. ``None`` and any NaN value are treated as an empty
        document.

    Returns
    -------
    list
        Tokens produced by nltk's Portuguese sentence and word tokenizers,
        in order, with single-character tokens dropped.
    """
    # pd.isnull recognises None and every NaN object; the previous identity
    # test against np.nan only matched that one singleton, so other missing
    # markers were handed to nltk and crashed.
    if pd.isnull(text):
        return []
    return [
        word
        for sent in nltk.sent_tokenize(text, language='portuguese')
        for word in nltk.word_tokenize(sent, language='portuguese')
        if len(word) >= 2
    ]

In [8]:
# Tokenise every document. Mapping over the single 'formatted_text' column
# avoids building a full row Series per record (what DataFrame.apply with
# axis=1 does) and always yields a 1-D object array of token lists.
data = df['formatted_text'].apply(w2v_tokenize_text).values

In [9]:
%%time
# Turn each tokenised document into one averaged word vector; the result has
# one row per document and is the feature matrix for the classifiers below.
data_word_average = word_averaging_list(model, data)



CPU times: user 5.21 s, sys: 22.7 ms, total: 5.23 s
Wall time: 5.21 s


In [10]:
# Random forest of 1000 depth-2 trees, reused unchanged for every trait below.
clf = RandomForestClassifier(
    n_estimators=1000,
    criterion="gini",
    max_depth=2,
    class_weight="balanced",
    random_state=42,
)

## Extraversion

In [11]:
# 10-fold cross-validated macro-F1 for the extraversion target.
scores = cross_val_score(
    clf,
    data_word_average,
    df['extraversion_ober_2'],
    cv=10,
    scoring='f1_macro',
)

In [12]:
# Mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

F1-score: 0.33 (+/- 0.09)


## Agreeableness

In [13]:
# 10-fold cross-validated macro-F1 for the agreeableness target.
scores = cross_val_score(
    clf,
    data_word_average,
    df['agreeableness_ober_2'],
    cv=10,
    scoring='f1_macro',
)

In [14]:
# Mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

F1-score: 0.28 (+/- 0.05)


## Conscientiousness

In [15]:
# 10-fold cross-validated macro-F1 for the conscientiousness target.
scores = cross_val_score(
    clf,
    data_word_average,
    df['conscientiousness_ober_2'],
    cv=10,
    scoring='f1_macro',
)

In [16]:
# Mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

F1-score: 0.31 (+/- 0.10)


## Neuroticism

In [17]:
# 10-fold cross-validated macro-F1 for the neuroticism target.
scores = cross_val_score(
    clf,
    data_word_average,
    df['neuroticism_ober_2'],
    cv=10,
    scoring='f1_macro',
)

In [18]:
# Mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

F1-score: 0.29 (+/- 0.08)


## Openness

In [19]:
# 10-fold cross-validated macro-F1 for the openness target.
scores = cross_val_score(
    clf,
    data_word_average,
    df['openness_ober_2'],
    cv=10,
    scoring='f1_macro',
)

In [20]:
# Mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

F1-score: 0.25 (+/- 0.09)
