In [24]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [25]:
model = gensim.models.Word2Vec.load('/home/bahbbc/workspace/masters-big5/models/tweet50-skip.model')
model.init_sims(replace=True)

2017-08-26 19:52:06,214 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model
2017-08-26 19:52:12,567 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.wv.* with mmap=None
2017-08-26 19:52:12,567 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.wv.syn0.npy with mmap=None
2017-08-26 19:52:20,435 : INFO : setting ignored attribute syn0norm to None
2017-08-26 19:52:20,437 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.syn1neg.npy with mmap=None
2017-08-26 19:52:55,425 : INFO : setting ignored attribute cum_table to None
2017-08-26 19:52:55,559 : INFO : loaded /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model
2017-08-26 19:54:29,518 : INFO : precomputing L2-norms of word weight vectors


In [26]:
num_features = model.wv.syn0.shape[1]

In [27]:
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
df.shape

(1039, 186)

In [28]:
index = pd.isnull(df.formatted_text)
df.loc[index, 'formatted_text'] = ''

In [29]:
def tdfidfWeight(text):
    word_dict = {}
    vectorizer = TfidfVectorizer(encoding='utf-8')
    tfidf = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names()
    for col in tfidf.nonzero()[1]:
        word_dict[feature_names[col]] = tfidf[0, col]
    return word_dict

In [30]:
def makeFeatureVec(words, model, num_features, tfidf):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if (word in index2word_set) & (word in tfidf): 
            nwords = nwords + 1.
            word_weighted = np.multiply(model[word], tfidf[word])
            featureVec = np.add(featureVec, word_weighted)
    # 
    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec,nwords)
    else:
        print 'sorry, empty...'
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features, tfidf):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        #Print a status message every 1000th review
        if counter%100. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        #Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, 600, tfidf)
        #Increment the counter
        counter = counter + 1
    return reviewFeatureVecs

In [31]:
def w2v_tokenize_text(text):
    tokens = []
    if text is np.nan:
        return []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            word = word.lower()
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [32]:
word_dict = tdfidfWeight(df.formatted_text)

In [33]:
data_tokenized = df.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values

In [34]:
data = getAvgFeatureVecs( data_tokenized, model, num_features, word_dict )

Review 0 of 1039
Review 100 of 1039
sorry, empty...
Review 200 of 1039
sorry, empty...
Review 300 of 1039
Review 400 of 1039
sorry, empty...
sorry, empty...
Review 500 of 1039
sorry, empty...
sorry, empty...
Review 600 of 1039
sorry, empty...
sorry, empty...
sorry, empty...
Review 700 of 1039
Review 800 of 1039
sorry, empty...
Review 900 of 1039
sorry, empty...
Review 1000 of 1039


In [35]:
del model

## Random Forest

In [36]:
clf = RandomForestClassifier(max_depth=2, random_state=42, criterion= "gini", class_weight="balanced", n_estimators=1000)

## Extraversion evaluation

In [37]:
scores = cross_val_score(clf, data, df['extraversion_m'], cv=10, scoring='f1_macro')

In [38]:
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.59 (+/- 0.08)


## Agreableness evaluation

In [39]:
scores = cross_val_score(clf, data, df['agreeabeness_m'], cv=10, scoring='f1_macro')

In [40]:
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.52 (+/- 0.14)


## Conscientiousness evaluation

In [41]:
scores = cross_val_score(clf, data, df['conscientiousness_m'], cv=10, scoring='f1_macro')

In [42]:
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.54 (+/- 0.05)


## Neuroticism evation

In [43]:
scores = cross_val_score(clf, data, df['neuroticism_m'], cv=10, scoring='f1_macro')

In [44]:
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.46 (+/- 0.08)


## Openess evaluation

In [45]:
scores = cross_val_score(clf, data, df['openness_m'], cv=10, scoring='f1_macro')

In [46]:
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.52 (+/- 0.08)
