In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

2017-08-27 16:11:54,618 : INFO : 'pattern' package found; tag filters are available for English


## Model 1 evaluation

In [2]:
# Load the main dataset; later cells read its 'formatted_text' column and the
# '*_ober_2' label columns. The bare `df.shape` displays (rows, columns).
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
df.shape

(1039, 186)

In [3]:
# Load the psycholinguistic properties table (BP.csv).
# NOTE(review): this is an absolute path inside one user's home directory, so the
# notebook will not run on another machine as-is; the dataset cell above uses a
# `~`-relative path instead -- consider a shared, configurable data directory.
psico = pd.read_csv('/home/bahbbc/Downloads/psycholinguistic_properties/BP.csv', encoding='utf-8')

In [4]:
# Preview the first rows to check the column names used to build the lookups below.
psico.head()

Unnamed: 0,Word,Simplified grammatical category,Concretenes,Subjective Frequency,Imagery,AoA,Log frequency,Frequency
0,abafado,a,3.47,3.92,3.96,5.28,7.11,1220.0
1,abafador,a,5.73,2.84,5.1,7.25,4.41,82.0
2,abalado,a,2.61,4.12,3.59,7.62,8.15,3450.0
3,abalizado,a,3.88,2.58,3.56,7.59,4.36,78.0
4,abandonado,a,3.68,4.16,3.97,5.24,9.75,17183.0


In [5]:
# Word -> score lookup tables, one per psycholinguistic measure in `psico`.
# 'Concretenes' is spelled exactly as in the table header shown by psico.head().
concreteness = dict(zip(psico['Word'], psico['Concretenes']))
subjective_freq = dict(zip(psico['Word'], psico['Subjective Frequency']))
imagery = dict(zip(psico['Word'], psico['Imagery']))
aoa = dict(zip(psico['Word'], psico['AoA']))
log_freq = dict(zip(psico['Word'], psico['Log frequency']))
freq = dict(zip(psico['Word'], psico['Frequency']))

In [6]:
def makeFeatureVec(words, psico_dict):
    """Average the `psico_dict` scores of the tokens in `words`.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single text.
    psico_dict : dict
        Maps a word to a numeric score.

    Returns
    -------
    number
        Mean of ``psico_dict[word]`` over every token that is a key of
        `psico_dict` (a repeated token is counted each time it occurs).
        When no token is found the result is 0.0 and the message
        'sorry, empty...' is printed.
    """
    # Keep only the scores of tokens that exist in the lookup table.
    matched = [psico_dict[word] for word in words if word in psico_dict]
    if not matched:
        # Parenthesised form works under both Python 2 and Python 3 and matches
        # the print(...) calls used in the evaluation cells of this notebook.
        print('sorry, empty...')
        return 0.
    # Start the sum from a float so the division is never integer division.
    return sum(matched, 0.) / len(matched)


def getAvgFeatureVecs(reviews, psico_dict):
    """Compute one averaged score per text.

    Parameters
    ----------
    reviews : sequence of token lists
        One list of tokens per text; must support ``len()``.
    psico_dict : dict
        Word -> score table handed to ``makeFeatureVec``.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with ``len(reviews)`` entries, where entry ``i`` is
        ``makeFeatureVec(reviews[i], psico_dict)``.

    A progress line is printed for every 100th text (starting at 0).
    """
    total = len(reviews)
    # Preallocate the 1-D result array.
    reviewFeatureVecs = np.zeros(total, dtype="float32")
    for position, review in enumerate(reviews):
        if position % 100 == 0:
            # print(...) with a single argument behaves the same on Python 2 and 3.
            print("Review %d of %d" % (position, total))
        reviewFeatureVecs[position] = makeFeatureVec(review, psico_dict)
    return reviewFeatureVecs

In [7]:
def w2v_tokenize_text(text):
    """Tokenize a Portuguese text into lower-cased words.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and any NaN are treated as "no text".

    Returns
    -------
    list of str
        Lower-cased tokens produced by NLTK's Portuguese sentence and word
        tokenizers, dropping tokens shorter than two characters. An empty
        list for a missing value.
    """
    # pd.isnull recognises None and every float NaN; the previous identity test
    # (`text is np.nan`) only matched the np.nan singleton object and let other
    # missing values fall through to the tokenizer.
    if pd.isnull(text):
        return []
    tokens = []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            word = word.lower()
            # Skip single-character tokens (mostly punctuation).
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [8]:
# Tokenize every text. Applying the tokenizer to the single column avoids
# building a full row object for each record (row-wise apply with axis=1),
# and yields the same object array of token lists.
data_tokenized = df['formatted_text'].apply(w2v_tokenize_text).values

In [9]:
# Replace missing texts with the empty string.
# Note: `data_tokenized` was already computed in the previous cell, so this fill
# only affects later reads of df.formatted_text, not the tokens built above.
index = pd.isnull(df.formatted_text)  # boolean mask of rows without text
df.loc[index, 'formatted_text'] = ''

In [10]:
# Rebuilds the same six word -> score lookup tables that were created right
# after `psico` was loaded; the values are unchanged by this cell.
concreteness = dict(zip(psico['Word'], psico['Concretenes']))
subjective_freq = dict(zip(psico['Word'], psico['Subjective Frequency']))
imagery = dict(zip(psico['Word'], psico['Imagery']))
aoa = dict(zip(psico['Word'], psico['AoA']))
log_freq = dict(zip(psico['Word'], psico['Log frequency']))
freq = dict(zip(psico['Word'], psico['Frequency']))

In [11]:
# One averaged score per text for each of four measures (concreteness,
# subjective frequency, imagery, age of acquisition). The `log_freq` and `freq`
# lookups are not used here. Each call prints its own progress and
# 'sorry, empty...' lines, which is why the output below repeats.
conc_trainDataVecs = getAvgFeatureVecs( data_tokenized, concreteness )
sub_trainDataVecs = getAvgFeatureVecs( data_tokenized, subjective_freq )
ima_trainDataVecs = getAvgFeatureVecs( data_tokenized, imagery )
aoa_trainDataVecs = getAvgFeatureVecs( data_tokenized, aoa )

Review 0 of 1039
Review 100 of 1039
sorry, empty...
sorry, empty...
Review 200 of 1039
sorry, empty...
Review 300 of 1039
sorry, empty...
Review 400 of 1039
sorry, empty...
sorry, empty...
sorry, empty...
Review 500 of 1039
sorry, empty...
sorry, empty...
Review 600 of 1039
sorry, empty...
sorry, empty...
sorry, empty...
Review 700 of 1039
Review 800 of 1039
sorry, empty...
sorry, empty...
Review 900 of 1039
sorry, empty...
Review 1000 of 1039
Review 0 of 1039
Review 100 of 1039
sorry, empty...
sorry, empty...
Review 200 of 1039
sorry, empty...
Review 300 of 1039
sorry, empty...
Review 400 of 1039
sorry, empty...
sorry, empty...
sorry, empty...
Review 500 of 1039
sorry, empty...
sorry, empty...
Review 600 of 1039
sorry, empty...
sorry, empty...
sorry, empty...
Review 700 of 1039
Review 800 of 1039
sorry, empty...
sorry, empty...
Review 900 of 1039
sorry, empty...
Review 1000 of 1039
Review 0 of 1039
Review 100 of 1039
sorry, empty...
sorry, empty...
Review 200 of 1039
sorry, empty...
R

In [12]:
# Stack the four 1-D feature arrays as rows -> shape (4, number of texts).
data = np.vstack((conc_trainDataVecs,sub_trainDataVecs, ima_trainDataVecs, aoa_trainDataVecs))

In [13]:
# Transpose so that each row is one text and each column one feature.
# NOTE(review): this cell is not idempotent -- running it a second time without
# re-running the vstack cell flips `data` back to (4, number of texts).
data = data.T

## Random Forest

In [15]:
# Classifier configuration passed to every cross-validation call below:
# 1000 trees of depth 2, Gini criterion, balanced class weights, fixed seed 42.
clf = RandomForestClassifier(max_depth=2, random_state=42, criterion= "gini", class_weight="balanced", n_estimators=1000)

## Extraversion

In [16]:
# 10-fold cross-validated macro-F1 for the extraversion label.
scores = cross_val_score(clf, data, df['extraversion_ober_2'], cv=10, scoring='f1_macro')

In [17]:
# Report the mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.27 (+/- 0.09)


## Agreeableness

In [18]:
# 10-fold cross-validated macro-F1 for the agreeableness label.
scores = cross_val_score(clf, data, df['agreeableness_ober_2'], cv=10, scoring='f1_macro')

In [19]:
# Report the mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.28 (+/- 0.11)


## Conscientiousness

In [20]:
# 10-fold cross-validated macro-F1 for the conscientiousness label.
scores = cross_val_score(clf, data, df['conscientiousness_ober_2'], cv=10, scoring='f1_macro')

In [21]:
# Report the mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.28 (+/- 0.10)


## Neuroticism

In [22]:
# 10-fold cross-validated macro-F1 for the neuroticism label.
scores = cross_val_score(clf, data, df['neuroticism_ober_2'], cv=10, scoring='f1_macro')

In [23]:
# Report the mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.29 (+/- 0.08)


## Openness

In [24]:
# 10-fold cross-validated macro-F1 for the openness label.
scores = cross_val_score(clf, data, df['openness_ober_2'], cv=10, scoring='f1_macro')

In [25]:
# Report the mean fold score and twice the standard deviation across folds.
print("F1-score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1-score: 0.24 (+/- 0.08)
