In [3]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics

2017-06-11 22:35:35,950 : INFO : 'pattern' package found; tag filters are available for English


## Model 1 evaluation

In [4]:
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
df.shape

(1039, 186)

In [5]:
psico = pd.read_csv('/home/bahbbc/Downloads/psycholinguistic_properties/BP.csv', encoding='utf-8')

In [6]:
psico.head()

Unnamed: 0,Word,Simplified grammatical category,Concretenes,Subjective Frequency,Imagery,AoA,Log frequency,Frequency
0,abafado,a,3.47,3.92,3.96,5.28,7.11,1220.0
1,abafador,a,5.73,2.84,5.1,7.25,4.41,82.0
2,abalado,a,2.61,4.12,3.59,7.62,8.15,3450.0
3,abalizado,a,3.88,2.58,3.56,7.59,4.36,78.0
4,abandonado,a,3.68,4.16,3.97,5.24,9.75,17183.0


In [7]:
concreteness = dict(zip(psico.Word, psico.Concretenes))
subjective_freq = dict(zip(psico.Word, psico['Subjective Frequency']))
imagery = dict(zip(psico.Word, psico.Imagery))
aoa =  dict(zip(psico.Word, psico.AoA))
log_freq = dict(zip(psico.Word, psico['Log frequency']))
freq = dict(zip(psico.Word, psico.Frequency))

In [8]:
df.agreeabeness_m.value_counts()

1    542
0    497
Name: agreeabeness_m, dtype: int64

In [9]:
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

In [10]:
def predict(vectorizer, classifier, data):
    data_features = vectorizer.transform(data['formatted_text'])
    predictions = classifier.predict(data_features)
    target = data['agreeabeness_m']
    evaluate_prediction(predictions, target)

In [11]:
def makeFeatureVec(words, psico_dict):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = 0.
    #
    nwords = 0.
    
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if (word in psico_dict):
            nwords = nwords + 1.
            featureVec = featureVec + psico_dict[word]
    # 
    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = featureVec / nwords
    else:
        print 'sorry, empty...'
    return featureVec


def getAvgFeatureVecs(reviews, psico_dict):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews)),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        #Print a status message every 1000th review
        if counter%100. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        #Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, psico_dict)
        #Increment the counter
        counter = counter + 1
    return reviewFeatureVecs

In [12]:
def w2v_tokenize_text(text):
    tokens = []
    if text is np.nan:
        return []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            word = word.lower()
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [13]:
test_tokenized = test_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values
train_tokenized = train_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values

In [14]:
index = pd.isnull(train_w2v_data.formatted_text)
train_w2v_data.loc[index, 'formatted_text'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
concreteness = dict(zip(psico.Word, psico.Concretenes))
subjective_freq = dict(zip(psico.Word, psico['Subjective Frequency']))
imagery = dict(zip(psico.Word, psico.Imagery))
aoa =  dict(zip(psico.Word, psico.AoA))
log_freq = dict(zip(psico.Word, psico['Log frequency']))
freq = dict(zip(psico.Word, psico.Frequency))

In [16]:
conc_trainDataVecs = getAvgFeatureVecs( train_tokenized, concreteness )
conc_testDataVecs = getAvgFeatureVecs( test_tokenized, concreteness )
sub_trainDataVecs = getAvgFeatureVecs( train_tokenized, subjective_freq )
sub_testDataVecs = getAvgFeatureVecs( test_tokenized, subjective_freq )
ima_trainDataVecs = getAvgFeatureVecs( train_tokenized, imagery )
ima_testDataVecs = getAvgFeatureVecs( test_tokenized, imagery )
aoa_trainDataVecs = getAvgFeatureVecs( train_tokenized, aoa )
aoa_testDataVecs = getAvgFeatureVecs( test_tokenized, aoa )

Review 0 of 727
sorry, empty...
Review 100 of 727
sorry, empty...
Review 200 of 727
sorry, empty...
sorry, empty...
Review 300 of 727
sorry, empty...
Review 400 of 727
sorry, empty...
sorry, empty...
Review 500 of 727
Review 600 of 727
sorry, empty...
sorry, empty...
Review 700 of 727
Review 0 of 312
sorry, empty...
Review 100 of 312
sorry, empty...
sorry, empty...
sorry, empty...
Review 200 of 312
sorry, empty...
sorry, empty...
Review 300 of 312
Review 0 of 727
sorry, empty...
Review 100 of 727
sorry, empty...
Review 200 of 727
sorry, empty...
sorry, empty...
Review 300 of 727
sorry, empty...
Review 400 of 727
sorry, empty...
sorry, empty...
Review 500 of 727
Review 600 of 727
sorry, empty...
sorry, empty...
Review 700 of 727
Review 0 of 312
sorry, empty...
Review 100 of 312
sorry, empty...
sorry, empty...
sorry, empty...
Review 200 of 312
sorry, empty...
sorry, empty...
Review 300 of 312
Review 0 of 727
sorry, empty...
Review 100 of 727
sorry, empty...
Review 200 of 727
sorry, empty

In [17]:
train_data = np.vstack((conc_trainDataVecs,sub_trainDataVecs, ima_trainDataVecs, aoa_trainDataVecs))

In [18]:
train_data = train_data.T

In [19]:
test_data = np.vstack((conc_testDataVecs, sub_testDataVecs, ima_testDataVecs, aoa_testDataVecs))

In [20]:
test_data = test_data.T

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

force = False
model_trainer = RandomizedSearchCV(
    n_iter=1, 
    estimator=RandomForestClassifier(),
    param_distributions={
        "criterion": ["gini"],
        "n_estimators": [50],
        "max_features": ["log2"],
        "max_depth": [None],
        "bootstrap": [True],
        "oob_score": [True],
        "class_weight": ["balanced"],
        "random_state": [42]
    },
    scoring="f1",
    verbose=True,
    refit=True,
    cv=10,
    n_jobs=-1
)

In [22]:
%%time
model_trainer.fit(train_data, train_w2v_data['agreeabeness_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.9s finished


CPU times: user 401 ms, sys: 39.2 ms, total: 441 ms
Wall time: 1.41 s


In [23]:
yp = model.predict(test_data)
yt = test_w2v_data['agreeabeness_m']

In [24]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,65
1,83,88


In [25]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.48      0.54      0.51       141
          1       0.58      0.51      0.54       171

avg / total       0.53      0.53      0.53       312



In [26]:
accuracy_score(yt, yp)

0.52564102564102566

In [27]:
print '--------- TRAIN -----------------'

--------- TRAIN -----------------


In [28]:
ytp = model.predict(train_data)
ytt = train_w2v_data['agreeabeness_m']

In [29]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       356
          1       1.00      1.00      1.00       371

avg / total       1.00      1.00      1.00       727



In [30]:
print accuracy_score(ytt, ytp)

0.998624484182
