In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from smart_open import smart_open
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from nltk.corpus import stopwords
import os, codecs
%matplotlib inline
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn import metrics as skmetrics

2017-05-28 17:09:50,841 : INFO : 'pattern' package found; tag filters are available for English


## Model 1 evaluation

In [2]:
model = gensim.models.Word2Vec.load('/home/bahbbc/workspace/masters-big5/models/tweet50-skip.model')
model.init_sims(replace=True)

2017-05-28 17:09:51,781 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model
2017-05-28 17:09:58,697 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.wv.* with mmap=None
2017-05-28 17:09:58,698 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.wv.syn0.npy with mmap=None
2017-05-28 17:10:19,164 : INFO : setting ignored attribute syn0norm to None
2017-05-28 17:10:19,487 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model.syn1neg.npy with mmap=None
2017-05-28 17:11:10,027 : INFO : setting ignored attribute cum_table to None
2017-05-28 17:11:10,752 : INFO : loaded /home/bahbbc/workspace/masters-big5/models/tweet50-skip.model
2017-05-28 17:12:42,197 : INFO : precomputing L2-norms of word weight vectors


In [3]:
print(model.most_similar(positive=['rss', 'haha']))

[(u'rs', 0.8271244764328003), (u'hahaha', 0.8050459027290344), (u'hahah', 0.7909172177314758), (u'hehe', 0.7821873426437378), (u'kk', 0.7749631404876709), (u'kkk', 0.7613407373428345), (u'rsrs', 0.757556140422821), (u'hahahaha', 0.755739688873291), (u'hehehe', 0.7339574694633484), (u'kkkk', 0.7331560850143433)]


In [5]:
print(model.most_similar(positive=[':D', ':/']))

[(u':(', 0.7982357740402222), (u';)', 0.7410538196563721), (u':)', 0.7396756410598755), (u'*-*', 0.7111354470252991), (u':c', 0.7103230953216553), (u':3', 0.7049639225006104), (u':))', 0.6996188163757324), (u':P', 0.6954601407051086), (u'u.u', 0.6853086948394775), (u':s', 0.6806108951568604)]


In [6]:
print(model.most_similar(positive=['fofo', 'cute']))

[(u'fofinho', 0.6460293531417847), (u'lindinho', 0.6353486180305481), (u'fofo,', 0.6153388023376465), (u'fofa', 0.6090468764305115), (u'mordivel', 0.6029438972473145), (u'fofoo', 0.5958353281021118), (u'fofoooo', 0.5935646295547485), (u'cute!', 0.5919068455696106), (u'fofo....', 0.5892006158828735), (u'fofooo', 0.5890873074531555)]


In [3]:
num_features = model.wv.syn0.shape[1]

In [4]:
df = pd.read_csv('~/personality-normalized-word2vec-norm.csv', encoding='utf-8')
df.shape

(1039, 186)

In [7]:
df.extraversion_m.value_counts()

0    529
1    510
Name: extraversion_m, dtype: int64

In [8]:
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

In [9]:
def predict(vectorizer, classifier, data):
    data_features = vectorizer.transform(data['formatted_text'])
    predictions = classifier.predict(data_features)
    target = data['extraversion_m']
    evaluate_prediction(predictions, target)

In [11]:
def tfidfWeights(words):
    vectorizer = TfidfVectorizer(encoding='utf-8')
    tfidf = vectorizer.fit_transform(words)
    
    word_dict = {}
    feature_names = vectorizer.get_feature_names()
    for col in tfidf.nonzero()[1]:
        word_dict[col] = tfidf[0, col]
    return tfidf, word_dict

In [9]:
def makeFeatureVec(words, model, num_features, tfidf):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if (word in index2word_set) & (word in tfidf): 
            nwords = nwords + 1.
            word_weighted = np.multiply(model[word], tfidf[word])
            featureVec = np.add(featureVec, word_weighted)
    # 
    # Divide the result by the number of words to get the average
    if nwords > 0:
        featureVec = np.divide(featureVec,nwords)
    else:
        print 'sorry, empty...'
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features, tfidf):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
        #Print a status message every 1000th review
        if counter%100. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        #Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, 600, tfidf)
        #Increment the counter
        counter = counter + 1
    return reviewFeatureVecs

In [23]:
def w2v_tokenize_text(text):
    tokens = []
    if text is np.nan:
        return []
    for sent in nltk.sent_tokenize(text, language='portuguese'):
        for word in nltk.word_tokenize(sent, language='portuguese'):
            word = word.lower()
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [3]:
from sklearn.cluster import KMeans

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.syn0


In [5]:
num_clusters = 50

In [8]:
del model

NameError: name 'model' is not defined

In [7]:
import time

start = time.time() # Start time

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

MemoryError: 

In [24]:
test_tokenized = test_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values
train_tokenized = train_w2v_data.apply(lambda r: w2v_tokenize_text(r['formatted_text']), axis=1).values

In [25]:
index = pd.isnull(train_w2v_data.formatted_text)
train_w2v_data.loc[index, 'formatted_text'] = ''

In [26]:
def tdfidfWeight(text):
    word_dict = {}
    vectorizer = TfidfVectorizer(encoding='utf-8')
    tfidf = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names()
    for col in tfidf.nonzero()[1]:
        word_dict[feature_names[col]] = tfidf[0, col]
    return word_dict

In [28]:
word_dict = tdfidfWeight(train_w2v_data.formatted_text)

In [29]:
trainDataVecs = getAvgFeatureVecs( train_tokenized, model, num_features, word_dict )
testDataVecs = getAvgFeatureVecs( test_tokenized, model, num_features, word_dict )

Review 0 of 727
Review 100 of 727
sorry, empty...
Review 200 of 727
sorry, empty...
sorry, empty...
Review 300 of 727
Review 400 of 727
sorry, empty...
sorry, empty...
Review 500 of 727
Review 600 of 727
sorry, empty...
Review 700 of 727
Review 0 of 312
sorry, empty...
Review 100 of 312
sorry, empty...
sorry, empty...
Review 200 of 312
sorry, empty...
sorry, empty...
Review 300 of 312


In [12]:
del model

In [37]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

force = False
model_trainer = RandomizedSearchCV(
    n_iter=1, 
    estimator=RandomForestClassifier(),
    param_distributions={
        "criterion": ["gini"],
        "n_estimators": [1000],
        "max_features": ["log2"],
        "max_depth": [None],
        "bootstrap": [True],
        "oob_score": [True],
        "class_weight": ["balanced"],
        "random_state": [42]
    },
    scoring="f1",
    verbose=True,
    refit=True,
    cv=10,
    n_jobs=-1
)

In [38]:
%%time
model_trainer.fit(trainDataVecs, train_w2v_data['extraversion_m'])
model = model_trainer.best_estimator_

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.7s finished


CPU times: user 5.4 s, sys: 104 ms, total: 5.51 s
Wall time: 32.2 s


In [39]:
yp = model.predict(testDataVecs)
yt = test_w2v_data['extraversion_m']

In [40]:
pd.DataFrame(
    index=pd.Index([0, 1], name="y_true"),
    columns=pd.Index([0, 1], name="y_pred"),
    data=skmetrics.confusion_matrix(y_true=yt, y_pred=yp)
)

y_pred,0,1
y_true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,91,51
1,82,88


In [41]:
print skmetrics.classification_report(y_true=yt, y_pred=yp)

             precision    recall  f1-score   support

          0       0.53      0.64      0.58       142
          1       0.63      0.52      0.57       170

avg / total       0.58      0.57      0.57       312



In [42]:
accuracy_score(yt, yp)

0.57371794871794868

In [None]:
print '--------- TRAIN -----------------'

In [None]:
ytp = model.predict(X_train_word_average)
ytt = train_w2v_data['extraversion_m']

In [None]:
print skmetrics.classification_report(y_true=ytt, y_pred=ytp)

In [None]:
print accuracy_score(ytt, ytp)