In [1]:
# Standard library
import logging

# Third-party
import gensim
import nltk
import numpy
import pandas as pd
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import cross_val_score
# train_test_split lives in sklearn.model_selection (the same module used for
# cross_val_score above); the old sklearn.cross_validation module is gone from
# newer scikit-learn releases.
from sklearn.model_selection import train_test_split

# fix random seed for reproducibility
# NOTE(review): this seeds numpy only; the Keras backend keeps its own RNG.
numpy.random.seed(42)

logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


Using TensorFlow backend.


In [2]:
from keras.datasets import imdb

In [3]:
# Load the Keras IMDB dataset, limiting the vocabulary to `top_words` entries.
# NOTE(review): X_train / y_train / X_test / y_test are not referenced by any
# other cell visible here -- confirm this download is still needed.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [4]:
# Read the dataset (UTF-8 encoded CSV) and show its (rows, columns).
csv_path = '~/personality-normalized-word2vec-norm.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df.shape

(1039, 186)

In [5]:
# Rows with a missing `formatted_text` get an empty string instead of NaN.
index = df['formatted_text'].isnull()
df.loc[index, 'formatted_text'] = ''

In [6]:
# Load the trained Word2Vec model from disk.
w2v_model_path = '/home/bahbbc/workspace/masters-big5/models/tweet50-600.model'
w2v = gensim.models.Word2Vec.load(w2v_model_path)
# Precompute the L2-normalised word vectors (see the log line this emits);
# replace=True asks gensim to do so in place.
w2v.init_sims(replace=True)

2017-08-29 23:48:04,679 : INFO : loading Word2Vec object from /home/bahbbc/workspace/masters-big5/models/tweet50-600.model
2017-08-29 23:48:12,214 : INFO : loading wv recursively from /home/bahbbc/workspace/masters-big5/models/tweet50-600.model.wv.* with mmap=None
2017-08-29 23:48:12,215 : INFO : loading syn0 from /home/bahbbc/workspace/masters-big5/models/tweet50-600.model.wv.syn0.npy with mmap=None
2017-08-29 23:48:20,072 : INFO : setting ignored attribute syn0norm to None
2017-08-29 23:48:20,075 : INFO : loading syn1neg from /home/bahbbc/workspace/masters-big5/models/tweet50-600.model.syn1neg.npy with mmap=None
2017-08-29 23:49:22,633 : INFO : setting ignored attribute cum_table to None
2017-08-29 23:49:23,524 : INFO : loaded /home/bahbbc/workspace/masters-big5/models/tweet50-600.model
2017-08-29 23:51:08,612 : INFO : precomputing L2-norms of word weight vectors


In [7]:
# Hold out 30% of the rows as the test split; random_state pins the shuffle.
train_w2v_data, test_w2v_data = train_test_split(df, test_size=0.3, random_state=42)

In [8]:
def w2v_tokenize_text(text):
    """Split `text` into word tokens using NLTK's Portuguese tokenizers.

    Parameters
    ----------
    text : str, None or NaN
        Raw document text. Missing values (``None`` or any float NaN) are
        treated as an empty document.

    Returns
    -------
    list
        Tokens in document order; tokens shorter than two characters are
        dropped.
    """
    # NaN must be detected by value, not identity: `float('nan')` or
    # `numpy.float64('nan')` are not the same object as `numpy.nan`.
    if text is None or (isinstance(text, float) and numpy.isnan(text)):
        return []
    return [
        word
        for sent in nltk.sent_tokenize(text, language='portuguese')
        for word in nltk.word_tokenize(sent, language='portuguese')
        if len(word) >= 2
    ]

In [9]:
def word_averaging(wv, words):
    """Return the unit-length mean of the word vectors of `words`.

    Parameters
    ----------
    wv : object
        Model exposing ``wv.vocab`` (word -> entry with an ``index``) and
        ``wv.syn0norm`` (2-D array of vectors addressed by that index).
    words : iterable
        Tokens to look up. Items that already are ``numpy.ndarray`` vectors
        are used as-is; tokens missing from the vocabulary are skipped.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector with one entry per embedding dimension. It is all
        zeros (and a warning is logged) when no token could be resolved.
    """
    vectors = []
    for word in words:
        if isinstance(word, numpy.ndarray):
            vectors.append(word)
        elif word in wv.wv.vocab:
            vectors.append(wv.wv.syn0norm[wv.wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Take the width from the embedding matrix itself rather than from a
        # notebook global that may not be defined yet or may disagree.
        return numpy.zeros(wv.wv.syn0norm.shape[1], dtype=numpy.float32)

    # Average over the words, then rescale to unit length so every document
    # is represented by exactly one comparable vector.
    mean = numpy.asarray(vectors).mean(axis=0)
    norm = numpy.linalg.norm(mean)
    if norm > 0:
        mean = mean / norm
    return mean.astype(numpy.float32)

def word_averaging_list(wv, text_list):
    """Run `word_averaging` on every entry of `text_list` and stack the results row-wise."""
    per_document = []
    for review in text_list:
        per_document.append(word_averaging(wv, review))
    return numpy.vstack(per_document)

In [10]:
# Word-vector dimensionality -- presumably matches the 600-d model loaded
# above ("tweet50-600"); verify against the model if it is swapped.
num_features= 600

In [None]:
# Tokenize the text column directly: Series.apply hands each text straight to
# the tokenizer (no per-row object built over every column) and always yields
# exactly one token list per row.
test_tokenized = test_w2v_data['formatted_text'].apply(w2v_tokenize_text).values
train_tokenized = train_w2v_data['formatted_text'].apply(w2v_tokenize_text).values

In [None]:
%%time
# Build the stacked Word2Vec feature arrays for the train and test splits
# (timed, since this walks every token of every document).
X_train_word_average = word_averaging_list(w2v,train_tokenized)
X_test_word_average = word_averaging_list(w2v,test_tokenized)



In [None]:
# Sanity-check the dimensions of the stacked training features.
X_train_word_average.shape

In [None]:
# create the model
# An Embedding layer consumes fixed-length sequences of integer word indices,
# so the tokenized documents are mapped to vocabulary indices and padded before
# being handed to `fit` (a DataFrame with text columns cannot be fed directly).
embedding_vector_length = 32
embedding_vecor_length = embedding_vector_length  # misspelled legacy name kept for any later cell
max_sequence_length = 500  # tokens kept per document; tune to the corpus

vocab_size = w2v.wv.syn0.shape[0]


def tokens_to_indices(tokens):
    """Map tokens to 1-based Word2Vec vocabulary indices, skipping unknown words.

    Index 0 is left free because `pad_sequences` uses it as the padding value.
    """
    return [w2v.wv.vocab[token].index + 1 for token in tokens if token in w2v.wv.vocab]


X_train_seq = sequence.pad_sequences(
    [tokens_to_indices(tokens) for tokens in train_tokenized],
    maxlen=max_sequence_length,
)
X_test_seq = sequence.pad_sequences(
    [tokens_to_indices(tokens) for tokens in test_tokenized],
    maxlen=max_sequence_length,
)

# assumes `extraversion_m` holds 0/1 labels, as the sigmoid output with
# binary_crossentropy requires -- TODO confirm
y_train_labels = train_w2v_data['extraversion_m'].values
y_test_labels = test_w2v_data['extraversion_m'].values

model = Sequential()
# input_dim is vocab_size + 1 to make room for the padding index 0;
# input_length is the padded sequence length, not the vector dimensionality.
model.add(Embedding(vocab_size + 1, embedding_vector_length, input_length=max_sequence_length))
#model.add(Embedding(, , , weights=[weights]))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()  # prints the table itself; wrapping it in print() also printed "None"
model.fit(X_train_seq, y_train_labels, validation_data=(X_test_seq, y_test_labels), epochs=3, batch_size=64)