In [1]:
import pickle
import numpy as np

In [2]:
import pickle
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load data_collected.pickle when it comes from a trusted source.
with open('./data_collected.pickle', 'rb') as handle:
    # data2 is indexed by position below; each entry is a dict describing one utterance.
    data2 = pickle.load(handle)

In [3]:
# Inspect the fields stored for the first utterance record.
data2[0].items()

dict_items([('end', 8.2357), ('a', 2.5), ('emo_evo', [['neu'], ['neu'], ['neu'], ['neu']]), ('emotion', 'neu'), ('start', 6.2901), ('id', 'Ses01F_impro01_F000'), ('v', 2.5), ('d', 2.5), ('transcription', 'Excuse me.')])

In [10]:
# Integer class id for each emotion tag; these ids are the classification targets.
emotions = {'ang':0, 'exc':1, 'neu':2, 'sad':3}

# d maps utterance id -> (transcription, class id, position in data2).
d = {}
# Parallel lists, all in the order the utterances appear in data2.
origin_id, origin_target, origin_data = [], [], []

for position, utterance in enumerate(data2):
    utt_id = utterance['id']
    text = utterance['transcription']
    label = emotions[utterance['emotion']]

    d[utt_id] = (text, label, position)
    origin_id.append(utt_id)
    origin_target.append(label)
    origin_data.append(text)
print('\ntotal num of sentences', len(d))



total num of sentences 4936


In [11]:
# Positional split of the utterance ids: the first 3948 form the training part,
# everything after that the test part. The split is persisted for reuse.
split_point = 3948
origin_train, origin_test = origin_id[:split_point], origin_id[split_point:]
split_as_in_original_paper = dict(train=origin_train, test=origin_test)
with open('split_as_in_original_paper.pickle', 'wb') as handle:
    pickle.dump(split_as_in_original_paper, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Read the saved split back from disk and show the first few ids of each part.
with open('./split_as_in_original_paper.pickle', 'rb') as handle:
    data = pickle.load(handle)

for part in ('train', 'test'):
    print(data[part][:4])
len(origin_train)

['Ses01F_impro01_F000', 'Ses01F_impro01_F001', 'Ses01F_impro01_F002', 'Ses01F_impro01_F005']
['Ses05F_impro04_M009', 'Ses05F_impro04_M010', 'Ses05F_impro04_M011', 'Ses05F_impro04_M012']


3948

In [13]:
# Apply the same positional split (first 3948 / remainder) to the sentences
# and to their integer labels.
train_X, test_X = origin_data[:3948], origin_data[3948:]
train_y, test_y = origin_target[:3948], origin_target[3948:]

print('number of sentences in train:', len(train_X))
print('number of sentences in test:', len(test_X))

number of sentences in train: 3948
number of sentences in test: 988


In [14]:
# Occurrences of each of the four class ids (0..3) among the training labels,
# printed as fractions of the total.
l = [train_y.count(class_id) for class_id in range(4)]
print('class distribution in training data', np.array(l)/sum(l))

class distribution in training data [0.23860182 0.19832827 0.34017224 0.22289767]


In [15]:
# Peek at the first ten integer-encoded training labels.
train_y[:10]

[2, 2, 2, 2, 0, 2, 0, 0, 3, 3]

In [16]:
# Occurrences of each of the four class ids (0..3) among the test labels,
# printed as fractions of the total.
l = [test_y.count(class_id) for class_id in range(4)]
print('class distribution in test data', np.array(l)/sum(l))

class distribution in test data [0.16295547 0.2611336  0.3694332  0.20647773]


In [17]:
# Peek at the first four raw training sentences.
train_X[:4]

['Excuse me.',
 'Yeah.',
 'Is there a problem?',
 "Well what's the problem?  Let me change it."]

In [18]:
# Integer labels belonging to the four sentences shown above.
train_y[:4]

[2, 2, 2, 2]

## Doc2Vec for extracting text feature
The original paper considers concatenating the pre-trained GloVe word vectors of a sentence to produce a sentence vector. This is a straightforward approach that provides a quick and crude document vector, which can often be useful. However, in 2014 Le and Mikolov (https://arxiv.org/abs/1405.4053) introduced the Paragraph Vector, which may outperform such simple concatenation. We use the state-of-the-art gensim Doc2Vec class to generate the doc2vec paragraph vectors. Specifically, there are two kinds of paragraph vector in that paper:
Paragraph Vector, aka gensim Doc2Vec

**Paragraph Vector - Distributed Memory (PV-DM)**<br>
This is the Paragraph Vector model analogous to Word2Vec CBOW. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a center word based on an average of both the context word-vectors and the full document's doc-vector.

**Paragraph Vector - Distributed Bag of Words (PV-DBOW)**<br>
This is the Paragraph Vector model analogous to Word2Vec SG. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a target word just from the full document's doc-vector. (It is also common to combine this with skip-gram testing, using both the doc-vector and nearby word-vectors to predict a single target word, but only one at a time.)

<img src='https://cdn-images-1.medium.com/max/1000/1*9tVCGDm-ytPydhtJWVx3Zw.png'>

In [19]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import gensim
import multiprocessing

In [20]:
# Doc2Vec is fitted on the train and test sentences together. Every document is
# tagged with its position in the combined list, so its vector can later be
# looked up by that index.
all_text = train_X + test_X
docs = [
    # simple_preprocess: tokenize text into individual words, remove punctuation, set to lowercase, etc
    gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(sentence), [tag])
    for tag, sentence in enumerate(all_text)
]
docs[0]

TaggedDocument(words=['excuse', 'me'], tags=[0])

In [21]:
# Refuse to continue without gensim's fast training path
# (presumably FAST_VERSION == -1 marks the slow pure-Python fallback).
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
# Number of CPU cores; used as the worker count of the Doc2Vec models.
cores = multiprocessing.cpu_count()
print('num of cores', cores)

num of cores 32


In [56]:
# Three candidate Doc2Vec configurations. All use negative sampling (negative=5,
# hs=0), ignore words seen fewer than 2 times, run 150 epochs and use every core.
doc2vec_models = [
    # Index 0: PV-DBOW plain (dm=0), 250-dimensional vectors
    Doc2Vec(dm=0, vector_size=250, negative=5, hs=0, min_count=2, sample=0, 
            epochs=150, workers=cores),
    # Index 1: PV-DM w/ default averaging, 250-dimensional, window=10, alpha=0.05
    Doc2Vec(dm=1, vector_size=250, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=150, workers=cores, alpha=0.05, comment='alpha=0.05'),
#     Index 2: PV-DM w/ concatenation (dm_concat=1), 128-dimensional - big, slow
#     window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=128, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=150, workers=cores),
]

# Build each model's vocabulary from the tagged documents; training happens in a later cell.
for model in doc2vec_models:
    model.build_vocab(docs)
    print("%s vocabulary scanned & state initialized" % model)

Doc2Vec(dbow,d250,n5,mc2,t32) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d250,n5,w10,mc2,t32) vocabulary scanned & state initialized
Doc2Vec(dm/c,d128,n5,w5,mc2,t32) vocabulary scanned & state initialized


**Combining both PV-DM and PV-DBOW**<br>
Le and Mikolov note that combining a paragraph vector from Distributed Bag of Words (DBOW) with one from Distributed Memory (DM) improves performance. Since PV-DM has two kinds of mechanisms (averaging and concatenation), we follow their approach and pair PV-DBOW with each PV-DM variant for evaluation.

In [57]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# Pair PV-DBOW (index 0) with each PV-DM variant. The two wrappers are appended
# as models 3 and 4 of doc2vec_models.
base_models = doc2vec_models[:3]
doc2vec_models.extend([
    ConcatenatedDoc2Vec([base_models[0], base_models[1]]),  # dbow+dm_averaging
    ConcatenatedDoc2Vec([base_models[0], base_models[2]]),  # dbow+dm_concatenation
])

# Only the three plain Doc2Vec models are trained; the wrappers are built from
# those same model objects (presumably they only reference them).
for base in base_models:
    base.train(docs, total_examples=len(docs), epochs=base.epochs)

In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

To evaluate which model provides the best features, we run a linear SVM (polynomial and Gaussian kernels give worse results) on the training data and calculate the cross-validation accuracy. As we can see, PV-DBOW (model 0) provides the best accuracy among the three single models. Combining it with PV-DM concatenation (model 4) improves the accuracy a little further. Thus we choose model 4 as the final doc2vec model.

In [59]:
# use only train data to do cross validation
# (the docs were tagged by position in train_X + test_X, so tags 0..num_data-1
# are exactly the training sentences)
num_data = len(train_X)
for i, model in enumerate(doc2vec_models):
    # The lookup index is named `tag` so it does not shadow the model index `i`.
    trainvecs = [model.docvecs[tag] for tag in range(num_data)]
    # cross_val_score clones the estimator and fits it on every fold itself, so a
    # separate fit on the full training set beforehand would only waste time.
    svm = SVC(kernel='linear')
    print('model', i, model)
    print('cross val accu for svm is', np.mean(cross_val_score(svm, trainvecs, train_y, cv=5)))
    print()

model 0 Doc2Vec(dbow,d250,n5,mc2,t32)
cross val accu for svm is 0.5311187690861002

model 1 Doc2Vec("alpha=0.05",dm/m,d250,n5,w10,mc2,t32)
cross val accu for svm is 0.46832225415835627

model 2 Doc2Vec(dm/c,d128,n5,w5,mc2,t32)
cross val accu for svm is 0.33788345693004407

model 3 Doc2Vec(dbow,d250,n5,mc2,t32)+Doc2Vec("alpha=0.05",dm/m,d250,n5,w10,mc2,t32)
cross val accu for svm is 0.5321375425018966

model 4 Doc2Vec(dbow,d250,n5,mc2,t32)+Doc2Vec(dm/c,d128,n5,w5,mc2,t32)
cross val accu for svm is 0.5387265640986316



In [94]:
# Document vectors of the selected model (index 4: PV-DBOW + PV-DM concatenation).
num_train_data = len(train_X)
# Look every vector up exactly once. len(docs) replaces the hard-coded corpus
# size, so the cell keeps working if the dataset changes.
all_docvec = np.array([doc2vec_models[4].docvecs[i] for i in range(len(docs))])
# Documents were tagged in train-then-test order, so a positional slice
# recovers the two splits.
docvec_train = all_docvec[:num_train_data]
docvec_test = all_docvec[num_train_data:]
print(docvec_train.shape)
print(docvec_test.shape)

(3948, 378)
(988, 378)


In [61]:
# Fit a linear SVM on all training vectors and report accuracy on the test split.
svm = SVC(kernel='linear').fit(docvec_train, train_y)
svm.score(docvec_test, test_y)

0.5516194331983806

## Add information of previous sentences
We suspect that previous sentences may help to predict the emotion of the current sentence. We define a variable `time_steps` which determines the length of each time series. We are going to reshape the training samples of shape ```(num_sentences, features)``` to ```(num_sentences/time_steps, time_steps, features)```. Since ```num_sentences``` may not be divisible by ```time_steps```, we copy the first ```needed``` samples of the training set and append them to its end, forming a new training set of shape ```(num_sentences+needed, features)```. After reshaping, the shape is ```((num_sentences+needed)/time_steps, time_steps, features)```.

In [29]:
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation
from keras.layers import LSTM, Input, Flatten, Embedding, Convolution1D,Dropout

Using TensorFlow backend.


In [93]:
# One-hot encode the integer labels by selecting rows of a 4x4 identity matrix.
one_hot_train_y, one_hot_test_y = (np.eye(4)[labels] for labels in (train_y, test_y))

To evaluate whether previous sentences provide useful information for the classification, we first train a three-layer sequential DNN as a baseline. It achieves an accuracy of 61%-62%. However, with `time_steps` set to 16, the LSTM model built below performs better, achieving an accuracy of 67%-68%, which demonstrates that previous sentences contain very useful information.

In [72]:
# dnn model for baseline
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# validation accuracy is not an independent estimate.
validation_data = (docvec_test, one_hot_test_y)
# Feed-forward classifier: Dense(350)/relu -> Dense(256)/tanh -> Dense(4)/softmax,
# with dropout after both hidden activations. Activations stay separate layers
# because a later cell selects the 256-unit Dense layer by its position.
model = Sequential()
model.add(Dense(350))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(docvec_train, one_hot_train_y, validation_data=validation_data,
                 batch_size=100, epochs=20, verbose=1)



Train on 3948 samples, validate on 988 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [73]:
# List each layer of the doc2vec DNN with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_57 (Dense)             (None, 350)               132650    
_________________________________________________________________
activation_57 (Activation)   (None, 350)               0         
_________________________________________________________________
dropout_26 (Dropout)         (None, 350)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 256)               89856     
_________________________________________________________________
activation_58 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_27 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 4)                 1028      
__________

In [83]:
from keras import backend as K
# Use the 256-unit Dense layer as the feature extractor. Counting from the end of
# the DNN, layers[-5] is that Dense layer, i.e. its output before the tanh Activation.
feature_layer = model.layers[-5]
print(feature_layer.name)
# Backend function mapping (model input, learning-phase flag) -> feature-layer output.
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive here.
# Features are computed for every sentence (train and test).
out_features = func([all_docvec, 0])[0]
out_features.shape

dense_58


(4936, 256)

In [84]:
# Export the features as plain text: one sentence per line, values separated by
# single spaces.
with open('./text_features_doc2vec(dnn_accu_62%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

In [91]:
def make_lstm_samples(train_X, train_y, test_X, test_y, time_steps, num_classes=4):
    ''' Group consecutive sentences into fixed-length sequences for a sequence model.

    Samples of shape (num_sentences, features) are reshaped to
    (num_sentences/time_steps, time_steps, features). Since num_sentences may not be
    divisible by time_steps, the `needed` missing samples are copied from the beginning
    of the set and appended to its end, giving (num_sentences+needed, features) before
    the reshape. If `needed` exceeds the number of available samples, the copying wraps
    around and starts from the beginning again. Labels are one-hot encoded and padded
    and reshaped the same way. Train and test sets are processed independently.

    input:
        train_X: ndarray of shape (None, features)
        train_y: integer class ids, one per row of train_X
        test_X: ndarray of shape (None, features)
        test_y: integer class ids, one per row of test_X
        time_steps: int >= 1, number of consecutive sentences per sequence
        num_classes: int, width of the one-hot label encoding (default 4)
    output:
        (train_X_reshaped, train_y_reshaped, test_X_reshaped, test_y_reshaped) with shapes
        (-1, time_steps, features) for X and (-1, time_steps, num_classes) for y.
        Padded entries always sit at the end, so flattening the first two axes and
        keeping the first num_sentences rows restores the original order.
    raises:
        ValueError: if time_steps is smaller than 1.
    '''
    if time_steps < 1:
        raise ValueError('time_steps must be a positive integer')

    features = np.asarray(train_X).shape[1]
    identity = np.eye(num_classes)

    def _pad_and_fold(X, y):
        # Pad one (X, y) pair up to a multiple of time_steps and fold it to 3-D.
        X = np.asarray(X)
        num_sentences = len(X)
        needed = -num_sentences % time_steps
        # Indices of the samples to duplicate: 0, 1, 2, ... wrapping around when more
        # padding is needed than samples exist. Equals X[:needed] in the usual case.
        if needed:
            wrap = np.arange(needed) % num_sentences
        else:
            wrap = np.arange(0)
        X_seq = np.concatenate((X, X[wrap]), axis=0).reshape(-1, time_steps, features)
        one_hot = identity[y]
        y_seq = np.concatenate((one_hot, one_hot[wrap]), axis=0).reshape(-1, time_steps, num_classes)
        return X_seq, y_seq

    train_X_reshaped, train_y_reshaped = _pad_and_fold(train_X, train_y)
    test_X_reshaped, test_y_reshaped = _pad_and_fold(test_X, test_y)
    return train_X_reshaped, train_y_reshaped, test_X_reshaped, test_y_reshaped

In [95]:
# Sequences of 16 consecutive sentences; each time step carries one doc2vec vector.
time_steps = 16
features = docvec_train.shape[1]
docvec_X_3d, docvec_y_3d, docvec_test_X_3d, docvec_test_y_3d = make_lstm_samples(docvec_train, train_y, docvec_test, test_y, time_steps)
# NOTE(review): the test split doubles as validation data here.
validation_data = (docvec_test_X_3d, docvec_test_y_3d)
print(docvec_X_3d.shape)

# The LSTM returns its full output sequence, so the Dense stack that follows
# produces one prediction per sentence (time step) rather than one per sequence.
model = Sequential()
# `dropout` / `recurrent_dropout` are the Keras 2 names of the deprecated
# `dropout_W` / `dropout_U` arguments.
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, input_shape=(time_steps, features), return_sequences=True))
model.add(Dense(350))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(docvec_X_3d, docvec_y_3d, validation_data=validation_data,
                 batch_size=100, epochs=20, verbose=1)

(247, 16, 378)




Train on 247 samples, validate on 62 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [96]:
# List each layer of the doc2vec LSTM model with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 16, 256)           650240    
_________________________________________________________________
dense_63 (Dense)             (None, 16, 350)           89950     
_________________________________________________________________
activation_63 (Activation)   (None, 16, 350)           0         
_________________________________________________________________
dropout_29 (Dropout)         (None, 16, 350)           0         
_________________________________________________________________
dense_64 (Dense)             (None, 16, 256)           89856     
_________________________________________________________________
activation_64 (Activation)   (None, 16, 256)           0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 16, 256)           0         
__________

In [97]:
pred_ = model.predict(docvec_test_X_3d)
# Flatten (sequences, time_steps, 4) back to one row per sentence. The padding
# sentences added by make_lstm_samples sit at the end, so keeping the first
# len(test_y) rows restores exactly the original test set.
pred = pred_.reshape(-1, 4)[:len(test_y)]
label = np.argmax(pred, axis=1)
test_acc = np.mean(np.equal(label, test_y))
test_acc

0.6811740890688259

In [100]:
def reshape_to_3d(data, time_steps):
    '''Fold a (num_samples, features) array into (-1, time_steps, features).

    When num_samples is not divisible by time_steps, the missing rows are copied
    from the beginning of `data` and appended to its end. If more rows are missing
    than `data` contains, the copying wraps around. The padded rows are always
    last, so `result.reshape(-1, features)[:num_samples]` recovers `data`.

    input:
        data: ndarray of shape (num_samples, features)
        time_steps: int >= 1, number of consecutive rows per sequence
    output:
        ndarray of shape (ceil(num_samples / time_steps), time_steps, features)
    raises:
        ValueError: if time_steps is smaller than 1.
    '''
    if time_steps < 1:
        raise ValueError('time_steps must be a positive integer')

    data = np.asarray(data)
    num_samples, features = len(data), data.shape[1]
    needed = -num_samples % time_steps
    # Indices of the rows to duplicate; identical to data[:needed] unless more
    # padding is required than rows exist, in which case they cycle.
    if needed:
        wrap = np.arange(needed) % num_samples
    else:
        wrap = np.arange(0)
    return np.concatenate((data, data[wrap]), axis=0).reshape(-1, time_steps, features)

In [103]:
# Use the 256-unit Dense layer of the LSTM model as the feature extractor.
# Counting from the end, layers[-5] is that Dense layer (before its tanh Activation).
feature_layer = model.layers[-5]
print(feature_layer.name)
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# NOTE(review): sequences are cut from the combined train+test array here, so the
# 16-sentence windows are aligned differently from the ones used during training
# (the train size is not a multiple of time_steps) - confirm this is intended.
all_docvec_reshaped = reshape_to_3d(all_docvec, time_steps)
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive.
out_features = func([all_docvec_reshaped, 0])[0]
# Back to one row per sentence; the feature width is read from the output itself.
out_features = out_features.reshape(-1, out_features.shape[-1])
# Drop the padding rows that reshape_to_3d appended at the end.
out_features = out_features[:len(all_docvec)]

dense_64


In [105]:
# Export the LSTM features as plain text: one sentence per line, values
# separated by single spaces.
with open('./text_features_doc2vec(lstm_accu_67%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

## Skip thought vector for extracting text feature
The feature extraction above, which uses an LSTM to add information from previous sentences to the current sentence, is a supervised learning process. We suspect that through unsupervised learning we may also be able to add information from previous sentences, resulting in better performance when combined with the supervised LSTM.<br>

In the paper [Skip-Thought Vectors by Kiros et al.](https://arxiv.org/abs/1506.06726), the authors describe an approach that trains an encoder-decoder model to reconstruct the surrounding sentences of an encoded passage. Sentences that share semantic and syntactic properties are thus mapped to similar vector representations.

<img src='http://sanyam5.github.io/images/skip-thoughts/skip-overview.png'>

**Skip-Thoughts model has three parts:**

**Encoder Network**: Takes the sentence x(i) at index i and generates a fixed length representation z(i). This is a recurrent network (generally GRU or LSTM) that takes the words in a sentence sequentially.

**Previous Decoder Network**: Takes the embedding z(i) and “tries” to generate the sentence x(i-1). This also is a recurrent network (generally GRU or LSTM) that generates the sentence sequentially.

**Next Decoder Network**: Takes the embedding z(i) and “tries” to generate the sentence x(i+1). Again a recurrent network similar to the Previous Decoder Network.

The end product of Skip-Thoughts is the Encoder. The Decoders are thrown away after training.

To generate the skip-thought vectors, we follow the approach released by [ryankiros](https://github.com/ryankiros/skip-thoughts). Since it only supports Python 2.7, we generate the skip-thought vectors in another notebook and load them here.

In [177]:
# Load the precomputed skip-thought sentence vectors from disk.
st_vec = np.load('./skip-thought-4800.npy')
st_vec.shape

(4936, 4800)

In [178]:
# Same positional split as for the text data: first 3948 rows train, rest test.
num_train_samples = 3948
st_train, st_test = st_vec[:num_train_samples], st_vec[num_train_samples:]
st_train.shape

(3948, 4800)

In [181]:
# dnn model for baseline
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# validation accuracy is not an independent estimate.
validation_data = (st_test, one_hot_test_y)
# Feed-forward classifier: Dense(600)/relu -> Dense(256)/tanh -> Dense(4)/softmax,
# with dropout after both hidden activations. Activations stay separate layers
# because a later cell selects the 256-unit Dense layer by its position.
model = Sequential()
model.add(Dense(600))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(st_train, one_hot_train_y, validation_data=validation_data,
                 batch_size=200, epochs=10, verbose=1)



Train on 3948 samples, validate on 988 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [182]:
# List each layer of the skip-thought DNN with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_198 (Dense)            (None, 600)               2880600   
_________________________________________________________________
activation_168 (Activation)  (None, 600)               0         
_________________________________________________________________
dropout_83 (Dropout)         (None, 600)               0         
_________________________________________________________________
dense_199 (Dense)            (None, 256)               153856    
_________________________________________________________________
activation_169 (Activation)  (None, 256)               0         
_________________________________________________________________
dropout_84 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_200 (Dense)            (None, 4)                 1028      
__________

In [184]:
# Use the 256-unit Dense layer as the feature extractor. Counting from the end of
# the DNN, layers[-5] is that Dense layer, i.e. its output before the tanh Activation.
feature_layer = model.layers[-5]
print(feature_layer.name)
# Backend function mapping (model input, learning-phase flag) -> feature-layer output.
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive here.
# Features are computed for every sentence (train and test).
out_features = func([st_vec, 0])[0]
out_features.shape

dense_199


(4936, 256)

In [185]:
# Export the features as plain text: one sentence per line, values separated by
# single spaces.
with open('./text_features_skip-thought(dnn_accu_62%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

In [193]:
# Sequences of 16 consecutive sentences; each time step carries one skip-thought vector.
time_steps = 16
features = st_train.shape[1]
st_train_X_3d, st_train_y_3d, st_test_X_3d, st_test_y_3d = make_lstm_samples(st_train, train_y, st_test, test_y, time_steps)
# NOTE(review): the test split doubles as validation data here.
validation_data = (st_test_X_3d, st_test_y_3d)
print(st_train_X_3d.shape)

# The LSTM returns its full output sequence, so the Dense stack that follows
# produces one prediction per sentence (time step) rather than one per sequence.
model = Sequential()
# `dropout` / `recurrent_dropout` are the Keras 2 names of the deprecated
# `dropout_W` / `dropout_U` arguments.
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, input_shape=(time_steps, features), return_sequences=True))
model.add(Dense(2000))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(st_train_X_3d, st_train_y_3d, validation_data=validation_data,
                 batch_size=100, epochs=25, verbose=1)

(247, 16, 4800)




Train on 247 samples, validate on 62 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [195]:
# List each layer of the skip-thought LSTM model with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_18 (LSTM)               (None, 16, 256)           5178368   
_________________________________________________________________
dense_222 (Dense)            (None, 16, 2000)          514000    
_________________________________________________________________
activation_192 (Activation)  (None, 16, 2000)          0         
_________________________________________________________________
dropout_99 (Dropout)         (None, 16, 2000)          0         
_________________________________________________________________
dense_223 (Dense)            (None, 16, 256)           512256    
_________________________________________________________________
activation_193 (Activation)  (None, 16, 256)           0         
_________________________________________________________________
dropout_100 (Dropout)        (None, 16, 256)           0         
__________

In [194]:
# Since samples were appended to test_X to make it divisible by time_steps, the
# reshaped test set holds more sentences than the original one. The extra
# sentences sit at the end, so only the first len(test_y) predictions are kept.
pred_ = model.predict(st_test_X_3d)
pred = pred_.reshape(-1, 4)[:len(test_y)]
label = np.argmax(pred, axis=1)
test_acc = np.mean(np.equal(label, test_y))
test_acc

0.6993927125506073

In [198]:
# Use the 256-unit Dense layer of the LSTM model as the feature extractor.
# Counting from the end, layers[-5] is that Dense layer (before its tanh Activation).
feature_layer = model.layers[-5]
print(feature_layer.name)
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# NOTE(review): sequences are cut from the combined train+test array here, so the
# 16-sentence windows are aligned differently from the ones used during training
# (the train size is not a multiple of time_steps) - confirm this is intended.
all_st_vec_reshaped = reshape_to_3d(st_vec, time_steps)
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive.
out_features = func([all_st_vec_reshaped, 0])[0]
# Back to one row per sentence; the feature width is read from the output itself.
out_features = out_features.reshape(-1, out_features.shape[-1])
# Drop the padding rows that reshape_to_3d appended at the end.
out_features = out_features[:len(st_vec)]
out_features.shape

dense_223


(4936, 256)

In [199]:
# Export the LSTM features as plain text: one sentence per line, values
# separated by single spaces.
with open('./text_features_skip-thought(lstm_accu_69%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

## Sent2vec for extracting text feature
In the paper [Unsupervised Learning of Sentence Embeddings using Compositional N-Gram Features](https://arxiv.org/abs/1703.02507), the authors introduce a new model for sentence embeddings called Sent2Vec. It can be thought of as an extension of FastText and word2vec (CBOW) to sentences. The sentence embedding is defined as the average of the source word embeddings of its constituent words. The model is further augmented by learning source embeddings not only for unigrams but also for n-grams of words present in each sentence, and averaging the n-gram embeddings along with the word embeddings. Since the paper shows it to outperform doc2vec in many situations, I plan to use it in this task.<br>

To generate the sent2vec vectors, we follow the approach released by the original author Matteo Pagliardini (https://github.com/epfml/sent2vec). We use the pretrained model [sent2vec_wiki_unigrams](https://drive.google.com/open?id=0B6VhzidiLvjSa19uYWlLUEkzX3c) (5GB, 600 dimensions, trained on English Wikipedia) released by them to generate our vectors. 


In [227]:
# Load the precomputed sent2vec sentence vectors from disk.
sentvec = np.load('sentvec-600.npy')
sentvec.shape

(4936, 600)

In [228]:
# Same positional split as for the text data: first 3948 rows train, rest test.
num_train_samples = 3948
sentvec_train, sentvec_test = sentvec[:num_train_samples], sentvec[num_train_samples:]
sentvec_train.shape

(3948, 600)

In [230]:
# dnn model for baseline
# NOTE(review): the test split doubles as validation data here, so the per-epoch
# validation accuracy is not an independent estimate.
validation_data = (sentvec_test, one_hot_test_y)
# Feed-forward classifier: Dense(400)/relu -> Dense(256)/tanh -> Dense(4)/softmax,
# with dropout after both hidden activations. Activations stay separate layers
# because a later cell selects the 256-unit Dense layer by its position.
model = Sequential()
model.add(Dense(400))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(sentvec_train, one_hot_train_y, validation_data=validation_data,
                 batch_size=200, epochs=20, verbose=1)



Train on 3948 samples, validate on 988 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [231]:
# List each layer of the sent2vec DNN with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_234 (Dense)            (None, 400)               240400    
_________________________________________________________________
activation_204 (Activation)  (None, 400)               0         
_________________________________________________________________
dropout_107 (Dropout)        (None, 400)               0         
_________________________________________________________________
dense_235 (Dense)            (None, 256)               102656    
_________________________________________________________________
activation_205 (Activation)  (None, 256)               0         
_________________________________________________________________
dropout_108 (Dropout)        (None, 256)               0         
_________________________________________________________________
dense_236 (Dense)            (None, 4)                 1028      
__________

In [232]:
# Use the 256-unit Dense layer as the feature extractor. Counting from the end of
# the DNN, layers[-5] is that Dense layer, i.e. its output before the tanh Activation.
feature_layer = model.layers[-5]
print(feature_layer.name)
# Backend function mapping (model input, learning-phase flag) -> feature-layer output.
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive here.
# Features are computed for every sentence (train and test).
out_features = func([sentvec, 0])[0]
out_features.shape

dense_235


(4936, 256)

In [234]:
# Export the features as plain text: one sentence per line, values separated by
# single spaces.
with open('./text_features_sent2vec(dnn_accu_64%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

In [249]:
# Sequences of 16 consecutive sentences; each time step carries one sent2vec vector.
time_steps = 16
features = sentvec_train.shape[1]
sentvec_train_X_3d, sentvec_train_y_3d, sentvec_test_X_3d, sentvec_test_y_3d = make_lstm_samples(sentvec_train, train_y, sentvec_test, test_y, time_steps)
# NOTE(review): the test split doubles as validation data here.
validation_data = (sentvec_test_X_3d, sentvec_test_y_3d)
print(sentvec_train_X_3d.shape)

# The LSTM returns its full output sequence, so the Dense stack that follows
# produces one prediction per sentence (time step) rather than one per sequence.
model = Sequential()
# `dropout` / `recurrent_dropout` are the Keras 2 names of the deprecated
# `dropout_W` / `dropout_U` arguments.
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, input_shape=(time_steps, features), return_sequences=True))
model.add(Dense(350))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(256))
model.add(Activation('tanh'))
model.add(Dropout(0.2))
model.add(Dense(4))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam' ,metrics=['acc'])

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
hist = model.fit(sentvec_train_X_3d, sentvec_train_y_3d, validation_data=validation_data,
                 batch_size=200, epochs=40, verbose=1)

(247, 16, 600)




Train on 247 samples, validate on 62 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [242]:
# List each layer of the sent2vec LSTM model with its output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_25 (LSTM)               (None, 16, 256)           877568    
_________________________________________________________________
dense_255 (Dense)            (None, 16, 350)           89950     
_________________________________________________________________
activation_225 (Activation)  (None, 16, 350)           0         
_________________________________________________________________
dropout_121 (Dropout)        (None, 16, 350)           0         
_________________________________________________________________
dense_256 (Dense)            (None, 16, 256)           89856     
_________________________________________________________________
activation_226 (Activation)  (None, 16, 256)           0         
_________________________________________________________________
dropout_122 (Dropout)        (None, 16, 256)           0         
__________

In [250]:
# Since samples were appended to test_X to make it divisible by time_steps, the
# reshaped test set holds more sentences than the original one. The extra
# sentences sit at the end, so only the first len(test_y) predictions are kept.
pred_ = model.predict(sentvec_test_X_3d)
pred = pred_.reshape(-1, 4)[:len(test_y)]
label = np.argmax(pred, axis=1)
test_acc = np.mean(np.equal(label, test_y))
test_acc

0.7095141700404858

In [251]:
# Use the 256-unit Dense layer of the LSTM model as the feature extractor.
# Counting from the end, layers[-5] is that Dense layer (before its tanh Activation).
feature_layer = model.layers[-5]
print(feature_layer.name)
func = K.function([model.input, K.learning_phase()], [feature_layer.output])
# NOTE(review): sequences are cut from the combined train+test array here, so the
# 16-sentence windows are aligned differently from the ones used during training
# (the train size is not a multiple of time_steps) - confirm this is intended.
all_sentvec_reshaped = reshape_to_3d(sentvec, time_steps)
# Learning phase 0 is Keras' test/inference mode, so dropout is inactive.
out_features = func([all_sentvec_reshaped, 0])[0]
# Back to one row per sentence; the feature width is read from the output itself.
out_features = out_features.reshape(-1, out_features.shape[-1])
# Drop the padding rows that reshape_to_3d appended at the end.
out_features = out_features[:len(sentvec)]
out_features.shape

dense_274


(4936, 256)

In [252]:
# Export the LSTM features as plain text: one sentence per line, values
# separated by single spaces.
with open('./text_features_sent2vec(lstm_accu_70%)_256.txt', 'w') as f:
    for vec in out_features:
        f.write(' '.join(map(str, vec)) + '\n')

In [260]:
# Sanity check: the exported doc2vec/LSTM feature file should contain one line
# per sentence. The name must match the file written by the doc2vec LSTM export
# cell ('lstm_accu_67%'); the previous 'lstm_accu_68%' name is never written by
# this notebook and fails on a fresh run.
with open('./text_features_doc2vec(lstm_accu_67%)_256.txt', 'r') as f:
    data = f.readlines()
    print(len(data))

4936
