In [1]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [23]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [21]:
# Load word vectors stored in the binary word2vec format.
# NOTE(review): the name `model` is re-bound to the Keras network in a later
# cell, while get_features() keeps reading the global `model`.
WORD2VEC_PATH = 'word2vec/GoogleNews-vectors-negative300.bin'
model = word2vec.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [22]:
def _read_tsv(path):
    """Read the tab-separated file at `path` into a pandas DataFrame."""
    return pd.read_csv(path, sep='\t')


train = _read_tsv('data/train.data')
test = _read_tsv('data/test.data')

In [39]:
def _alnum_tokens(sentence):
    """Tokenize `sentence` with NLTK and keep only the tokens for which
    `str.isalnum()` is true (drops punctuation and mixed tokens)."""
    return [token for token in nltk.word_tokenize(sentence) if token.isalnum()]


# Store one token list per row in a new 'Tokens' column of each frame.
train['Tokens'] = [_alnum_tokens(text) for text in train.Text]
test['Tokens'] = [_alnum_tokens(text) for text in test.Text]

In [46]:
def get_features(tokens):
    """Collect the word vectors of all tokens known to the word2vec model.

    Parameters
    ----------
    tokens : iterable of str
        The words of one sentence.

    Returns
    -------
    list
        One vector per token found in the vocabulary, in input order.
        Tokens that are not in the vocabulary are skipped, so the list may
        be shorter than `tokens` or empty.

    Notes
    -----
    Reads the module-level `model`, which must provide `word_vec(word)`.
    Only `KeyError` (the lookup failure for an unknown word) is swallowed;
    any other error propagates. In particular, if `model` has been re-bound
    to an object without `word_vec`, this now fails loudly instead of
    silently returning an empty list for every sentence.
    """
    vectors = []
    for word in tokens:
        try:
            vectors.append(model.word_vec(word))
        except KeyError:
            # Out-of-vocabulary word: deliberately ignored (best effort).
            continue
    return vectors

def _sentence_vector(tokens):
    """Sum the word vectors of `tokens` into a single sentence vector.

    When none of the tokens is in the vocabulary (or `tokens` is empty),
    `np.sum([], axis=0)` would return the scalar 0.0 and make the stacked
    feature matrix ragged. In that case return an all-zero vector of length
    `model.vector_size` so every sentence has the same shape.
    """
    vectors = get_features(tokens)
    if not vectors:
        # float32 assumed to match the loaded vectors (the default datatype
        # of load_word2vec_format) -- TODO confirm.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.sum(vectors, axis=0)


train_features = [_sentence_vector(sentence) for sentence in train.Tokens]
test_features = [_sentence_vector(sentence) for sentence in test.Tokens]

In [55]:
# Convert the per-sentence feature lists into NumPy arrays.
X_train = np.array(train_features)
X_test = np.array(test_features)

# One-hot encode the Sentiment column into 6 classes for the softmax output.
y_train = np_utils.to_categorical(train.Sentiment, 6)

In [61]:
def _dense_block(tensor, units, dropout_rate):
    """Apply Dense(relu, he_uniform, L2=1e-4) -> BatchNormalization -> Dropout.

    Parameters
    ----------
    tensor : Keras tensor fed into the Dense layer.
    units : int, width of the Dense layer.
    dropout_rate : float, rate passed to Dropout.

    Returns
    -------
    The Keras tensor produced by the Dropout layer.
    """
    hidden = Dense(units, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(tensor)
    normed = BatchNormalization(axis=1)(hidden)
    return Dropout(dropout_rate)(normed)


inp = Input(shape=(300,))
inp_norm = BatchNormalization(axis=1)(inp)

# Three branches with identical architecture (200 -> 100 -> 20 -> 6) share the
# normalized input; their softmax outputs are averaged below.
outs = []
for i in range(3):
    drop = _dense_block(inp_norm, 200, 0.25)
    # Fix: the second stage now consumes the dropped-out tensor. It used to
    # read the pre-dropout batch-norm output, leaving the first Dropout unused.
    drop = _dense_block(drop, 100, 0.25)
    drop = _dense_block(drop, 20, 0.5)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

# Element-wise average of the three branch outputs.
out = merge(outs, mode='ave')

In [62]:
# NOTE(review): this re-binds `model`, which an earlier cell uses for the
# word2vec vectors. get_features() looks up `model.word_vec`, so the feature
# cells must not be re-run after this one without reloading the vectors.
model = Model(input=inp, output=out)
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Training stops once val_loss has not improved for 100 epochs; nb_epoch is
# only an upper bound. The last 10% of the data is held out for validation.
early_stopping = EarlyStopping(monitor='val_loss', patience=100)
model.fit(
    X_train, y_train,
    batch_size=10000,
    nb_epoch=20000,
    validation_split=0.1,
    verbose=2,
    callbacks=[early_stopping],
)

Train on 92289 samples, validate on 10255 samples
Epoch 1/20000
6s - loss: 2.2390 - acc: 0.1710 - val_loss: 2.1278 - val_acc: 0.1959
Epoch 2/20000
2s - loss: 2.2173 - acc: 0.1780 - val_loss: 2.0302 - val_acc: 0.2033
Epoch 3/20000
2s - loss: 2.1946 - acc: 0.1847 - val_loss: 2.0004 - val_acc: 0.2118
Epoch 4/20000
2s - loss: 2.1771 - acc: 0.1887 - val_loss: 1.9945 - val_acc: 0.2149
Epoch 5/20000
2s - loss: 2.1602 - acc: 0.1976 - val_loss: 1.9940 - val_acc: 0.2092
Epoch 6/20000
2s - loss: 2.1455 - acc: 0.2015 - val_loss: 1.9920 - val_acc: 0.2096
Epoch 7/20000
2s - loss: 2.1324 - acc: 0.2080 - val_loss: 1.9867 - val_acc: 0.2134
Epoch 8/20000
2s - loss: 2.1105 - acc: 0.2160 - val_loss: 1.9782 - val_acc: 0.2242
Epoch 9/20000
2s - loss: 2.1003 - acc: 0.2188 - val_loss: 1.9667 - val_acc: 0.2373
Epoch 10/20000
2s - loss: 2.0816 - acc: 0.2264 - val_loss: 1.9543 - val_acc: 0.2523
Epoch 11/20000
2s - loss: 2.0660 - acc: 0.2336 - val_loss: 1.9414 - val_acc: 0.2652
Epoch 12/20000
2s - loss: 2.0538 - 

<keras.callbacks.History at 0x7fb727401eb8>

In [None]:
# Run the trained network on the test features; each output row is the
# averaged 6-way softmax of the three branches.
model.predict(X_test)