In [1]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Load pretrained word vectors stored in the binary word2vec format.
# The file name suggests the GoogleNews 300-dimensional vectors (not verified here).
word2vec_path = 'word2vec/GoogleNews-vectors-negative300.bin'
model = word2vec.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [4]:
# Both data sets are read as tab-separated files.
train, test = (
    pd.read_csv(path, sep='\t')
    for path in ('data/train.data', 'data/test.data')
)

In [5]:
# Split every sentence of the `Text` column into word tokens with NLTK and
# store the resulting token lists in a new `Tokens` column.
train['Tokens'] = list(map(nltk.word_tokenize, train.Text))
test['Tokens'] = list(map(nltk.word_tokenize, test.Text))

In [6]:
def get_features(tokens, vectors=None):
    """Look up the embedding of every token that has one.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens of a single sentence.
    vectors : object with a ``word_vec(word)`` method, optional
        Lookup table to query. Defaults to the notebook-level ``model``
        (the word2vec vectors). NOTE: the training cell later rebinds
        ``model`` to the Keras network, so the default is only valid
        before that cell has run.

    Returns
    -------
    list
        One vector per in-vocabulary token, in token order. Tokens missing
        from the vocabulary are skipped; the list is empty when no token is
        known.
    """
    if vectors is None:
        vectors = model
    out = []
    for word in tokens:
        try:
            out.append(vectors.word_vec(word))
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other exception is a real
            # problem and must not be silently swallowed.
            continue
    return out

def sum_word_vectors(tokens):
    """Return the element-wise sum of the word vectors of ``tokens``.

    When none of the tokens is in the vocabulary ``get_features`` returns an
    empty list, and ``np.sum([], axis=0)`` would yield the scalar ``0.0``.
    Mixing that scalar with real vectors makes ``np.array(...)`` of the
    feature list ragged, so a zero vector of the embedding size is returned
    instead.
    """
    vectors = get_features(tokens)
    if not vectors:
        # float32 matches gensim's default datatype for loaded vectors.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.sum(vectors, axis=0)


train_features = [sum_word_vectors(sentence) for sentence in train.Tokens]
test_features = [sum_word_vectors(sentence) for sentence in test.Tokens]

In [7]:
# Stack the per-sentence feature vectors into 2-D arrays.
X_train = np.array(train_features)
X_test = np.array(test_features)
# One-hot encode the `Sentiment` labels into 6 classes.
y_train = np_utils.to_categorical(train.Sentiment, 6)

In [None]:
def _dense_block(tensor, units, dropout_rate):
    """Apply Dense(relu) -> BatchNormalization -> Dropout to ``tensor``.

    Returning the dropout output guarantees that the regularised tensor is
    what the next layer consumes. The hand-wired version fed the
    batch-normalised tensor straight into the second Dense layer, so the
    first Dropout was created but never applied.
    """
    hidden = Dense(units, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(tensor)
    normed = BatchNormalization(axis=1)(hidden)
    return Dropout(dropout_rate)(normed)


# Input: one 300-dimensional feature vector per sentence.
inp = Input(shape=(300,))
inp_norm = BatchNormalization(axis=1)(inp)

# Three branches with identical architecture (each with its own weights)
# share the normalised input; their softmax outputs are averaged below.
outs = []
for i in range(3):
    drop = _dense_block(inp_norm, 200, 0.25)
    drop = _dense_block(drop, 100, 0.25)
    drop = _dense_block(drop, 20, 0.5)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

# Average the per-branch class probabilities.
out = merge(outs, mode='ave')

In [None]:
# NOTE: this rebinds `model`. From here on it is the Keras network, not the
# word2vec vectors loaded at the top of the notebook, so `get_features` must
# not be called again without reloading them.
model = Model(input=inp, output=out)
model.compile(optimizer='sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Stop once the validation loss has not improved for 100 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=100)
model.fit(
    X_train,
    y_train,
    batch_size=15000,
    nb_epoch=2050,
    validation_split=0.1,
    verbose=2,
    callbacks=[early_stopping]
)

Train on 92289 samples, validate on 10255 samples
Epoch 1/3500
7s - loss: 2.2503 - acc: 0.1703 - val_loss: 2.6308 - val_acc: 0.1697
Epoch 2/3500
2s - loss: 2.2250 - acc: 0.1793 - val_loss: 2.4077 - val_acc: 0.1659
Epoch 3/3500
2s - loss: 2.2077 - acc: 0.1867 - val_loss: 2.2750 - val_acc: 0.1600
Epoch 4/3500
2s - loss: 2.1865 - acc: 0.1899 - val_loss: 2.1866 - val_acc: 0.1687
Epoch 5/3500
2s - loss: 2.1725 - acc: 0.1994 - val_loss: 2.1234 - val_acc: 0.1836
Epoch 6/3500
2s - loss: 2.1479 - acc: 0.2052 - val_loss: 2.0747 - val_acc: 0.2020
Epoch 7/3500
2s - loss: 2.1333 - acc: 0.2107 - val_loss: 2.0361 - val_acc: 0.2169
Epoch 8/3500
2s - loss: 2.1189 - acc: 0.2169 - val_loss: 2.0055 - val_acc: 0.2297
Epoch 9/3500
2s - loss: 2.1026 - acc: 0.2229 - val_loss: 1.9806 - val_acc: 0.2423
Epoch 10/3500
2s - loss: 2.0948 - acc: 0.2271 - val_loss: 1.9588 - val_acc: 0.2521
Epoch 11/3500
2s - loss: 2.0809 - acc: 0.2336 - val_loss: 1.9394 - val_acc: 0.2623
Epoch 12/3500
2s - loss: 2.0691 - acc: 0.2361 

In [None]:
# `model` is the trained Keras network here; the softmax output layer yields
# one row of 6 class probabilities per test sentence.
test_probabilities = model.predict(X_test)
test_probabilities