In [9]:
import os
os.environ['OMP_NUM_THREADS'] = '2'

In [10]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

In [11]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [12]:
model = word2vec.KeyedVectors.load_word2vec_format('word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
train = pd.read_csv('data/train.data', sep='\t')
test = pd.read_csv('data/test.data', sep='\t')

In [None]:
train['Tokens'] = [nltk.word_tokenize(sentence) for sentence in train.Text]
test['Tokens'] = [nltk.word_tokenize(sentence) for sentence in test.Text]

In [None]:
def get_features(tokens):
    out = []
    for word in tokens:
        try:
            out.append(model.word_vec(word))
        except:
            pass
    return out

train_features = [np.sum(get_features(sentence), axis=0) for sentence in train.Tokens]
test_features = [np.sum(get_features(sentence), axis=0) for sentence in test.Tokens]

In [None]:
X_train, X_test = np.array(train_features), np.array(test_features)
y_train = np_utils.to_categorical(train.Sentiment, 6)

In [8]:
inp = Input(shape=(300,))
inp_norm = BatchNormalization(axis=1)(inp)

outs = []
for i in range(3):
    hidden = Dense(200, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(inp_norm)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(100, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(batch)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(20, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(drop)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.5)(batch)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

out = merge(outs, mode='ave')

In [9]:
model = Model(input=inp, output=out)
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(X_train, y_train,
          batch_size=15000, nb_epoch=2050,
          verbose=2, validation_split=0.1,

          callbacks=[EarlyStopping(monitor='val_loss', patience=100)])

Train on 92289 samples, validate on 10255 samples
Epoch 1/2050
6s - loss: 2.2899 - acc: 0.1483 - val_loss: 4.1181 - val_acc: 0.2751
Epoch 2/2050
2s - loss: 2.2734 - acc: 0.1514 - val_loss: 3.2269 - val_acc: 0.2666
Epoch 3/2050
2s - loss: 2.2628 - acc: 0.1564 - val_loss: 2.7932 - val_acc: 0.2522
Epoch 4/2050
2s - loss: 2.2485 - acc: 0.1620 - val_loss: 2.5423 - val_acc: 0.2356
Epoch 5/2050
2s - loss: 2.2309 - acc: 0.1657 - val_loss: 2.3889 - val_acc: 0.2198
Epoch 6/2050
2s - loss: 2.2167 - acc: 0.1698 - val_loss: 2.2859 - val_acc: 0.2098
Epoch 7/2050
2s - loss: 2.2083 - acc: 0.1722 - val_loss: 2.2137 - val_acc: 0.2027
Epoch 8/2050
2s - loss: 2.1924 - acc: 0.1767 - val_loss: 2.1607 - val_acc: 0.1965
Epoch 9/2050
2s - loss: 2.1837 - acc: 0.1816 - val_loss: 2.1224 - val_acc: 0.1996
Epoch 10/2050
2s - loss: 2.1710 - acc: 0.1871 - val_loss: 2.0919 - val_acc: 0.2060
Epoch 11/2050
2s - loss: 2.1570 - acc: 0.1924 - val_loss: 2.0663 - val_acc: 0.2114
Epoch 12/2050
2s - loss: 2.1503 - acc: 0.1927 

<keras.callbacks.History at 0x7f489a9038d0>

In [18]:
pred_test = [list(x).index(max(x)) for x in model.predict(X_test)]
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          3
2   2          5
3   3          5
4   4          4
