In [11]:
import warnings
warnings.simplefilter('ignore')

from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

In [3]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [5]:
# Load the pre-trained word2vec vectors stored in binary format.
# NOTE: the name `model` is rebound to the Keras network further down; this cell
# must be re-run before `get_features` is used again after that point.
model = word2vec.KeyedVectors.load_word2vec_format(
    '../word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [7]:
# Read the train and test tables; both are expected to expose a `Text` column
# (used for feature extraction below).
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/spacy_train.csv', '../data/spacy_test.csv')
]

In [8]:
def get_features(tokens):
    """Look up the word2vec embedding of every token that is in the vocabulary.

    Parameters
    ----------
    tokens : iterable of str
        Words of a single sentence.

    Returns
    -------
    list
        One embedding per known token, in input order. Tokens missing from
        the vocabulary are skipped, so the list may be shorter than `tokens`
        (or empty).

    Notes
    -----
    Reads the module-level `model`, which must be the loaded word2vec
    vectors. Only the out-of-vocabulary ``KeyError`` is ignored; any other
    failure (e.g. `model` having been rebound to a different object)
    propagates instead of silently yielding empty features.
    """
    out = []
    for word in tokens:
        try:
            out.append(model.word_vec(word))
        except KeyError:
            # Word is not in the embedding vocabulary: skip it.
            continue
    return out

def _sentence_vector(sentence, dim=300):
    """Return the sum of the word vectors of `sentence` as one fixed-size vector.

    `dim` must match the embedding size (300, the same size the network's
    input layer is declared with). When the sentence is not a string (e.g. a
    NaN from an empty CSV field) or none of its tokens is in the vocabulary,
    a zero vector of length `dim` is returned. Without this guard
    `np.sum([], axis=0)` would produce the scalar 0.0 and make the stacked
    feature matrix ragged.
    """
    tokens = sentence.split() if hasattr(sentence, 'split') else []
    word_vectors = get_features(tokens)
    if not word_vectors:
        return np.zeros(dim, dtype=np.float32)
    return np.sum(word_vectors, axis=0)


train_features = [_sentence_vector(sentence) for sentence in train.Text]
test_features = [_sentence_vector(sentence) for sentence in test.Text]

In [27]:
# Stack the per-sentence feature vectors into arrays (one entry per sentence).
X_train = np.array(train_features)
X_test = np.array(test_features)
# Targets: the values of the five label columns '1'..'5' of the training table.
y_train = train.loc[:, ['1', '2', '3', '4', '5']].values

In [53]:
# Feed-forward classifier: 300-d input -> batch norm -> 2 x (Dense 200 sigmoid
# + Dropout 0.5) -> 5-way softmax.
inp = Input(shape=(300,))
inp_norm = BatchNormalization(axis=1)(inp)

# The two hidden stages are identical, so build them in a loop; layers are
# created in the same order as when written out one by one.
drop = inp_norm
for stage in range(2):
    hidden = Dense(200, init='he_uniform', activation='sigmoid')(drop)
    drop = Dropout(0.5)(hidden)

out = Dense(5, init='glorot_uniform', activation='softmax')(drop)

In [54]:
# Wrap the layer graph into a trainable model.
# NOTE: this rebinds `model`, which until here referred to the word2vec vectors
# read by `get_features`.
model = Model(input=inp, output=out)

# Multi-class setup: softmax output trained with categorical cross-entropy,
# reporting accuracy during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 300)               0         
_________________________________________________________________
batch_normalization_40 (Batc (None, 300)               1200      
_________________________________________________________________
dense_57 (Dense)             (None, 200)               60200     
_________________________________________________________________
dropout_35 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_36 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 5)                 1005      
Total para

In [57]:
# Stop training once validation accuracy has gone 100 epochs without improving;
# the 5000-epoch budget is only an upper bound.
stop_on_plateau = EarlyStopping(monitor='val_acc', mode='max', patience=100)

# 33% of the training data is held out for validation.
model.fit(
    X_train,
    y_train,
    batch_size=20000,
    nb_epoch=5000,
    verbose=2,
    validation_split=0.33,
    callbacks=[stop_on_plateau],
)

Train on 68704 samples, validate on 33840 samples
Epoch 1/5000
0s - loss: 0.9501 - acc: 0.5752 - val_loss: 1.0069 - val_acc: 0.5470
Epoch 2/5000
0s - loss: 0.9506 - acc: 0.5743 - val_loss: 1.0079 - val_acc: 0.5475
Epoch 3/5000
0s - loss: 0.9503 - acc: 0.5743 - val_loss: 1.0070 - val_acc: 0.5467
Epoch 4/5000
0s - loss: 0.9499 - acc: 0.5749 - val_loss: 1.0071 - val_acc: 0.5472
Epoch 5/5000
0s - loss: 0.9502 - acc: 0.5742 - val_loss: 1.0076 - val_acc: 0.5484
Epoch 6/5000
0s - loss: 0.9494 - acc: 0.5744 - val_loss: 1.0073 - val_acc: 0.5472
Epoch 7/5000
0s - loss: 0.9498 - acc: 0.5755 - val_loss: 1.0075 - val_acc: 0.5473
Epoch 8/5000
0s - loss: 0.9503 - acc: 0.5738 - val_loss: 1.0078 - val_acc: 0.5473
Epoch 9/5000
0s - loss: 0.9493 - acc: 0.5760 - val_loss: 1.0072 - val_acc: 0.5473
Epoch 10/5000
0s - loss: 0.9474 - acc: 0.5749 - val_loss: 1.0074 - val_acc: 0.5476
Epoch 11/5000
0s - loss: 0.9498 - acc: 0.5732 - val_loss: 1.0072 - val_acc: 0.5478
Epoch 12/5000
0s - loss: 0.9489 - acc: 0.5744 

<keras.callbacks.History at 0x7f25e4a47198>

In [18]:
# Each softmax row is ordered like the training label columns
# ['1', '2', '3', '4', '5'], so the winning column index i stands for sentiment
# label i + 1. Without the +1 the submission would contain 0..4 instead of 1..5.
class_probabilities = model.predict(X_test)
pred_test = np.argmax(class_probabilities, axis=1) + 1

prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
# NOTE(review): inputs are read from '../data/' but the result is written to
# 'data/' — confirm this relative path is intended.
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          3
2   2          5
3   3          5
4   4          4
