In [1]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# Load the pre-trained GoogleNews word2vec embeddings (binary format).
# NOTE(review): nothing visible in this notebook reads these vectors before
# `model` is rebound to the Keras network further down — confirm this
# (slow) load is still needed.
model = word2vec.KeyedVectors.load_word2vec_format(
    'word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

KeyboardInterrupt: 

In [None]:
# Read the tab-separated train and test sets in one pass; the tuple unpacking
# keeps both frames loaded with identical parser options.
train, test = (
    pd.read_csv(path, sep='\t')
    for path in ('data/train.data', 'data/test.data')
)

In [None]:
# Tokenise every document with NLTK after replacing the literal two-character
# sequence backslash + "n" found in the raw text with a space.
#
# Bracket assignment is required here: `frame.Tokens = ...` on a column that
# does not exist yet only sets a plain Python attribute on the DataFrame
# object, so no real column is created and the tokens vanish on any
# copy/slice/concat. Wrapping the list in a Series aligned on the frame's own
# index stores each token list as a single cell.
train['Tokens'] = pd.Series(
    [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in train.Text],
    index=train.index,
)
test['Tokens'] = pd.Series(
    [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in test.Text],
    index=test.index,
)

In [None]:
# Re-join the tokens with single spaces so the vectorizer sees normalised text.
# `pd.Series.from_array` is deprecated (and removed in pandas 1.0); the plain
# Series constructor with an explicit index is the supported equivalent.
train['NewText'] = pd.Series([' '.join(tokens) for tokens in train.Tokens], index=train.index)
test['NewText'] = pd.Series([' '.join(tokens) for tokens in test.Tokens], index=test.index)

In [None]:
# Character n-gram (2 to 6 characters) TF-IDF features, capped at 40000 terms.
# `stop_words='english'` was dropped: scikit-learn applies stop words only
# when analyzer == 'word', so with analyzer='char' it had no effect on the
# features and merely suggested filtering that never happened.
vectorizer = TfidfVectorizer(analyzer='char',
                             ngram_range=(2, 6),
                             min_df=50,
                             max_df=0.7,
                             max_features=40000,
                             lowercase=True)
# The vocabulary and IDF weights are fitted on train and test text together.
vectorizer.fit(pd.concat([train.NewText, test.NewText]))

In [None]:
# Project both splits onto the fitted TF-IDF vocabulary.
train_features, test_features = (
    vectorizer.transform(frame.NewText) for frame in (train, test)
)

In [None]:
# `vectorizer.transform` returns SciPy sparse matrices. Wrapping one in
# `np.array(...)` produces a 0-d object array holding the sparse matrix rather
# than a dense 2-D array, which the network cannot consume. `.toarray()`
# performs the actual densification; casting to float32 first halves the
# memory of the dense copy.
X_train = train_features.astype(np.float32).toarray()
X_test = test_features.astype(np.float32).toarray()
# One-hot encode the sentiment labels into 6 classes.
y_train = np_utils.to_categorical(train.Sentiment, 6)

In [None]:
# Network: three structurally identical dense branches over the same
# batch-normalised input, whose softmax outputs are averaged.

# Take the input width from the fitted feature matrix instead of hard-coding
# 40000: `max_features` is only an upper bound, so the real vocabulary may be
# smaller and a fixed width would then fail at fit time.
n_features = train_features.shape[1]
inp = Input(shape=(n_features,))
inp_norm = BatchNormalization(axis=1)(inp)

outs = []
for i in range(3):
    hidden = Dense(512, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(inp_norm)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    # Feed the dropout output forward. The original wired `batch` in here,
    # which silently bypassed the Dropout(0.25) layer created just above.
    hidden = Dense(128, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(drop)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(16, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(drop)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.5)(batch)
    # 6-way softmax head for this branch.
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

# Average the per-branch class distributions into the final output.
out = merge(outs, mode='ave')

In [None]:
# Stop training once the validation loss has not improved for 100 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=100)

# Wrap the graph in a trainable model (this rebinds the name `model`).
model = Model(input=inp, output=out)
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

# A fraction 0.1 of the training data is held out for validation; the epoch
# budget is large because early stopping is expected to end training.
model.fit(
    X_train,
    y_train,
    batch_size=10000,
    nb_epoch=5000,
    verbose=2,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Run the trained network on the test features. As the cell's last
# expression, the result (the averaged 6-way softmax outputs) is displayed
# but not stored in a variable.
model.predict(X_test)