In [1]:
import os
# Pin OMP_NUM_THREADS to 8 for this process; this runs before the
# numerical-library imports in the following cells.
os.environ.update(OMP_NUM_THREADS='8')

In [2]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [3]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
# Load the GoogleNews word2vec vectors (binary word2vec format) as KeyedVectors.
# NOTE(review): the name `model` is rebound to the Keras network in a later
# cell and no visible cell reads these vectors -- confirm this load is needed.
WORD2VEC_PATH = 'word2vec/GoogleNews-vectors-negative300.bin'
model = word2vec.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [5]:
# Both splits are tab-separated files; read them with identical settings.
train, test = (
    pd.read_csv(data_path, sep='\t')
    for data_path in ('data/train.data', 'data/test.data')
)

In [6]:
# Tokenize every text with NLTK and store the token lists as a real DataFrame
# column. Item assignment is used on purpose: `frame.Tokens = ...` on a frame
# without that column only sets a Python attribute, not a column.
for frame in (train, test):
    frame['Tokens'] = [
        # '\\n' is the literal two-character sequence backslash + n in the raw
        # text; swap it for a space so it does not end up inside a token.
        nltk.word_tokenize(sentence.replace('\\n', ' '))
        for sentence in frame.Text
    ]

In [None]:
# Re-join the tokens with single spaces to get a normalized text column.
# The plain Series constructor replaces `pd.Series.from_array`, which was
# deprecated in pandas 0.23 and later removed; passing the frame's index keeps
# the new column aligned row-for-row.
train['NewText'] = pd.Series([' '.join(tokens) for tokens in train.Tokens], index=train.index)
test['NewText'] = pd.Series([' '.join(tokens) for tokens in test.Tokens], index=test.index)

In [None]:
# Character-level TF-IDF over 2- to 6-character n-grams.
#   min_df=50          -> integer: drop n-grams seen in fewer than 50 documents
#   max_df=0.7         -> float: drop n-grams seen in more than 70% of documents
#   max_features=40000 -> upper bound on vocabulary size (may end up smaller)
# `stop_words` is intentionally not passed: scikit-learn applies it only when
# analyzer == 'word', so it had no effect with the char analyzer.
vectorizer = TfidfVectorizer(analyzer='char',
                             ngram_range=(2, 6),
                             min_df=50,
                             max_df=0.7,
                             max_features=40000,
                             lowercase=True)
# Vocabulary and IDF weights are fitted on the train and test text together.
vectorizer.fit(pd.concat([train.NewText, test.NewText]))

In [None]:
# Project both splits onto the fitted TF-IDF vocabulary.
train_features, test_features = (
    vectorizer.transform(frame.NewText) for frame in (train, test)
)

In [None]:
# One-hot encode the Sentiment labels into 6 columns.
y_train = np_utils.to_categorical(train.Sentiment, 6)

# Model-facing aliases for the feature matrices.
X_train = train_features
X_test = test_features

In [11]:
def _dense_stage(tensor, units, drop_rate):
    """Apply Dense(relu) -> BatchNormalization -> Dropout and return the dropout output.

    Returning the dropout tensor (not the batch-norm tensor) guarantees each
    stage's dropout is actually part of the graph seen by the next layer.
    """
    hidden = Dense(units, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(tensor)
    normed = BatchNormalization(axis=1)(hidden)
    return Dropout(drop_rate)(normed)


# Input width follows the fitted feature matrix: the vectorizer's max_features
# is only an upper bound, so the real vocabulary can be smaller than 40000.
inp = Input(shape=(X_train.shape[1],))
inp_norm = BatchNormalization(axis=1)(inp)

# Three identically shaped branches (512 -> 128 -> 16 -> 6-way softmax) that
# all read the same normalized input.
outs = []
for _ in range(3):
    branch = inp_norm
    # Fix: the 128-unit layer used to be fed the batch-norm output, which left
    # the first Dropout(0.25) disconnected; every stage now chains through its
    # dropout output.
    for units, drop_rate in ((512, 0.25), (128, 0.25), (16, 0.5)):
        branch = _dense_stage(branch, units, drop_rate)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(branch)
    outs.append(out)

# Final prediction is the element-wise average of the branch softmax outputs.
out = merge(outs, mode='ave')

In [12]:
def batch_generator(X, y, batch_size):
    """Endlessly yield shuffled, densified mini-batches from a sparse matrix.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix, one row per sample. Must support row fancy-indexing
        and ``.toarray()`` (e.g. ``scipy.sparse.csr_matrix``).
    y : ndarray
        2-D target array whose rows are aligned with the rows of ``X``.
    batch_size : int
        Number of rows per batch.

    Yields
    ------
    tuple of (ndarray, ndarray)
        Dense feature rows and the matching target rows.

    Notes
    -----
    A pass consists of ``X.shape[0] // batch_size`` full batches (rows left
    over after the last full batch are skipped for that pass), after which the
    row order is reshuffled. When ``X`` has fewer rows than ``batch_size`` a
    pass is a single batch holding every row; without that clamp the pass
    counter never matched and only empty batches were produced after the
    first one.
    """
    all_count = X.shape[0]
    # At least one batch per pass so the reshuffle/reset below always triggers.
    num_batchs = max(1, all_count // batch_size)
    shuffle_index = np.arange(all_count)
    np.random.shuffle(shuffle_index)
    i = 0
    while True:
        index_batch = shuffle_index[batch_size * i: batch_size * (i + 1)]
        # Only the selected rows are densified, keeping memory bounded.
        X_batch = X[index_batch, :].toarray()
        y_batch = y[index_batch, :]
        i += 1
        yield (X_batch, y_batch)
        if i == num_batchs:
            # Pass finished: new random order, start over.
            np.random.shuffle(shuffle_index)
            i = 0

In [None]:
# Wrap the input tensor and the averaged branch output into a trainable model.
model = Model(input=inp, output=out)
model.compile(optimizer='sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train from the endless generator of dense 1000-row batches; an "epoch" is
# defined as 50000 samples drawn from it.
train_batches = batch_generator(X_train, y_train, 1000)
model.fit_generator(generator=train_batches,
                    samples_per_epoch=50000,
                    nb_epoch=10000,
                    verbose=2)

Epoch 1/10000
147s - loss: 2.2542 - acc: 0.2613
Epoch 2/10000
139s - loss: 2.0536 - acc: 0.3515
Epoch 3/10000
140s - loss: 1.8922 - acc: 0.4254
Epoch 4/10000
142s - loss: 1.8307 - acc: 0.4511
Epoch 5/10000
167s - loss: 1.7375 - acc: 0.4911
Epoch 6/10000
149s - loss: 1.7106 - acc: 0.5002
Epoch 7/10000
139s - loss: 1.6518 - acc: 0.5259
Epoch 8/10000
140s - loss: 1.6184 - acc: 0.5398
Epoch 9/10000


KeyboardInterrupt: 

In [None]:
# Score the test set in dense slices. X_test is the vectorizer's output
# (a SciPy sparse matrix for scikit-learn vectorizers), and training only ever
# fed the network densified batches, so prediction does the same instead of
# handing the whole sparse matrix to `predict`. Slicing also bounds memory.
PREDICT_BATCH_SIZE = 1000
test_predictions = np.vstack([
    model.predict_on_batch(X_test[start:start + PREDICT_BATCH_SIZE].toarray())
    for start in range(0, X_test.shape[0], PREDICT_BATCH_SIZE)
])
test_predictions