In [1]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [2]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
model = word2vec.KeyedVectors.load_word2vec_format('word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
train = pd.read_csv('data/train.data', sep='\t')
test = pd.read_csv('data/test.data', sep='\t')

train.Tokens = [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in train.Text]
test.Tokens = [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in test.Text]

train['NewText'] = pd.Series.from_array([' '.join(tokens) for tokens in train.Tokens], train.index)
test['NewText'] = pd.Series.from_array([' '.join(tokens) for tokens in test.Tokens], test.index)

In [7]:
vectorizer = TfidfVectorizer(analyzer='char',
                             ngram_range=(2, 6),
                             min_df=50,
                             max_df=0.7,
                             max_features=40000,
                             stop_words='english',
                             lowercase=True)
vectorizer.fit(pd.concat([train.NewText, test.NewText]))

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=40000, min_df=50,
        ngram_range=(2, 6), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
train_features = vectorizer.transform(train.NewText)
test_features = vectorizer.transform(test.NewText)

X_train, X_test = train_features, test_features
y_train = np_utils.to_categorical(train.Sentiment, 6)
X_train, 

(<102544x40000 sparse matrix of type '<class 'numpy.float64'>'
 	with 237772875 stored elements in Compressed Sparse Row format>,)

In [11]:
inp = Input(shape=(40000,))
inp_norm = BatchNormalization(axis=1)(inp)

outs = []
for i in range(3):
    hidden = Dense(512, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(inp_norm)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(128, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(batch)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(16, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(drop)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.5)(batch)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

out = merge(outs, mode='ave')

In [12]:
def batch_generator(X, y, batch_size):
    all_count, i = X.shape[0], 0
    num_batchs = all_count // batch_size
    shuffle_index = np.arange(all_count)
    np.random.shuffle(shuffle_index)
    while True:
        index_batch = shuffle_index[batch_size * i: batch_size * (i + 1)]
        X_batch = X[index_batch,:].todense()
        y_batch = y[index_batch,:]
        i += 1
        yield (np.array(X_batch), y_batch)
        if i == num_batchs:
            np.random.shuffle(shuffle_index)
            i = 0

In [13]:
model = Model(input=inp, output=out)
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit_generator(generator=batch_generator(X_train, y_train, 1000), 
                    nb_epoch=21, samples_per_epoch=50000, verbose=2)

Epoch 1/21
272s - loss: 2.3205 - acc: 0.2131
Epoch 2/21
261s - loss: 2.1150 - acc: 0.3018
Epoch 3/21
235s - loss: 1.9594 - acc: 0.3738
Epoch 4/21
253s - loss: 1.8871 - acc: 0.4107
Epoch 5/21
259s - loss: 1.7916 - acc: 0.4571
Epoch 6/21
258s - loss: 1.7542 - acc: 0.4773
Epoch 7/21
258s - loss: 1.6910 - acc: 0.5090
Epoch 8/21
273s - loss: 1.6587 - acc: 0.5195
Epoch 9/21
278s - loss: 1.6153 - acc: 0.5410
Epoch 10/21
288s - loss: 1.5876 - acc: 0.5526
Epoch 11/21
284s - loss: 1.5431 - acc: 0.5746
Epoch 12/21
289s - loss: 1.5348 - acc: 0.5734
Epoch 13/21
288s - loss: 1.5014 - acc: 0.5899
Epoch 14/21
286s - loss: 1.4824 - acc: 0.5950
Epoch 15/21
286s - loss: 1.4537 - acc: 0.6094
Epoch 16/21
271s - loss: 1.4397 - acc: 0.6097
Epoch 17/21
233s - loss: 1.4150 - acc: 0.6245
Epoch 18/21
278s - loss: 1.3994 - acc: 0.6312
Epoch 19/21
280s - loss: 1.3799 - acc: 0.6360
Epoch 20/21
279s - loss: 1.3660 - acc: 0.6467
Epoch 21/21
280s - loss: 1.3501 - acc: 0.6493


<keras.callbacks.History at 0x7f0b3cb2ae80>

In [18]:
pred_test = [list(x).index(max(x)) for x in model.predict(X_test.todense())]
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          2
2   2          5
3   3          5
4   4          5
