In [12]:
import warnings
warnings.simplefilter('ignore')

from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

In [4]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
# Load the pretrained GoogleNews word2vec vectors (binary format).
# NOTE(review): the name `model` is rebound to the Keras network in a later
# cell (`model = Model(...)`), after which these vectors are no longer
# reachable through this name.
model = word2vec.KeyedVectors.load_word2vec_format(
    fname='../word2vec/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [7]:
# Read the train and test tables; both are loaded with pandas' default
# CSV settings.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/spacy_train.csv', '../data/spacy_test.csv')
)

In [27]:
# Word-level TF-IDF over uni-, bi- and tri-grams, with the original casing
# kept (lowercase=False). Terms must occur in at least 20 documents and in
# no more than 80% of them; the vocabulary is capped at 40000 entries.
vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=False,
    ngram_range=(1, 3),
    min_df=20,
    max_df=0.8,
    max_features=40000,
)
# The vocabulary and IDF weights are learned from train and test text together.
vectorizer.fit(pd.concat([train['Text'], test['Text']]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.8, max_features=40000, min_df=20,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [28]:
# Project both corpora onto the fitted TF-IDF vocabulary (sparse matrices).
train_features = vectorizer.transform(train.Text)
test_features = vectorizer.transform(test.Text)

X_test = test_features
# Hold out 30% of the training rows for validation. The targets are the
# columns '1'..'5' of `train`, taken in that order. A fixed random_state
# makes the split (and therefore the validation scores) reproducible
# across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(train_features,
                                                      train[['1', '2', '3', '4', '5']].values,
                                                      test_size=0.3,
                                                      random_state=42)

In [51]:
# The input width is read from the TF-IDF matrix instead of being hard-coded:
# `max_features=40000` is only an upper bound on the vocabulary size, so the
# real number of columns can be smaller.
inp = Input(shape=(X_train.shape[1],))

# Single hidden layer with L2 weight decay, followed by dropout.
# Keras 2 keyword names are used (kernel_initializer / kernel_regularizer)
# in place of the deprecated Keras 1 aliases (init / W_regularizer).
hidden = Dense(64, kernel_initializer='he_uniform', kernel_regularizer=l2(0.0001), activation='relu')(inp)
drop = Dropout(0.5)(hidden)

# Softmax over 5 outputs, one per target column.
out = Dense(5, kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.0001), activation='softmax')(drop)


In [52]:
def batch_generator(X, y, batch_size):
    """Endlessly yield shuffled, dense mini-batches from a sparse matrix.

    Every pass over the data visits each row exactly once, in a freshly
    shuffled order. The final batch of a pass may be smaller than
    ``batch_size`` when the row count is not a multiple of it, so no rows
    are dropped.

    Parameters
    ----------
    X : scipy sparse matrix supporting row fancy-indexing (e.g. CSR)
        Feature matrix; only the selected rows are densified per batch.
    y : numpy.ndarray, 2-D
        Targets, row-aligned with ``X``.
    batch_size : int
        Maximum number of rows per yielded batch; must be positive.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        Dense feature rows and the matching target rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``X`` has no rows. Because this
        is a generator, the error surfaces on the first ``next()`` call.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    all_count = X.shape[0]
    if all_count == 0:
        raise ValueError('cannot generate batches from an empty dataset')

    # Ceiling division: include the trailing partial batch, and guarantee at
    # least one batch per pass even when batch_size exceeds the row count.
    num_batchs = -(-all_count // batch_size)
    shuffle_index = np.arange(all_count)
    while True:
        # Reshuffle at the start of every pass over the data.
        np.random.shuffle(shuffle_index)
        for i in range(num_batchs):
            index_batch = shuffle_index[batch_size * i: batch_size * (i + 1)]
            # toarray() returns a plain ndarray (not np.matrix).
            X_batch = X[index_batch, :].toarray()
            y_batch = y[index_batch, :]
            yield (X_batch, y_batch)

In [53]:
# Wrap the layer graph (inp -> out) in a trainable model.
model = Model(inp, out)

# Plain SGD on categorical cross-entropy, reporting accuracy while training.
model.compile(optimizer='sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer summary with parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 40000)             0         
_________________________________________________________________
dense_75 (Dense)             (None, 64)                2560064   
_________________________________________________________________
dropout_53 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_76 (Dense)             (None, 5)                 325       
Total params: 2,560,389.0
Trainable params: 2,560,389.0
Non-trainable params: 0.0
_________________________________________________________________


In [54]:
# Mini-batch size shared by the training and validation generators.
BATCH_SIZE = 1000
# Number of training samples to consume per epoch.
SAMPLES_PER_EPOCH = 50000

# The Keras 2 generator API (the one that accepts `validation_steps`) counts
# an epoch in generator *batches*, not samples. Passing sample counts there
# asks for 1000x too much work per epoch, so convert both to batch counts.
# Ceiling division keeps at least one step and covers a trailing partial batch.
train_steps = -(-SAMPLES_PER_EPOCH // BATCH_SIZE)
valid_steps = -(-X_valid.shape[0] // BATCH_SIZE)

model.fit_generator(generator=batch_generator(X_train, y_train, BATCH_SIZE),
                    steps_per_epoch=train_steps,
                    epochs=21,
                    validation_data=batch_generator(X_valid, y_valid, BATCH_SIZE),
                    validation_steps=valid_steps,
                    verbose=2)

Epoch 1/21


KeyboardInterrupt: 

In [18]:
# Predict in row chunks so only PREDICT_CHUNK rows of the sparse test matrix
# are densified at a time, instead of materialising the whole matrix at once.
PREDICT_CHUNK = 1000
test_probs = np.vstack([
    model.predict(X_test[start:start + PREDICT_CHUNK].toarray())
    for start in range(0, X_test.shape[0], PREDICT_CHUNK)
])

# argmax gives the 0-based position of the winning softmax output. The
# network is trained on the one-hot columns '1'..'5' (in that order), so
# position k corresponds to label k + 1.
pred_test = (np.argmax(test_probs, axis=1) + 1).tolist()

# One row per test example: its Id and the predicted label.
prediction = pd.DataFrame(data={'Id': test.Id, 'Sentiment': pred_test}, index=test.index)
prediction.to_csv('data/prediction.csv', index=False)
print(prediction.head())

   Id  Sentiment
0   0          5
1   1          2
2   2          5
3   3          5
4   4          5
