In [1]:
import os
os.environ['OMP_NUM_THREADS'] = '8'

In [2]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, merge
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer, base_filter

Using TensorFlow backend.


In [3]:
import numpy as np
import pandas as pd
import nltk
from gensim.models import word2vec
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
model = word2vec.KeyedVectors.load_word2vec_format('word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
train = pd.read_csv('data/train.data', sep='\t')
test = pd.read_csv('data/test.data', sep='\t')

In [6]:
train.Tokens = [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in train.Text]
test.Tokens = [nltk.word_tokenize(sentence.replace('\\n', ' ')) for sentence in test.Text]

In [7]:
def get_features(tokens):
    out = []
    for word in tokens:
        try:
            out.extend(model.word_vec(word))
        except:
            pass
    return out

def batch_generator(X, y, batch_size):
    all_count, i = X.shape[0], 0
    num_batchs = all_count // batch_size
    shuffle_index = np.arange(all_count)
    np.random.shuffle(shuffle_index)
    while True:
        index_batch = shuffle_index[batch_size * i: batch_size * (i + 1)]
        batch = np.array([get_features(tokens) for tokens in X[index_batch, :].Tokens])
        X_batch = np.zeros((batch_size, 100000), dtype=type(batch[0][0]))
        for i in range(batch_size):
            X_batch[i, :len(batch)] = batch[i]
        y_batch = y[index_batch,:]
        
        i += 1
        yield (np.array(X_batch), y_batch)
        if i == num_batchs:
            np.random.shuffle(shuffle_index)
            i = 0

In [8]:
X_train, X_test = train.Tokens, test.Tokens
y_train = np_utils.to_categorical(train.Sentiment, 6)

In [9]:
inp = Input(shape=(40000,))
inp_norm = BatchNormalization(axis=1)(inp)

outs = []
for i in range(3):
    hidden = Dense(512, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(inp_norm)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(128, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(batch)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.25)(batch)
    hidden = Dense(16, init='he_uniform', W_regularizer=l2(0.0001), activation='relu')(drop)
    batch = BatchNormalization(axis=1)(hidden)
    drop = Dropout(0.5)(batch)
    out = Dense(6, init='glorot_uniform', W_regularizer=l2(0.0001), activation='softmax')(drop)
    outs.append(out)

out = merge(outs, mode='ave')

In [11]:
model = Model(input=inp, output=out)
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit_generator(generator=batch_generator(X_train, y_train, 1000), 
                    nb_epoch=10000, samples_per_epoch=50000, verbose=2)

Epoch 1/10000


ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/opt/lab/anaconda/envs/python3/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/opt/lab/anaconda/envs/python3/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/lab/anaconda/envs/python3/lib/python3.5/site-packages/keras/engine/training.py", line 409, in data_generator_task
    generator_output = next(generator)
  File "<ipython-input-7-4f513c06e73d>", line 11, in batch_generator
    all_count, i = X.shape[0], 0
AttributeError: 'list' object has no attribute 'shape'



In [None]:
model.predict(X_test)