In [None]:
import os
import re

import numpy as np
import pandas as pd

# NLP
from nltk import tokenize

# Keras and TF
import keras as K
from keras import backend, initializers
from keras.layers import Bidirectional, Dense, Embedding, GRU, Input, Layer, TimeDistributed
from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf


# GPU usage: build the session config with memory growth enabled up front,
# then register a session using it as the one Keras runs on.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
set_session(tf.Session(config=config))

In [None]:

# Load the training data frame from the tab-separated file.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
train_data_path = '../../dat/train.tsv'
train_df = pd.read_csv(train_data_path, sep='\t')
print('Training DataFrame loaded')

In [None]:
# Report the dimensions of the loaded frame on the line after the label.
print('Data shape:', train_df.shape, sep='\n')

In [None]:
print('Sample:')
# Leave the frame as the cell's last expression so the notebook displays it.
train_df.head(10)

In [None]:
# Model input: select the item_description column.
X = train_df.loc[:, 'item_description']

In [None]:
# Regression target: select the price column.
Y = train_df.loc[:, 'price']

In [None]:
# Inputs and targets must line up row for row.
assert X.shape[0] == Y.shape[0]

In [None]:
# Preprocessing
MAX_SENT_LENGTH = 100  # word slots per sentence (last axis of the `data` array)
MAX_SENTS = 15  # sentences kept per description (middle axis of `data`)
MAX_NB_WORDS = 20000  # vocabulary cap given to the Tokenizer; also bounds indices written to `data`
EMBEDDING_DIM = 100  # embedding width; the model loads glove.6B.100d.txt
VALIDATION_SPLIT = 0.2  # NOTE(review): not used by any cell visible in this section
NUM_EPOCHS = 100  # presumably the number of training epochs; confirm against the estimator cell
BATCH_SIZE = 50  # NOTE(review): the estimator cell passes batch_size=5, not this value
def clean_str(string):
    """Normalise a raw text string before tokenization.

    Removes every backslash, single quote and double quote, trims the
    surrounding whitespace and lower-cases the result.

    Args:
        string: Text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # A single character class does the work of three separate substitutions.
    return re.sub(r"[\\'\"]", "", string).strip().lower()

In [None]:

# Raw description per row. Missing values become empty strings so the sentence
# tokenizer always receives text. Iterating the Series directly avoids
# label-based X[idx] lookups, which only work with a default integer index.
texts = ['' if pd.isna(text) else str(text) for text in X]

# Each description split into its sentences.
reviews = [tokenize.sent_tokenize(text) for text in texts]

# Target price per row, in the same order as `texts`.
labels = list(Y)

# `num_words` is the Keras 2 name of the former `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# data[i, j, k] holds the vocabulary index of the k-th kept word of sentence j
# in description i; unused slots stay 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break  # sentence row is full
            token_id = word_index.get(word)
            # Skip words the tokenizer never saw and words outside the vocabulary cap.
            if token_id is not None and token_id < MAX_NB_WORDS:
                data[i, j, k] = token_id
                k += 1

print('Total %s unique tokens.' % len(word_index))


In [None]:
# Split into train and test
# This hold-out split is disabled; evaluation in this notebook goes through
# cross_val_score. It is kept as comments rather than a bare string literal,
# which the notebook would echo as the cell's output.
# random_seed = 42
# X_train, X_val, Y_train, Y_val = train_test_split(
#     X, Y, test_size=0.33, shuffle=True, random_state=random_seed)

In [None]:
class AttLayer(Layer):
    """Attention pooling over axis 1 of a 3-D input.

    Every step along axis 1 is scored with ``tanh(x . W)``. The scores are
    exponentiated and normalised over axis 1, and the input is summed along
    that axis using those weights. The input must be 3-D; the output keeps
    the first and last dimensions of the input.

    Backend ops come from ``backend`` (``keras.backend``) because ``K`` in this
    notebook is bound to the top-level ``keras`` package.
    """

    def __init__(self, **kwargs):
        # Keras 2 renamed the `initializations` module to `initializers`.
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector W with shape (input_shape[-1], 1)."""
        assert len(input_shape) == 3
        # add_weight registers W with the layer so it is trained and saved.
        self.W = self.add_weight(name='att_W',
                                 shape=(input_shape[-1], 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        ``mask`` is accepted for interface compatibility and is not used.
        """
        eij = backend.tanh(backend.dot(x, self.W))

        ai = backend.exp(eij)
        # Normalise over axis 1, keeping that axis so the division broadcasts.
        weights = ai / backend.sum(ai, axis=1, keepdims=True)

        weighted_input = x * weights
        return backend.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Axis 1 is summed away: (d0, d1, d2) -> (d0, d2)."""
        return (input_shape[0], input_shape[-1])

    # Keras 1 name for the same hook, kept for callers that still use it.
    get_output_shape_for = compute_output_shape

def _load_glove_index(glove_path):
    """Parse a GloVe text file into a ``{word: float32 vector}`` dict."""
    embeddings_index = {}
    # The context manager closes the file even if parsing fails. The encoding
    # is explicit so the platform default codec is not used
    # (GloVe files are presumably UTF-8; confirm for other embedding files).
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index


# define base model
def baseline_model():
    """Build and compile the hierarchical attention regression model.

    A word-level encoder (embedding -> bidirectional GRU -> dense -> AttLayer)
    is applied to every sentence through TimeDistributed. A sentence-level
    bidirectional GRU -> dense -> AttLayer stack then feeds one linear output
    unit.

    Reads the module-level ``word_index``, ``EMBEDDING_DIM``,
    ``MAX_SENT_LENGTH`` and ``MAX_SENTS``, and loads ``glove.6B.100d.txt``
    from ``GLOVE_DIR``.

    Returns:
        The compiled Keras ``Model`` (mean-squared-error loss, Adam optimizer).
    """
    GLOVE_DIR = "../dat/glove"
    embeddings_index = _load_glove_index(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))

    print('Total %s word vectors.' % len(embeddings_index))

    # building Hierarchical Attention network
    # Rows start as uniform random values; words present in GloVe are overwritten
    # with their pretrained vector, all other words keep the random row.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SENT_LENGTH,
                                trainable=True)

    # Word-level encoder: one sentence of word indices -> one sentence vector.
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)
    word_gru = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
    word_dense = TimeDistributed(Dense(200))(word_gru)
    word_att = AttLayer()(word_dense)
    sentEncoder = Model(sentence_input, word_att)

    # Sentence-level encoder: the sentence vectors of one description -> one vector.
    review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
    review_encoder = TimeDistributed(sentEncoder)(review_input)
    sent_gru = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
    sent_dense = TimeDistributed(Dense(200))(sent_gru)
    sent_att = AttLayer()(sent_dense)
    preds = Dense(1)(sent_att)
    model = Model(review_input, preds)

    # Accuracy is not meaningful for a continuous target; report MAE instead.
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mae'])

    return model

In [None]:
# fix random seeds for reproducibility (NumPy and the TensorFlow graph)
seed = 7
np.random.seed(seed)
tf.set_random_seed(seed)
# Wrap the model builder so scikit-learn can cross-validate it.
# `epochs` is the Keras 2 keyword; `nb_epoch` is the deprecated Keras 1 spelling.
# NOTE(review): batch_size stays at 5 although BATCH_SIZE is defined as 50; confirm which is intended.
estimator = KerasRegressor(build_fn=baseline_model, epochs=NUM_EPOCHS, batch_size=5, verbose=0)


In [None]:

# 10-fold cross-validation. random_state only has an effect when shuffle=True,
# and newer scikit-learn releases reject a seed without it.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, data, Y, cv=kfold)
# NOTE(review): depending on the Keras version, KerasRegressor.score may return the negated loss; confirm the sign.
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))