In [1]:
# coding: utf-8

# mainly forking from notebook
# https://www.kaggle.com/johnfarrell/simple-rnn-with-keras-script

# ADDED
# 5x scaled test set
# category name embedding
# some small changes like lr, decay, batch_size~

# In[ ]:
import os
import gc
import time
start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import psutil

def print_memory_usage():
    print('cpu: {}'.format(psutil.cpu_percent()))
    print('consuming {:.2f}GB RAM'.format(psutil.Process(os.getpid()).memory_info().rss / 1073741824),
          flush=True)
    
train = pd.read_csv('../input/train.tsv', sep='\t')
test = pd.read_csv('../input/test.tsv', sep='\t')

train = train[train.price != 0]
train['target'] = np.log1p(train['price'])
# In[ ]:


print(train.shape)
print('5 folds scaling the test_df')
print(test.shape)
test_len = test.shape[0]
def simulate_test(test):
    if test.shape[0] < 800000:
        indices = np.random.choice(test.index.values, 2800000)
        test_ = pd.concat([test, test.iloc[indices]], axis=0)
        return test_.copy()
    else:
        return test
test = simulate_test(test)
print('new shape ', test.shape)
print('[{}] Finished scaling test set...'.format(time.time() - start_time))
print_memory_usage()

# In[ ]:

#HANDLE MISSING VALUES
print("Handling missing values...")
def handle_missing(dataset):
    dataset.category_name.fillna(value="missing", inplace=True)
    dataset.brand_name.fillna(value="missing", inplace=True)
    dataset.item_description.fillna(value="missing", inplace=True)
    return (dataset)

train = handle_missing(train)
test = handle_missing(test)
print(train.shape)
print(test.shape)

print('[{}] Finished handling missing data...'.format(time.time() - start_time))
print_memory_usage()

# In[ ]:


#PROCESS CATEGORICAL DATA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
print("Handling categorical variables...")
le = LabelEncoder()

le.fit(np.hstack([train.category_name, test.category_name]))
train['category'] = le.transform(train.category_name)
test['category'] = le.transform(test.category_name)

le.fit(np.hstack([train.brand_name, test.brand_name]))
train['brand'] = le.transform(train.brand_name)
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))
# train.head(3)
print_memory_usage()

# In[ ]:




(1481661, 9)
5 folds scaling the test_df
(693359, 7)
new shape  (3493359, 7)
[8.966451406478882] Finished scaling test set...
cpu: 18.6
consuming 1.14GB RAM
Handling missing values...
(1481661, 9)
(3493359, 7)
[9.963890790939331] Finished handling missing data...
cpu: 18.0
consuming 1.14GB RAM
Handling categorical variables...
[45.445863246917725] Finished PROCESSING CATEGORICAL DATA...
cpu: 17.5
consuming 1.23GB RAM


In [2]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

train['item_description'] = train['item_description'].map(lambda x: text_to_wordlist(x))
test['item_description'] = test['item_description'].map(lambda x: text_to_wordlist(x))
print(train.head(1))
print('[{}] Finished text_to_wordlist...'.format(time.time() - start_time))

   train_id                                 name  item_condition_id  \
0         0  MLB Cincinnati Reds T Shirt Size XL                  3   

       category_name  price  shipping item_description    target  category  \
0  Men/Tops/T-shirts   10.0         1  description yet  2.397895       829   

   brand  
0   5263  
[960.4209070205688] Finished text_to_wordlist...


In [3]:
#PROCESS TEXT: RAW
print("Text to seq process...")
print("   Fitting tokenizer...")
from keras.preprocessing.text import Tokenizer
raw_text = np.hstack([train.category_name.str.lower(), 
                      train.item_description.str.lower(), 
                      train.name.str.lower()])

tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
vocab_size = len(tok_raw.word_index) + 1
print('vocab_size=', vocab_size)

print("   Transforming text to seq...")
train["seq_category_name"] = tok_raw.texts_to_sequences(train.category_name.str.lower())
test["seq_category_name"] = tok_raw.texts_to_sequences(test.category_name.str.lower())
train["seq_item_description"] = tok_raw.texts_to_sequences(train.item_description.str.lower())
test["seq_item_description"] = tok_raw.texts_to_sequences(test.item_description.str.lower())
train["seq_name"] = tok_raw.texts_to_sequences(train.name.str.lower())
test["seq_name"] = tok_raw.texts_to_sequences(test.name.str.lower())
# train.head(3)

print('[{}] Finished PROCESSING TEXT DATA...'.format(time.time() - start_time))
print_memory_usage()


Text to seq process...
   Fitting tokenizer...


Using TensorFlow backend.


vocab_size= 216567
   Transforming text to seq...
[1176.8381216526031] Finished PROCESSING TEXT DATA...
cpu: 17.5
consuming 4.67GB RAM


In [4]:
%%time
#  with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
#         w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
#                for line in lines}
embeddings_index = dict()
f = open('../input/glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((vocab_size, 50))
for word, i in tok_raw.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('[{}] Finished Embedding matrix...'.format(time.time() - start_time))
print_memory_usage()

Loaded 400000 word vectors.
[1182.579567193985] Finished Embedding matrix...
cpu: 17.7
consuming 4.87GB RAM
CPU times: user 5.67 s, sys: 124 ms, total: 5.79 s
Wall time: 5.72 s


In [5]:
print(embedding_matrix.shape)

(216567, 50)


In [6]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from collections import defaultdict
# class TfidfEmbeddingVectorizer(object):
#     def __init__(self, word2vec):
#         self.word2vec = word2vec
#         self.word2weight = None
# #         self.dim = len(word2vec.itervalues().next())
#         self.dim = len(word2vec.values())
# #         self.dim = 50

#     def fit(self, X, y='optional'):
#         tfidf = TfidfVectorizer(analyzer=lambda x: x)
#         tfidf.fit(X)
#         # if a word was never seen - it must be at least as infrequent
#         # as any of the known words - so the default idf is the max of 
#         # known idf's
#         max_idf = max(tfidf.idf_)
#         self.word2weight = defaultdict(
#             lambda: max_idf,
#             [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

#         return self

#     def transform(self, X):
#         return np.array([
#                 np.mean([self.word2vec[w] * self.word2weight[w]
#                          for w in words if w in self.word2vec] or
#                         [np.zeros(self.dim)], axis=0)
#                 for words in X
#             ])

In [7]:
len(embeddings_index.values())

400000

In [8]:
# tfidf = TfidfEmbeddingVectorizer(embeddings_index)

In [9]:
# tfidf.fit(np.hstack([train.name, test.name]))

In [10]:
# train['tfidf_name'] = tfidf.transform(train.name)

In [11]:
# test['tfidf_name'] = tfidf.transform(test.name)

In [12]:
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
from keras.layers import TimeDistributed
# class AttLayer(Layer):
#     def __init__(self, **kwargs):
#         self.init = initializers.get('normal')
#         #self.input_spec = [InputSpec(ndim=3)]
#         super(AttLayer, self).__init__(** kwargs)

#     def build(self, input_shape):
#         assert len(input_shape)==3
#         #self.W = self.init((input_shape[-1],1))
#         self.W = self.init((input_shape[-1],))
#         #self.input_spec = [InputSpec(shape=input_shape)]
#         self.trainable_weights = [self.W]
#         super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

#     def call(self, x, mask=None):
#         eij = K.tanh(K.dot(x, self.W))

#         ai = K.exp(eij)
#         weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

#         weighted_input = x*weights.dimshuffle(0,1,'x')
#         return weighted_input.sum(axis=1)

#     def get_output_shape_for(self, input_shape):
#         return (input_shape[0], input_shape[-1])

In [13]:
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
        """
        self.supports_masking = True
        #self.init = initializations.get('glorot_uniform')
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # eij = K.dot(x, self.W) TF backend doesn't support it

        # features_dim = self.W.shape[0]
        # step_dim = x._keras_shape[1]

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    #print weigthted_input.shape
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        #return input_shape[0], input_shape[-1]
        return input_shape[0],  self.features_dim

In [14]:
# %%time
# In[ ]:

from keras.layers import TimeDistributed

#EXTRACT DEVELOPTMENT TEST
from sklearn.model_selection import train_test_split
dtrain, dvalid = train_test_split(train, random_state=233, train_size=0.99)
print(dtrain.shape)
print(dvalid.shape)

# In[ ]:


#EMBEDDINGS MAX VALUE
#Base on the histograms, we select the next lengths
MAX_NAME_SEQ = 20 #17
MAX_ITEM_DESC_SEQ = 60 #269
MAX_CATEGORY_NAME_SEQ = 20 #8
print(np.max(train.seq_name.max()), np.max(test.seq_name.max()))
print(np.max(train.seq_category_name.max()), np.max(test.seq_category_name.max()))
print(np.max(train.seq_item_description.max()), np.max(train.seq_item_description.max()))

MAX_NAME = np.max([np.max(train.seq_name.max()), np.max(test.seq_name.max())])+1
MAX_CAT_NAME = np.max([np.max(train.seq_category_name.max()), np.max(test.seq_category_name.max())])+1
MAX_ITEM_DESC = np.max([np.max(train.seq_item_description.max()), np.max(test.seq_item_description.max())])+1

print('MAX_NAME=',MAX_NAME)
print('MAX_CAT_NAME=',MAX_CAT_NAME)
print('MAX_ITEM_DESC=',MAX_ITEM_DESC)

MAX_TEXT = np.max([np.max(train.seq_name.max())
                   , np.max(test.seq_name.max())
                   , np.max(train.seq_category_name.max())
                   , np.max(test.seq_category_name.max())
                   , np.max(train.seq_item_description.max())
                   , np.max(test.seq_item_description.max())])+2
MAX_CATEGORY = np.max([train.category.max(), test.category.max()])+1
MAX_BRAND = np.max([train.brand.max(), test.brand.max()])+1
MAX_CONDITION = np.max([train.item_condition_id.max(), 
                        test.item_condition_id.max()])+1
MAX_TEXT=max(MAX_TEXT, vocab_size)
print('MAX_TEXT=',MAX_TEXT)
print('MAX_CATEGORY=',MAX_CATEGORY)
print('MAX_BRAND=',MAX_BRAND)
print('MAX_CONDITION=',MAX_CONDITION)

print('[{}] Finished EMBEDDINGS MAX VALUE...'.format(time.time() - start_time))
print_memory_usage()

# In[ ]:


#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ)
        ,'item_desc': pad_sequences(dataset.seq_item_description
                                    , maxlen=MAX_ITEM_DESC_SEQ)
        ,'brand': np.array(dataset.brand)
        ,'category': np.array(dataset.category)
        ,'category_name': pad_sequences(dataset.seq_category_name
                                        , maxlen=MAX_CATEGORY_NAME_SEQ)
        ,'item_condition': np.array(dataset.item_condition_id)
        ,'num_vars': np.array(dataset[["shipping"]])
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)
X_test = get_keras_data(test)

print('[{}] Finished DATA PREPARARTION...'.format(time.time() - start_time))
print_memory_usage()

# In[ ]:


#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, \
    Activation, concatenate, GRU, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers

def rmsle(y, y_pred):
    import math
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 \
              for i, pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5

dr = 0.25

def get_model():
    #params
    dr_r = dr
    
    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand = Input(shape=[1], name="brand")
    category = Input(shape=[1], name="category")
    category_name = Input(shape=[X_train["category_name"].shape[1]], 
                          name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    #Embeddings layers
    emb_size = 60
    
    emb_name = Embedding(MAX_TEXT, 50, weights=[embedding_matrix], trainable=True)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50, weights=[embedding_matrix], trainable=True)(item_desc)
    emb_category_name = Embedding(MAX_TEXT, emb_size//3)(category_name)
#     emb_category_name = Embedding(MAX_TEXT, 100, weights=[embedding_matrix], trainable=True)(category_name)
    emb_brand = Embedding(MAX_BRAND, 10)(brand)
    emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
#     emb_name = Embedding(MAX_TEXT, emb_size//3)(name)
#     emb_item_desc = Embedding(MAX_TEXT, emb_size)(item_desc)
#     emb_category_name = Embedding(MAX_TEXT, emb_size//3)(category_name)
#     emb_brand = Embedding(MAX_BRAND, 10)(brand)
#     emb_category = Embedding(MAX_CATEGORY, 10)(category)
#     emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    rnn_layer1 = GRU(20, return_sequences=True, dropout=0.2, recurrent_dropout=0.2) (emb_item_desc)
    att1 = Attention(MAX_ITEM_DESC_SEQ)(rnn_layer1)
    
    rnn_layer2 = GRU(6, return_sequences=True, dropout=0.2, recurrent_dropout=0.2) (emb_category_name)
    att2 = Attention(MAX_CATEGORY_NAME_SEQ)(rnn_layer2)
    
#     rnn_layer3 = GRU(20, return_sequences=True) (emb_name)
    rnn_layer3 = GRU(15, return_sequences=True, dropout=0.2, recurrent_dropout=0.2) (emb_name)
    att3 = Attention(MAX_NAME_SEQ)(rnn_layer3)
    
    #main layer
    main_l = concatenate([
        Flatten() (emb_brand)
        , Flatten() (emb_category)
        , Flatten() (emb_item_condition)    
        , att1
        , att2
        , att3
        , num_vars
    ])
    
    main_l = Dropout(0.1)(Dense(512,activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(64,activation='relu') (main_l))
    
    # main_l = Dropout(0.1)(Dense(512,activation='relu') (main_l))
    # main_l = Dropout(0.25)(Dense(256,activation='relu') (main_l))
    # main_l = Dropout(0.5)(Dense(32,activation='relu') (main_l))
    #output
    output = Dense(1,activation="linear") (main_l)
    
    #model
    model = Model([name, item_desc, brand
                   , category, category_name
                   , item_condition, num_vars], output)
    #optimizer = optimizers.RMSprop()
    optimizer = optimizers.Adam()
    model.compile(loss="mse", 
                  optimizer=optimizer)
    return model

def eval_model(model):
    val_preds = model.predict(X_valid)
    val_preds = np.expm1(val_preds)
    
    y_true = np.array(dvalid.price.values)
    y_pred = val_preds[:, 0]
    v_rmsle = rmsle(y_true, y_pred)
    print(" RMSLE error on dev test: "+str(v_rmsle))
    return v_rmsle
#fin_lr=init_lr * (1/(1+decay))**(steps-1)
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1

print('[{}] Finished DEFINEING MODEL...'.format(time.time() - start_time))
print_memory_usage()

# In[ ]:

gc.collect()
#FITTING THE MODEL
epochs = 3
BATCH_SIZE = 512 * 5
steps = int(len(X_train['name'])//BATCH_SIZE) * epochs
lr_init, lr_fin = 0.013, 0.009
lr_decay = exp_decay(lr_init, lr_fin, steps)
log_subdir = '_'.join(['ep', str(epochs),
                    'bs', str(BATCH_SIZE),
                    'lrI', str(lr_init),
                    'lrF', str(lr_fin),
                    'dr', str(dr)])

model = get_model()
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

earlystop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=1, \
                          verbose=1, mode='auto')
                          
history = model.fit(X_train, dtrain.target
                    , epochs=epochs
                    , batch_size=BATCH_SIZE
                    , validation_split=0.01
                    , callbacks=[earlystop]
                    , verbose=10
                    )
print('[{}] Finished FITTING MODEL...'.format(time.time() - start_time))
#EVLUEATE THE MODEL ON DEV TEST
v_rmsle = eval_model(model)
print('[{}] Finished predicting valid set...'.format(time.time() - start_time))
print_memory_usage()
# In[ ]:


#CREATE PREDICTIONS
# preds = model.predict(X_test, batch_size=BATCH_SIZE)
# preds = np.expm1(preds)
# print('[{}] Finished predicting test set...'.format(time.time() - start_time))
# submission = test[["test_id"]][:test_len]
# submission["price"] = preds[:test_len]
# submission.to_csv("../cache/myNN"+log_subdir+"_{:.6}.csv".format(v_rmsle), index=False)
# print('[{}] Finished submission...'.format(time.time() - start_time))
# print_memory_usage()



(1466844, 13)
(14817, 13)
216565 216557
428 428
169504 169504
MAX_NAME= 216566
MAX_CAT_NAME= 429
MAX_ITEM_DESC= 216365
MAX_TEXT= 216567
MAX_CATEGORY= 1311
MAX_BRAND= 5288
MAX_CONDITION= 6
[1190.0684022903442] Finished EMBEDDINGS MAX VALUE...
cpu: 18.2
consuming 5.10GB RAM
[1256.1259169578552] Finished DATA PREPARARTION...
cpu: 17.6
consuming 7.16GB RAM
[1256.1284539699554] Finished DEFINEING MODEL...
cpu: 33.3
consuming 7.16GB RAM
Train on 1452175 samples, validate on 14669 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[2102.917825937271] Finished FITTING MODEL...
 RMSLE error on dev test: 0.43787504155300155
[2106.420909881592] Finished predicting valid set...
cpu: 80.1
consuming 7.62GB RAM


In [15]:
train.category.mean()

798.51865845156215

In [16]:
train.brand.mean()

3883.4210065595303

In [17]:
# 3 attentions:
# RMSLE error on dev test: 0.43621085979926566, 0.43431049774899066,
#  0.4338830005110395 with dropout 0.1 (24, 8, 15)
# 0.43218515551974823 (24, 8, 15, 0.2)
# 0.4287631303060977 (20, 6, 15, 0.2), 0.4394812043885648 (15, 6, 10, 0.2)
# 0.43787504155300155 (20, 6, 16, 0.2)

# attention on name: 50, 10, 10, 5
# RMSLE error on dev test: 0.4347287568069609
    
# 50.txt, 10, 10, 5
# 0.4245, 0.426444

# glove.6B.50.txt instead of 100,
# RMSLE error on dev test: 0.4275900360816371

# emb_brand = Embedding(MAX_BRAND, 20)(brand)
# 0.42693247595039546

# when 20 embedding length, 6 for category for brand
# 0.42482011805535536

# when 2 embedded weights can be trained,
# RMSLE error on dev test: 4242845146375708

# when 3 embedded weights can be trained,
# RMSLE error on dev test: 0.4347576557957351

# org: 
# [2123.6134021282196] Finished FITTING MODEL...
# RMSLE error on dev test: 0.43528846581148545
# [2355.4364523887634] Finished submission...