In [69]:
# coding: utf-8

# Mainly forked from the notebook
# https://www.kaggle.com/johnfarrell/simple-rnn-with-keras-script

# ADDED
# 5x scaled test set
# category name embedding
# some small changes like lr, decay, batch_size~

# In[ ]:
import os
import gc
import time
import psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler
from keras.preprocessing.text import Tokenizer

#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, \
    Activation, concatenate, GRU, Embedding, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, Normalizer

In [70]:
NUM_BRANDS = 4000
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 2 ** 14
NUM_PARTITIONS = 12 #number of partitions to split dataframe
NUM_CORES = 4

In [71]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of `y_pred` against `y`.

    Both arguments are converted to flat float arrays and paired by position,
    so a pandas Series is handled correctly even when its index is not
    0..n-1 (for example after a shuffled train/validation split).

    Args:
        y: true values (list, numpy array or pandas Series), each > -1.
        y_pred: predicted values, same length as `y`, each > -1.

    Returns:
        The RMSLE as a Python float.

    Raises:
        AssertionError: if the two inputs differ in length.
        ValueError: if the inputs are empty (the mean is undefined).
    """
    assert len(y) == len(y_pred)
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("rmsle is undefined for empty input")
    # log1p(x) == log(x + 1), evaluated on the whole array at once.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

def print_memory_usage():
    """Print the value of psutil.cpu_percent() and this process's RSS in GB."""
    cpu_pct = psutil.cpu_percent()
    print('cpu: {}'.format(cpu_pct))

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    bytes_per_gb = 1073741824  # 1024 ** 3
    print('consuming {:.2f}GB RAM'.format(rss_bytes / bytes_per_gb), flush=True)

dr = 0.25

In [72]:
start_time = time.time()

In [73]:
train = pd.read_csv('../data/train.tsv', sep='\t')
test = pd.read_csv('../data/test.tsv', sep='\t')

# Drop zero-priced rows. The explicit copy makes `train` an independent frame,
# so adding the `target` column below never writes into a view of the raw data.
train = train[train.price != 0].copy()
train['target'] = np.log1p(train['price'])
test_id = test.test_id.values
# Same values as the `target` column, kept as a plain numpy array.
y = train['target'].values
print_memory_usage()

cpu: 13.8
consuming 24.27GB RAM


In [74]:
print(train.shape)
print('5 folds scaling the test_df')
print(test.shape)
test_len = test.shape[0]
def simulate_test(test, min_rows=800000, n_extra=2800000, random_state=None):
    """Inflate a small test frame by appending rows resampled with replacement.

    Rows are drawn by position, so the frame's index does not have to be the
    default 0..n-1 range. The original rows stay first and in their original
    order; the appended rows keep the index labels of the rows they copy.

    Args:
        test: DataFrame to inflate.
        min_rows: frames with at least this many rows are returned unchanged.
        n_extra: number of resampled rows to append.
        random_state: optional seed for a private RandomState; when None the
            global numpy random state is used.

    Returns:
        `test` itself when it is empty, already has `min_rows` rows, or
        `n_extra` is not positive; otherwise a new DataFrame with
        `len(test) + n_extra` rows.
    """
    n_rows = test.shape[0]
    if n_rows >= min_rows or n_rows == 0 or n_extra <= 0:
        return test
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    positions = rng.choice(n_rows, n_extra)
    # pd.concat already builds a fresh frame, so no extra copy is needed.
    return pd.concat([test, test.iloc[positions]], axis=0)
test = simulate_test(test)
print('new shape ', test.shape)
print('[{}] Finished scaling test set...'.format(time.time() - start_time))
print_memory_usage()

(1481661, 9)
5 folds scaling the test_df
(693359, 7)
new shape  (3493359, 7)
[706.0695507526398] Finished scaling test set...
cpu: 14.4
consuming 24.50GB RAM


In [75]:
#HANDLE MISSING VALUES
print("Handling missing values...")
def handle_missing(dataset):
    """Fill missing text fields with the literal string "missing".

    The `category_name`, `brand_name` and `item_description` columns are
    reassigned on `dataset` itself. Plain column assignment is used instead of
    `dataset.col.fillna(..., inplace=True)`, because the in-place call acts on
    a temporary Series and is not guaranteed to reach the frame.

    Args:
        dataset: DataFrame containing the three columns above.

    Returns:
        The same `dataset` object, for call chaining.
    """
    for column in ("category_name", "brand_name", "item_description"):
        dataset[column] = dataset[column].fillna("missing")
    return dataset

train = handle_missing(train)
test = handle_missing(test)
print(train.shape)
print(test.shape)

print('[{}] Finished handling missing data...'.format(time.time() - start_time))
print_memory_usage()

Handling missing values...
(1481661, 9)
(3493359, 7)
[707.1836385726929] Finished handling missing data...
cpu: 18.1
consuming 24.50GB RAM


In [76]:
#PROCESS CATEGORICAL DATA

print("Handling categorical variables...")
# le = LabelBinarizer(sparse_output=True)

# lb = LabelBinarizer(sparse_output=True)
# X_brand = lb.fit_transform(merge['brand_name'])
# print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))
# print(X_brand.shape)
# del merge['brand_name']
# print_memory_usage()

# lb = LabelBinarizer(sparse_output=True)
# X_bci = lb.fit_transform(merge['bci'])
# print('[{}] Finished label binarize `bci`'.format(time.time() - start_time))
# print(X_bci.shape)
# del merge['bci']
# print_memory_usage()

# lb = LabelBinarizer(sparse_output=True)
# X_bcis = lb.fit_transform(merge['bcis'])
# print('[{}] Finished label binarize `bcis`'.format(time.time() - start_time))
# print(X_bcis.shape)
# del merge['bcis']
# gc.collect()
# print_memory_usage()

# lb = LabelBinarizer(sparse_output=True)
# X_bcs = lb.fit_transform(merge['bcs'])
# print('[{}] Finished label binarize `bcs`'.format(time.time() - start_time))
# print(X_bcs.shape)
# del merge['bcs']
# gc.collect()
# print_memory_usage()

# le = LabelEncoder()
# X_cat_name = lb.fit_transform(merge['category_name'])
# scaler = MaxAbsScaler()
# X_cat_name = scaler.fit_transform(X_cat_name)

# Turn each categorical text column into integer ids. The encoder is fitted on
# the train and test values together so both frames share one id space.
for text_col, id_col in (('category_name', 'category'), ('brand_name', 'brand')):
    le = LabelEncoder()
    le.fit(np.concatenate([train[text_col].values, test[text_col].values]))
    train[id_col] = le.transform(train[text_col])
    test[id_col] = le.transform(test[text_col])

# brand_name is not needed after encoding; category_name is kept for tokenizing.
del le, train['brand_name'], test['brand_name']

print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))
print_memory_usage()
# train.head(3)


Handling categorical variables...
[742.4417433738708] Finished PROCESSING CATEGORICAL DATA...
cpu: 16.6
consuming 24.55GB RAM


In [77]:
# cv = CountVectorizer(min_df=NAME_MIN_DF, stop_words='english')
# X_name = cv.fit_transform(merge['name'])
# norm = Normalizer()
# X_name = norm.fit_transform(X_name)
# print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))
# print(X_name.shape)
# print(np.min(X_name))
# print(np.max(X_name))
# del merge['name']
# print_memory_usage()

# cv = CountVectorizer()
# X_category = cv.fit_transform(merge['category_name'])
# norm = Normalizer()
# X_category = norm.fit_transform(X_category)
# print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))
# print(X_category.shape)
# print(np.min(X_category))
# print(np.max(X_category))
# del merge['category_name']
# gc.collect()
# print_memory_usage()

# tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
#                                          ngram_range=(1, 3),
#                                          stop_words='english')
# X_description = tv.fit_transform(merge['item_description'])
# print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))
# print(X_description.shape)
# print(np.min(X_description))
# print(np.max(X_description))
# del merge['item_description']
# print_memory_usage()

In [78]:
print(train.head(3))

   train_id                                 name  item_condition_id  \
0         0  MLB Cincinnati Reds T Shirt Size XL                  3   
1         1     Razer BlackWidow Chroma Keyboard                  3   
2         2                       AVA-VIV Blouse                  1   

                                       category_name  price  shipping  \
0                                  Men/Tops/T-shirts   10.0         1   
1  Electronics/Computers & Tablets/Components & P...   52.0         0   
2                        Women/Tops & Blouses/Blouse   10.0         1   

                                    item_description    target  category  \
0                                 No description yet  2.397895       829   
1  This keyboard is in great condition and works ...  3.970292        86   
2  Adorable top with a hint of lace and a key hol...  2.397895      1277   

   brand  
0   5263  
1   3887  
2   4586  


In [79]:
#PROCESS TEXT: RAW
print("Text to seq process...")
print("   Fitting tokenizer...")

text_columns = ["category_name", "item_description", "name"]

# The vocabulary is built from the lower-cased training text only.
raw_text = np.hstack([train[col].str.lower() for col in text_columns])

tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)


print("   Transforming text to seq...")
# Every text column gets a matching "seq_<column>" column of token-id lists.
for col in text_columns:
    for frame in (train, test):
        frame["seq_" + col] = tok_raw.texts_to_sequences(frame[col].str.lower())



print_memory_usage()

Text to seq process...
   Fitting tokenizer...
   Transforming text to seq...
cpu: 16.0
consuming 27.42GB RAM


In [80]:
train.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand,seq_category_name,seq_item_description,seq_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,10.0,1,No description yet,2.397895,829,5263,"[77, 41, 71, 72]","[13, 88, 102]","[2491, 8914, 6992, 71, 99, 7, 199]"
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,52.0,0,This keyboard is in great condition and works ...,3.970292,86,3887,"[62, 922, 828, 3281, 1381]","[33, 2747, 11, 8, 50, 18, 1, 257, 65, 21, 1219...","[10839, 25624, 16431, 2747]"
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,10.0,1,Adorable top with a hint of lace and a key hol...,2.397895,1277,4586,"[2, 41, 75, 277]","[702, 74, 10, 5, 5465, 12, 244, 1, 5, 992, 140...","[7728, 10643, 277]"


In [81]:
# encoded_docs = t.texts_to_matrix(docs, mode='count')
# print(encoded_docs)

In [82]:
# X_train_tfidf_item_desc = tok_raw.texts_to_matrix(train.item_description.str.lower(), mode='tfidf')
# X_test_tfidf_item_desc = tok_raw.texts_to_matrix(test.item_description.str.lower(), mode='tfidf')

# X_train_count_name = tok_raw.sequences_to_matrix(train.seq_name, mode='count')
# X_test_count_name = tok_raw.texts_to_matrix(test.name.str.lower(), mode='count')


In [83]:
cv = CountVectorizer(min_df=NAME_MIN_DF, stop_words='english')
X_name = cv.fit_transform(train['name'])

In [84]:
X_name.shape

(1481661, 17514)

In [85]:
X_name.dtype

dtype('int64')

In [86]:
print(train.head(3))

print('[{}] Finished PROCESSING TEXT DATA...'.format(time.time() - start_time))
print_memory_usage()

   train_id                                 name  item_condition_id  \
0         0  MLB Cincinnati Reds T Shirt Size XL                  3   
1         1     Razer BlackWidow Chroma Keyboard                  3   
2         2                       AVA-VIV Blouse                  1   

                                       category_name  price  shipping  \
0                                  Men/Tops/T-shirts   10.0         1   
1  Electronics/Computers & Tablets/Components & P...   52.0         0   
2                        Women/Tops & Blouses/Blouse   10.0         1   

                                    item_description    target  category  \
0                                 No description yet  2.397895       829   
1  This keyboard is in great condition and works ...  3.970292        86   
2  Adorable top with a hint of lace and a key hol...  2.397895      1277   

   brand           seq_category_name  \
0   5263            [77, 41, 71, 72]   
1   3887  [62, 922, 828, 3281, 1381] 

In [87]:
#EXTRACT DEVELOPMENT TEST
from sklearn.model_selection import train_test_split
train = train.reset_index(drop=True)
dtrain, dvalid = train_test_split(train, random_state=233, train_size=0.99)

# dtrain_n, dvalid_n = train_test_split(X_name, random_state=233, train_size=0.99)

print(dtrain.shape)
print(dvalid.shape)

# print(dtrain_n.shape)
# print(dvalid_n.shape)

print_memory_usage()



(1466844, 13)
(14817, 13)
cpu: 14.1
consuming 27.88GB RAM


In [88]:
dtrain.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand,seq_category_name,seq_item_description,seq_name
314536,314685,Old Navy Halter Tank Top Built-in Bra M,3,Women/Tops & Blouses/Halter,11.0,0,Old Navy Halter Tank Top Ties behind neck Buil...,2.484907,1279,3429,"[2, 41, 75, 997]","[449, 245, 997, 157, 74, 1613, 3989, 404, 1126...","[449, 245, 997, 157, 74, 1126, 8, 190, 164]"
1409178,1410006,NEW 21st BIRTHDAY SASH!!!,1,Kids/Toys/Party Supplies,12.0,0,New :) Tags: 21 twenty one party decor decorat...,2.564949,716,5263,"[35, 130, 843, 260]","[6, 80, 356, 4974, 52, 843, 875, 3464, 7386, 1...","[6, 7795, 938, 7386]"
845827,846289,Women's Adidas Superstar,2,Women/Shoes/Fashion Sneakers,40.0,0,"-Authentic, No Box -Color: Black/ Rose Gold -N...",3.713572,1228,5208,"[2, 39, 301, 351]","[126, 13, 76, 61, 26, 341, 128, 6, 46, 38, 58,...","[58, 393, 4690]"
947848,948376,Grey Converse size 7,4,Women/Shoes/Fashion Sneakers,14.0,1,"Are not new, have some dirt on the white rubbe...",2.70805,1228,1143,"[2, 39, 301, 351]","[29, 51, 6, 55, 182, 1650, 15, 3, 79, 1009, 90...","[239, 740, 7, 83]"
1240269,1240982,Bradford exchange pearl necklace,3,Women/Jewelry/Necklaces,12.0,1,Beautiful Tahitian Pearl on 18 inch necklace b...,2.564949,1204,5263,"[2, 106, 349]","[212, 14985, 1122, 15, 364, 604, 312, 131, 182...","[18233, 3405, 1122, 312]"


In [89]:
tr_id = dtrain.index
v_id = dvalid.index

In [90]:
merge = pd.concat([train, test], axis=0)

In [91]:
%%time
from sklearn.decomposition import TruncatedSVD
cv = CountVectorizer(min_df=NAME_MIN_DF, stop_words='english')
cv = cv.fit(merge['name'])
tr_x1 = cv.transform(train['name'])
n_components = 1000
pca = TruncatedSVD(n_components)
tr_x1 = pca.fit_transform(tr_x1)
# tr_x1 = m_x1[:len(train)]
# te_x1 = m_x1[len(train):]
# d_x1, v_x1 = train_test_split(tr_x1, random_state=233, train_size=0.99)


CPU times: user 22min 23s, sys: 2min 30s, total: 24min 54s
Wall time: 9min 28s


In [92]:
d_x1 = tr_x1[tr_id]
v_x1 = tr_x1[v_id]

In [93]:
tr_id.shape

(1466844,)

In [94]:
np.max(list(tr_id))

1481660

In [95]:
np.max(list(dtrain.index))

1481660

In [96]:
tr_x1.shape

(1481661, 1000)

In [97]:
%%time
cv = CountVectorizer(min_df=NAME_MIN_DF, stop_words='english')
d_x11 = cv.fit_transform(dtrain['name'])
v_x11 = cv.transform(dvalid['name'])
t_x11 = cv.transform(test['name'])

CPU times: user 35.7 s, sys: 0 ns, total: 35.7 s
Wall time: 35.6 s


In [None]:
%%time
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                                         ngram_range=(1, 3),
                                         stop_words='english')
tv = tv.fit(merge['item_description'])
tr_x2 = tv.transform(train['item_description'])
# n_components = 10000
# pca = TruncatedSVD(n_components)
# tr_x2 = pca.fit_transform(tr_x2)

# d_x2 = tv.fit_transform(dtrain['item_description'])
# v_x2 = tv.transform(dvalid['item_description'])
# t_x2 = tv.transform(test['item_description'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))
# print(X_description.shape)
# print(np.min(X_description))
# print(np.max(X_description))
# del merge['item_description']
print_memory_usage()

[2144.9228768348694] Finished TFIDF vectorize `item_description`
cpu: 20.7
consuming 49.37GB RAM
CPU times: user 8min 13s, sys: 1.7 s, total: 8min 15s
Wall time: 8min 15s


In [None]:
d_x2 = tr_x2[tr_id]
v_x2 = tr_x2[v_id]

In [None]:
tr_x2.shape

In [None]:
d_x1.shape

In [43]:
v_x1.shape

(14817, 1000)

In [44]:
#EMBEDDINGS MAX VALUE
#Based on the histograms, we select the following lengths
MAX_NAME_SEQ = 20 #17
MAX_ITEM_DESC_SEQ = 60 #269
MAX_CATEGORY_NAME_SEQ = 20 #8
# Size of the text embedding tables. The tokenizer hands out ids
# 1..len(word_index) and pad_sequences fills with 0, so the table needs
# len(word_index) + 1 rows. Taking Series.max() over the seq_* columns is not
# equivalent: it compares the per-row lists lexicographically and so yields the
# largest id of a single row, which can be smaller than ids used elsewhere.
MAX_TEXT = len(tok_raw.word_index) + 1
MAX_CATEGORY = np.max([train.category.max(), test.category.max()])+1
MAX_BRAND = np.max([train.brand.max(), test.brand.max()])+1
MAX_CONDITION = np.max([train.item_condition_id.max(), 
                        test.item_condition_id.max()])+1

print('[{}] Finished EMBEDDINGS MAX VALUE...'.format(time.time() - start_time))
print_memory_usage()

[4434.248512744904] Finished EMBEDDINGS MAX VALUE...
cpu: 3.5
consuming 28.41GB RAM


In [45]:
#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset, f_train=1, x1=None):
    """Build the dict of named input arrays that the Keras model consumes.

    Args:
        dataset: DataFrame with the `seq_name`, `seq_item_description`,
            `seq_category_name`, `brand`, `category`, `item_condition_id`
            and `shipping` columns.
        f_train: legacy selector, used only when `x1` is not given:
            0 picks the module-level `v_x1`, any other value picks `d_x1`.
        x1: optional feature matrix whose rows line up with `dataset`. Pass it
            explicitly for any frame other than the train/validation split;
            the fallback matrices only describe those two splits.

    Returns:
        dict mapping each model input name to its array. Text sequences are
        padded/truncated to the MAX_*_SEQ lengths.
    """
    if x1 is None:
        x1 = v_x1 if f_train == 0 else d_x1
    if x1.shape[0] != len(dataset):
        # Mismatched rows mean the features belong to a different frame
        # (e.g. training features attached to the test set).
        import warnings
        warnings.warn(
            "get_keras_data: x1 has {} rows but dataset has {}; pass a "
            "row-aligned matrix through the `x1` argument".format(
                x1.shape[0], len(dataset)))
    return {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description,
                                   maxlen=MAX_ITEM_DESC_SEQ),
        'brand': np.array(dataset.brand),
        'category': np.array(dataset.category),
        'category_name': pad_sequences(dataset.seq_category_name,
                                       maxlen=MAX_CATEGORY_NAME_SEQ),
        'item_condition': np.array(dataset.item_condition_id),
        'x1': x1,
        'num_vars': np.array(dataset[["shipping"]]),
    }

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid, 0)
X_test = get_keras_data(test)

print('[{}] Finished DATA PREPARARTION...'.format(time.time() - start_time))
print_memory_usage()


[4511.095053434372] Finished DATA PREPARARTION...
cpu: 10.4
consuming 30.39GB RAM


In [46]:
# X_train['x1'].shape

In [47]:
# X_valid['x1'].shape

In [57]:
dr = 0.2

def get_model():
    """Build and compile the multi-input Keras regression model.

    Input widths are read from the module-level `X_train` dict, so that dict
    must exist before this function is called. The three text inputs are
    embedded and summarised by separate GRUs; brand, category and item
    condition use small flattened embeddings; `x1` and `num_vars` are fed in
    as-is. Everything is concatenated and passed through two ReLU dense
    layers into a single linear output.

    Returns:
        A Model compiled with MSE loss and a default-configured Adam optimizer.
    """
    #params
    # Copied from the module-level `dr`; not referenced again in this function.
    dr_r = dr
    
    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand = Input(shape=[1], name="brand")
    category = Input(shape=[1], name="category")
    
    category_name = Input(shape=[X_train["category_name"].shape[1]], 
                          name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    x1 = Input(shape=[X_train["x1"].shape[1]], name="x1")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    #Embeddings layers
    emb_size = 60
    
    # Each text input gets its own embedding table of MAX_TEXT rows; name and
    # category_name use a third of the item_desc embedding width.
    emb_name = Embedding(MAX_TEXT, emb_size//3)(name)
    emb_item_desc = Embedding(MAX_TEXT, emb_size)(item_desc)
    emb_category_name = Embedding(MAX_TEXT, emb_size//3)(category_name)
    emb_brand = Embedding(MAX_BRAND, 10)(brand)
    emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    # One GRU per embedded text sequence.
    rnn_layer1 = GRU(24) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_category_name)
    rnn_layer3 = GRU(20) (emb_name)
    
    #main layer
    main_l = concatenate([
        Flatten() (emb_brand)
        , Flatten() (emb_category)
        , Flatten() (emb_item_condition)
        , rnn_layer1
        , rnn_layer2
        , rnn_layer3
        , x1
        , num_vars
    ])
    # The dropout rate is the literal 0.1 here, independent of `dr` / `dr_r`.
    main_l = Dropout(0.1)(Dense(1024,activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(64,activation='relu') (main_l))
#     main_l = Dropout(dr)(Dense(32,activation='relu') (main_l))
    
    #output
    output = Dense(1,activation="linear") (main_l)
    
    #model
#     model = Model([name, item_desc, brand
#                    , category, category_name
#                    , item_condition, x1, num_vars], output)
    
    # The order of this input list is the order in which list-style batches
    # (as yielded by the generators in this notebook) must supply their arrays.
    model = Model([name, item_desc, brand
                   , category, category_name
                   , item_condition, x1, num_vars], output)
    #optimizer = optimizers.RMSprop()
    optimizer = optimizers.Adam()
    model.compile(loss="mse", 
                  optimizer=optimizer)
    return model

def eval_model1(model):
    """Predict on the module-level X_valid and return the RMSLE on price.

    The model's outputs are passed through np.expm1 before being compared with
    `dvalid.price`; the score is printed and returned.
    """
    predicted_prices = np.expm1(model.predict(X_valid))[:, 0]
    actual_prices = np.array(dvalid.price.values)

    score = rmsle(actual_prices, predicted_prices)
    print(" RMSLE error on dev test: "+str(score))
    return score


#fin_lr=init_lr * (1/(1+decay))**(steps-1)
def exp_decay(init, fin, steps):
    """Return the per-step decay that takes `init` to `fin` in `steps` steps.

    Solves fin = init * (1 / (1 + decay)) ** (steps - 1) for `decay`.

    Args:
        init: initial learning rate.
        fin: learning rate wanted at the last step (non-zero).
        steps: total number of optimisation steps.

    Returns:
        The decay value; 0.0 when `steps` is 1 or less, since there is no
        interval to decay over (the bare formula would divide by zero).
    """
    # NOTE(review): confirm the optimizer applies `decay` per the formula
    # above; Keras' built-in decay may use lr / (1 + decay * iterations).
    if steps <= 1:
        return 0.0
    return (init / fin) ** (1 / (steps - 1)) - 1

print('[{}] Finished DEFINEING MODEL...'.format(time.time() - start_time))
print_memory_usage()


[5826.686835050583] Finished DEFINEING MODEL...
cpu: 55.2
consuming 30.93GB RAM


In [58]:
X_train['name'].shape

(1466844, 20)

In [59]:
X_train['category'][0]

1279

In [60]:
1466844/1536

954.9765625

In [61]:
1466844 - (954 * 1536)

1500

In [62]:
1466844//512

2864

In [63]:
# import threading
# import multiprocessing
# # the following functions allow for a parallelized batch generator
# class threadsafe_iter(object):
#     """
#     Takes an iterator/generator and makes it thread-safe by
#     serializing call to the `next` method of given iterator/generator.
#     """
#     def __init__(self, it):
#         self.it = it
#         self.lock = threading.Lock()
#     def __iter__(self):
#         return self

#     def __next__(self):
#         with self.lock:
#             return next(self.it)

# def threadsafe_generator(f):
#     """
#     A decorator that takes a generator function and makes it thread-safe.
#     """
#     def g(*a, **kw):
#         return threadsafe_iter(f(*a, **kw))
#     return g

# @threadsafe_generator
# def batch_generator(X, y_data, batch_size):
    
#     #index = np.random.permutation(X_data.shape[0])    
#     #X_data = X_data[index]
#     #y_data = y_data[index]
    
#     samples_per_epoch =X['name'].shape[0]
#     number_of_batches = samples_per_epoch//batch_size
#     n_batches_for_epoch = number_of_batches
#     counter=0
#     index = np.arange(np.shape(y_data)[0])
#     #idx = 1
#     while 1:
# #         try:
#         for i in range(n_batches_for_epoch):
#             index_batch = index[batch_size*counter:batch_size*(counter+1)]

#             #         print(index_batch)
#             name  = X['name'][index_batch,:] 
#             item_desc = X['item_desc'][index_batch,:]
#             brand = X['brand'][index_batch].reshape(batch_size,1) 
#             category = X['category'][index_batch].reshape(batch_size,1) 
#             category_name = X['category_name'][index_batch,:]
#             item_condition = X['item_condition'][index_batch].reshape(batch_size,1) 
#     #         arr = np.zeros(1536, 17405)
#     #                 arr = X['x1'][index_batch,:].toarray()
#     #                 arr = X['x1'][index_batch,:]
#             num_vars = X['num_vars'][index_batch,:]
#     #         X_batch = X[index_batch,:].todense()
#     #         print(name.shape, item_desc.shape, brand.shape, category.shape, category_name.shape, item_condition.shape, x1.shape, num_vars.shape)
#     #         X = np.hstack([name,item_desc,brand,category,category_name,item_condition,x1,num_vars])
#             y_batch = y[batch_size*i:batch_size*(i+1)]
#     #         yield(np.array(X_batch),y_batch)
#     #                 yield([name,item_desc,brand,category,category_name,item_condition,arr,num_vars],y_batch)
#             yield([name,item_desc,brand,category,category_name,item_condition,num_vars],y_batch)
#             #print("")
#             #print(X_batch.shape)
#             #print("")
#             #print('generator yielded a batch %d' % idx)
#             #idx += 1
#             if (counter > number_of_batches):
#                 counter=0
# #         except:
# #             pass
            
# @threadsafe_generator
# def val_generator(X,batch_size):
#     samples_per_epoch = X['name'].shape[0]
#     number_of_batches = samples_per_epoch//batch_size
#     n_batches_for_epoch = number_of_batches
#     counter=0
#     index = np.arange(np.shape(X['name'])[0])
#     while 1:
# #         try:
#         for i in range(n_batches_for_epoch):
#             index_batch = index[batch_size*counter:batch_size*(counter+1)]

#             #         print(index_batch)
#             name  = X['name'][index_batch,:] 
#             item_desc = X['item_desc'][index_batch,:]
#             brand = X['brand'][index_batch].reshape(batch_size,1) 
#             category = X['category'][index_batch].reshape(batch_size,1) 
#             category_name = X['category_name'][index_batch,:]
#             item_condition = X['item_condition'][index_batch].reshape(batch_size,1) 
#     #         arr = np.zeros(1536, 17405)
#     #                 arr = X['x1'][index_batch,:].toarray()
#     #                 arr = X['x1'][index_batch,:]
#             num_vars = X['num_vars'][index_batch,:]
#     #         X_batch = X[index_batch,:].todense()
#     #         print(name.shape, item_desc.shape, brand.shape, category.shape, category_name.shape, item_condition.shape, x1.shape, num_vars.shape)
#     #         X = np.hstack([name,item_desc,brand,category,category_name,item_condition,x1,num_vars])
# #             y_batch = y[batch_size*i:batch_size*(i+1)]
#     #         yield(np.array(X_batch),y_batch)
#     #                 yield([name,item_desc,brand,category,category_name,item_condition,arr,num_vars],y_batch)
#             yield([name,item_desc,brand,category,category_name,item_condition,num_vars])
#             if (counter > number_of_batches):
#                 counter=0
# #         except:
# #             pass
        


In [64]:
%%time
gc.collect()
#FITTING THE MODEL
epochs = 3
BATCH_SIZE = 512 * 3
# BATCH_SIZE = 32
# Total number of full batches over all epochs; this is the step count used
# to derive the learning-rate decay.
steps = int(len(X_train['name'])//BATCH_SIZE) * epochs
lr_init, lr_fin = 0.013, 0.009
lr_decay = exp_decay(lr_init, lr_fin, steps)
# Run label that is later embedded in the submission file name.
# NOTE(review): the 'dr' field records the module-level `dr`, while get_model()
# builds its Dropout layers with a literal 0.1 -- confirm the label is accurate.
log_subdir = '_'.join(['ep', str(epochs),
                    'bs', str(BATCH_SIZE),
                    'lrI', str(lr_init),
                    'lrF', str(lr_fin),
                    'dr', str(dr)])


model = get_model()
# Override the optimizer's default learning rate and decay on the compiled model.
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

# NOTE(review): this callback is not passed to the fit_generator call in this
# cell, so it has no effect. The model is compiled with a loss only, so 'mse'
# is presumably not a logged quantity either -- 'val_loss' is likely intended.
earlystop = EarlyStopping(monitor='mse', min_delta=0.0001, patience=1, \
                          verbose=1, mode='auto')

def batch_generator(X, y, batch_size, drop_remainder=True):
    """Yield (inputs, targets) mini-batches forever, cycling over the data.

    `inputs` is a list of arrays ordered as name, item_desc, brand, category,
    category_name, item_condition, x1, num_vars. The 1-D brand, category and
    item_condition arrays are reshaped to a single column.

    Exceptions are allowed to propagate. The former blanket `except: pass`
    swallowed GeneratorExit (so the generator could not be closed) and turned
    any real error into a silent endless loop.

    Args:
        X: dict of equally long arrays keyed by the input names above.
        y: targets aligned with X by position (Series, list or array).
        batch_size: number of rows per batch, must be positive.
        drop_remainder: when True (default), a trailing batch smaller than
            `batch_size` is skipped on every pass; when False it is yielded.

    Yields:
        Tuples `(inputs, y_batch)`.

    Raises:
        ValueError: on first iteration, if `batch_size` is not positive or the
            data cannot fill a single batch.
    """
    n_samples = X['name'].shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    if drop_remainder:
        n_batches = n_samples // batch_size
    else:
        n_batches = -(-n_samples // batch_size)  # ceiling division
    if n_batches == 0:
        # Without this guard the loop below would spin forever yielding nothing.
        raise ValueError("not enough samples ({}) for one batch of size {}"
                         .format(n_samples, batch_size))

    # A plain array guarantees positional slicing regardless of y's index.
    targets = np.asarray(y)
    while True:
        for i in range(n_batches):
            rows = slice(batch_size * i, batch_size * (i + 1))
            inputs = [
                X['name'][rows, :],
                X['item_desc'][rows, :],
                X['brand'][rows].reshape(-1, 1),
                X['category'][rows].reshape(-1, 1),
                X['category_name'][rows, :],
                X['item_condition'][rows].reshape(-1, 1),
                X['x1'][rows, :],
                X['num_vars'][rows, :],
            ]
            yield (inputs, targets[rows])

# model.fit_generator(generator=batch_generator(X_train, dtrain.target, BATCH_SIZE),
#     nb_epoch=epochs, 
#     samples_per_epoch=X_train['name'].shape[0])
# model.fit_generator(generator=batch_generator(X_train, dtrain.target, BATCH_SIZE),
#                     workers=4, 
#                     steps_per_epoch=1024, #samples_per_epoch=1024,
#                     max_queue_size=128,
#                     epochs=3, 
#                     verbose=1,
#                     use_multiprocessing=False
#                    )
# Keras 2 keyword names: steps_per_epoch / validation_steps are counts of
# batches drawn from the generators, which is what these expressions compute.
model.fit_generator(generator=batch_generator(X_train, dtrain.target, BATCH_SIZE),
                    steps_per_epoch=len(X_train['name'])//BATCH_SIZE,
                    epochs=epochs, verbose=1,
                    validation_data=batch_generator(X_valid, dvalid.target, BATCH_SIZE),
                    validation_steps=len(X_valid['name'])//BATCH_SIZE)

# history = model.fit_generator(X_train, dtrain.target
#                     , epochs=epochs
#                     , batch_size=BATCH_SIZE
#                     , validation_split=0.01
                    #, callbacks=[TensorBoard('./logs/'+log_subdir)]
#                     , verbose=10
#                     )
print('[{}] Finished FITTING MODEL...'.format(time.time() - start_time))




Epoch 1/3
Epoch 2/3
Epoch 3/3
[6878.05631685257] Finished FITTING MODEL...
CPU times: user 2h 15min 20s, sys: 18min 55s, total: 2h 34min 15s
Wall time: 17min 26s


Exception ignored in: <generator object batch_generator at 0x7f26e10924c0>
RuntimeError: generator ignored GeneratorExit
Exception ignored in: <generator object batch_generator at 0x7f26e1092780>
RuntimeError: generator ignored GeneratorExit


In [65]:
%%time
def val_generator(X, steps=1):
    """Yield input batches (no targets) forever, for predict_generator.

    Each batch is a list of arrays ordered as name, item_desc, brand, category,
    category_name, item_condition, x1, num_vars. One pass covers every row
    exactly once; a trailing batch smaller than the batch size is yielded
    rather than dropped, so predictions are not lost for any row.

    Exceptions are allowed to propagate. The former blanket `except: pass`
    swallowed GeneratorExit and hid real errors behind an endless loop.

    Args:
        X: dict of equally long arrays keyed by the input names above.
        steps: number of rows per yielded batch (despite the name), must be
            positive.

    Yields:
        Lists of eight arrays, one per model input.

    Raises:
        ValueError: on first iteration, if `steps` is not positive or `X`
            holds no rows.
    """
    batch_size = steps
    n_samples = X['name'].shape[0]
    if batch_size <= 0:
        raise ValueError("steps (the batch size) must be positive")
    if n_samples == 0:
        # An empty pass would make the loop below spin without yielding.
        raise ValueError("X contains no rows")

    while True:
        for start in range(0, n_samples, batch_size):
            rows = slice(start, start + batch_size)
            yield [
                X['name'][rows, :],
                X['item_desc'][rows, :],
                X['brand'][rows].reshape(-1, 1),
                X['category'][rows].reshape(-1, 1),
                X['category_name'][rows, :],
                X['item_condition'][rows].reshape(-1, 1),
                X['x1'][rows, :],
                X['num_vars'][rows, :],
            ]
s = X_valid['name'].shape[0]//BATCH_SIZE
print(s)        
def eval_model(model):
    """Predict the validation rows one at a time and return the RMSLE on price.

    Rows of the module-level X_valid are fed through val_generator with a batch
    size of 1, one step per row. The outputs are passed through np.expm1 and
    compared with `dvalid.price`. Both lengths and the score are printed.
    """
    n_valid = X_valid['name'].shape[0]
    raw_preds = model.predict_generator(val_generator(X_valid, 1), steps=n_valid)
    predicted_prices = np.expm1(raw_preds)[:, 0]
    actual_prices = np.array(dvalid.price.values)

    print(len(actual_prices), len(predicted_prices))
    score = rmsle(actual_prices, predicted_prices)
    print(" RMSLE error on dev test: "+str(score))
    return score

#EVALUATE THE MODEL ON DEV TEST
v_rmsle = eval_model(model)
print('[{}] Finished predicting valid set...'.format(time.time() - start_time))


9
14817 14817
 RMSLE error on dev test: 0.43980186298048507
[6938.738924026489] Finished predicting valid set...
CPU times: user 3min 9s, sys: 59.3 s, total: 4min 8s
Wall time: 1min


Exception ignored in: <generator object val_generator at 0x7f26e2790fc0>
RuntimeError: generator ignored GeneratorExit


In [None]:
# after index fix:
# RMSLE error on dev test: 0.43980186298048507

# with x1 = d_x1
# RMSLE error on dev test: 0.45632179812724843
# with x1 = d_x11
# RMSLE error on dev test: 0.44313618817750944

# GRU 24, 8, 20, dropout 0.1
#  RMSLE error on dev test: 0.43742587571401953
# [21195.142282009125] Finished predicting valid set...
# CPU times: user 2min 55s, sys: 55.3 s, total: 3min 50s
# Wall time: 56 s
    
# GRU 24, 8, 20, dropout 0.25
# RMSLE error on dev test: 0.4409836531837504
# [18314.209129810333] Finished predicting valid set...
# CPU times: user 3min 20s, sys: 55.4 s, total: 4min 15s
# Wall time: 1min 41s

In [None]:
print(" RMSLE error on dev test: "+str(v_rmsle))

 RMSLE error on dev test: 0.44313618817750944


In [None]:
#CREATE PREDICTIONS
# NOTE(review): X_test comes from get_keras_data(test), whose default `x1` is
# the training matrix d_x1 -- confirm test-set features are supplied before
# trusting these predictions.
preds = model.predict(X_test, batch_size=BATCH_SIZE)
preds = np.expm1(preds)
print('[{}] Finished predicting test set...'.format(time.time() - start_time))
# Only the first `test_len` rows are the real test rows. Copy them so the new
# column is written to an independent frame rather than a slice of `test`.
submission = test[["test_id"]][:test_len].copy()
# The model has a single output unit; take that column as a 1-D price vector.
submission["price"] = preds[:test_len, 0]
submission.to_csv("../submissions/sub"+log_subdir+"_{:.6}.csv".format(v_rmsle), index=False)
print('[{}] Finished submission...'.format(time.time() - start_time))

ValueError: setting an array element with a sequence.