In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the p7zip command-line tool so the competition's .7z archives can be unpacked.
!apt-get install p7zip
# Unpack the train, test and sample-submission archives.
# Presumably -d = decompress, -f = force, -k = keep the source archive — confirm with the p7zip man page.
# NOTE(review): a later cell reads "train.tsv" via a relative path, so the extracted files are
# expected to end up in the current working directory — confirm.
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))``.

    Args:
        y: Array-like of true values (each greater than -1).
        y_pred: Array-like of predicted values (each greater than -1), with
            the same length as ``y``.

    Returns:
        The RMSLE as a Python float.

    Raises:
        AssertionError: If the two inputs differ in length.
        ValueError: If the inputs are empty, or contain a value <= -1 for
            which ``log(value + 1)`` is undefined.
    """
    assert len(y) == len(y_pred)
    # Convert to plain arrays so elements are paired by position; indexing a
    # pandas Series with y[i] is label-based and fails on a shuffled index.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        raise ValueError("rmsle requires at least one observation")
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("rmsle inputs must be greater than -1")
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [None]:
# Load the tab-separated training set from the working directory.
train = pd.read_csv("train.tsv", sep="\t")

In [None]:
def transform_category_name(category_name):
    """Split a "main/sub1/sub2" category path into its three levels.

    Args:
        category_name: Category path string. Non-string values (for example
            NaN for a missing category) are accepted.

    Returns:
        ``(main, sub1, sub2)`` when the value is a string with exactly three
        "/"-separated parts, otherwise ``("none", "none", "none")``.
    """
    try:
        main, sub1, sub2 = category_name.split('/')
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the value is not a str (e.g. NaN).
        # ValueError: the path does not have exactly three parts.
        # NOTE(review): paths with more than three levels also land here and
        # lose their category information — confirm that is intended.
        return "none", "none", "none"
    return main, sub1, sub2

# Derive the three category-level columns from the raw category path.
split_categories = train['category_name'].apply(transform_category_name)
main_values, sub1_values, sub2_values = zip(*split_categories)
train['category_main'] = main_values
train['subcat_1'] = sub1_values
train['subcat_2'] = sub2_values
train.head(50)

In [None]:
# Number of missing values in each column.
train.isna().sum()

In [None]:
def handle_missing(dataset):
    """Replace missing text fields with the literal string "missing".

    Fills ``category_name``, ``brand_name`` and ``item_description``. The
    frame is modified in place and also returned, so both
    ``handle_missing(df)`` and ``df = handle_missing(df)`` work.

    Args:
        dataset: DataFrame containing the three columns above.

    Returns:
        The same DataFrame object, with the three columns filled.
    """
    # Assign the filled column back explicitly: calling fillna(inplace=True)
    # on an attribute-accessed column is a chained in-place operation, which
    # does not update the parent frame under pandas copy-on-write.
    for column in ("category_name", "brand_name", "item_description"):
        dataset[column] = dataset[column].fillna("missing")
    return dataset

In [None]:
# Fill missing category, brand and description values in the training frame.
train = handle_missing(train)

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
print("Handling categorical variables...")
# One encoder instance is refitted for each column in turn, so every column
# gets its own independent integer coding.
le = LabelEncoder()
for categorical_column in ("category_main", "subcat_1", "subcat_2", "brand_name"):
    train[categorical_column] = le.fit_transform(train[categorical_column])
del le

train.head(3)

In [None]:
from string import punctuation
from nltk import PorterStemmer
from nltk.corpus import stopwords
# (symbol, replacement) pairs: every punctuation character mapped to the empty string.
punctuation_symbols = [(symbol, '') for symbol in punctuation]


def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with every ASCII punctuation character removed.

    Args:
        sentence: Text to clean.

    Returns:
        A copy of ``sentence`` without any character from ``string.punctuation``.
    """
    # Use the imported name `punctuation`: only `from string import punctuation`
    # is executed, so the module name `string` is not bound and
    # `string.punctuation` would raise NameError.
    return sentence.translate(str.maketrans('', '', punctuation))


def remove_digits(x):
    """Return ``x`` with every character for which ``str.isdigit()`` is true removed."""
    non_digit_chars = [char for char in x if not char.isdigit()]
    return ''.join(non_digit_chars)


# English stop-word list from NLTK's stopwords corpus; consulted by remove_stop_words().
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally — confirm for the target environment.
stop = stopwords.words('english')


def remove_stop_words(x):
    """Lower-case ``x``, split it on single spaces and drop tokens listed in ``stop``.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in x.lower().split(' '):
        if token not in stop:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


def to_lower(x):
    """Return ``x`` converted to lower case via its ``lower()`` method."""
    lowered = x.lower()
    return lowered

In [None]:
# Disabled text-cleaning pipeline: wrapped in a bare string literal so none of it executes.
# NOTE(review): the only `import string` in this notebook lives inside this disabled block.
'''import string
train.item_description = train.item_description.astype(str)
train['item_description'] = train['item_description'].apply(remove_digits)
train['item_description'] = train['item_description'].apply(remove_punctuation)
train['item_description'] = train['item_description'].apply(remove_stop_words)
train['item_description'] = train['item_description'].apply(to_lower)
train['name'] = train['name'].apply(remove_digits)
train['name'] = train['name'].apply(remove_punctuation)
train['name'] = train['name'].apply(remove_stop_words)
train['name'] = train['name'].apply(to_lower)
train.head(50) 
'''

In [None]:
print("Text to seq process...")
from keras.preprocessing.text import Tokenizer
# Lower-case each text column once and reuse it for fitting and transforming.
item_description_lower = train.item_description.str.lower()
name_lower = train.name.str.lower()
raw_text = np.hstack([item_description_lower, name_lower])

print("   Fitting tokenizer...")
# A single tokenizer is fitted on descriptions and names together, so both
# columns share one vocabulary.
tok_raw = Tokenizer(num_words=3000)
tok_raw.fit_on_texts(raw_text)
print("   Transforming text to seq...")

train["seq_item_description"] = tok_raw.texts_to_sequences(item_description_lower)
train["seq_name"] = tok_raw.texts_to_sequences(name_lower)
train.head(3)

In [None]:
# Length, in tokens, of the longest tokenised name and item description.
max_name_seq = train.seq_name.map(len).max()
max_seq_item_description = train.seq_item_description.map(len).max()
print(f"max name seq {max_name_seq}")
print(f"max item desc seq {max_seq_item_description}")

In [None]:
# Fixed lengths the name / description sequences are padded or truncated to.
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 75
# Embedding input sizes: each must exceed the largest integer id fed to the layer.
# The largest token id is searched across *all* sequences. Series.max() on a
# column of lists returns the lexicographically greatest list, whose own
# maximum is not necessarily the global one.
MAX_TEXT = max(
    (
        max(token_ids, default=0)
        for seq_column in (train.seq_name, train.seq_item_description)
        for token_ids in seq_column
    ),
    default=0,
) + 2
# One shared size covers the three category-level embeddings.
MAX_CATEGORY = train[["category_main", "subcat_1", "subcat_2"]].max().max() + 2
MAX_BRAND = train.brand_name.max()+1
MAX_CONDITION = train.item_condition_id.max()+1

In [None]:
# Regression target: log(price + 1), rescaled to the range [-1, 1].
log_price = np.log(train.price + 1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
train["target"] = target_scaler.fit_transform(log_price.values.reshape(-1, 1))
train[["target"]].hist()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; the seed keeps the split reproducible.
dtrain, dvalid = train_test_split(train, train_size=0.70, random_state=123)
for data_split in (dtrain, dvalid):
    print(data_split.shape)

In [None]:
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset):
    """Build the dict of model inputs, keyed by input name, from a data frame.

    The two token-sequence columns are padded/truncated to ``MAX_NAME_SEQ`` and
    ``MAX_ITEM_DESC_SEQ``; the encoded categorical columns are passed through
    as arrays; ``shipping`` is kept as a single-column 2-D array.

    Args:
        dataset: DataFrame holding the sequence, categorical and shipping columns.

    Returns:
        dict mapping each input name to its numpy array.
    """
    features = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
    }
    # (input name, source column) pairs for the integer-coded categoricals.
    categorical_inputs = (
        ('brand_name', 'brand_name'),
        ('category_main', 'category_main'),
        ('subcat_1', 'subcat_1'),
        ('subcat_2', 'subcat_2'),
        ('item_condition', 'item_condition_id'),
    )
    for input_name, source_column in categorical_inputs:
        features[input_name] = np.array(dataset[source_column])
    features['num_vars'] = np.array(dataset[["shipping"]])
    return features

# Model inputs for the training and validation splits.
X_train, X_valid = (get_keras_data(data_part) for data_part in (dtrain, dvalid))

In [None]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K

def get_callbacks(filepath, patience=2):
    """Return the Keras training callbacks: early stopping plus best-model checkpointing.

    Args:
        filepath: Where ``ModelCheckpoint`` writes the model.
        patience: Epochs without ``val_loss`` improvement before training stops.

    Returns:
        ``[EarlyStopping, ModelCheckpoint]``.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, mode="min")
    checkpoint = ModelCheckpoint(filepath, save_best_only=True)
    return [early_stopping, checkpoint]

def rmsle_cust(y_true, y_pred):
    """RMSLE written with Keras backend ops.

    Both tensors are clipped from below at ``K.epsilon()`` before
    ``log(x + 1)`` is taken; the squared difference is averaged over the last
    axis and square-rooted.
    """
    def _clipped_log_plus_one(tensor):
        return K.log(K.clip(tensor, K.epsilon(), None) + 1.)

    squared_log_error = K.square(_clipped_log_plus_one(y_pred) - _clipped_log_plus_one(y_true))
    return K.sqrt(K.mean(squared_log_error, axis=-1))

def get_model():
    """Build and compile the GRU-based price regression network.

    The two text inputs (name, item description) are embedded and each is
    summarised by a GRU; the categorical inputs are embedded and flattened;
    everything is concatenated with the numeric input and passed through two
    dense layers to a single linear output. Input widths are read from the
    module-level ``X_train`` dict and vocabulary sizes from the ``MAX_*``
    constants.

    NOTE(review): a later cell defines another ``get_model`` (a Conv1D
    variant); when the cells run in order, that definition replaces this one.

    Returns:
        The Keras ``Model``, compiled with MSE loss and the Adam optimizer.
    """
    
    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    category_main = Input(shape=[1], name="category_main")
    subcat_1 = Input(shape=[1], name="subcat_1")
    subcat_2 = Input(shape=[1], name="subcat_2")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    
    #Embedding layers
    # An Embedding is used only as the first layer on an integer-coded input: it maps
    # each id to a trainable dense vector (here 50-, 10- or 5-dimensional).
    emb_name = Embedding(MAX_TEXT, 50)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_category_main = Embedding(MAX_CATEGORY, 10)(category_main)
    emb_subcat_1 = Embedding(MAX_CATEGORY, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_CATEGORY, 10)(subcat_2)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    #RNN layers
    
    rnn_layer1 = GRU(16) (emb_item_desc)  # GRU (a recurrent layer) is applied only to the text inputs that were converted to integer sequences
    rnn_layer2 = GRU(8) (emb_name)  # 16 and 8 are the output dimensionalities, i.e. the size of the vector handed to the next layer
    
    #Main layer. Keras `concatenate` merges the listed tensors into one.
    #Flatten turns each embedding output into a 1-D vector per sample,
    #which is what the following Dense layers take as input.
    
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_category_main)
        , Flatten() (emb_subcat_1)
        , Flatten() (emb_subcat_2)
        , Flatten() (emb_item_condition)
        , rnn_layer1
        , rnn_layer2
        , num_vars
    ])
    # NOTE(review): these Dense layers are given no activation, unlike the relu
    # layers in the Conv1D variant of get_model — confirm this is intended.
    main_l = Dropout(0.25) (Dense(128) (main_l))
    main_l = Dropout(0.1) (Dense(64) (main_l))
    
    #Output: a single linear neuron that predicts the (scaled) price target
    output = Dense(1, activation="linear") (main_l)
    
    #model
    model = Model([name, item_desc, brand_name
                   , category_main, subcat_1, subcat_2, item_condition, num_vars], output)
    model.compile(loss='mse', optimizer='adam')
    
    return model

In [None]:
from keras.layers import Flatten, Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.regularizers import Regularizer
from keras.callbacks import TensorBoard, ModelCheckpoint

def get_model():  
    """Build and compile the Conv1D-based price regression network.

    The two text inputs (name, item description) are embedded, passed through
    Conv1D layers with kernel sizes 1 and 2, and global-max-pooled; the
    categorical inputs are embedded and flattened; everything is concatenated
    with the numeric input and fed through two relu dense layers to a single
    linear output. Input widths are read from the module-level ``X_train``
    dict and vocabulary sizes from the ``MAX_*`` constants.

    NOTE(review): an earlier cell defines a GRU-based ``get_model``; this
    definition replaces it when the cells run in order.

    Returns:
        The Keras ``Model``, compiled with MSE loss and the Adam optimizer.
    """
    
    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    category_main = Input(shape=[1], name="category_main")
    subcat_1 = Input(shape=[1], name="subcat_1")
    subcat_2 = Input(shape=[1], name="subcat_2")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    #Embedding layers
    # An Embedding is used only as the first layer on an integer-coded input: it maps
    # each id to a trainable dense vector (here 50-, 10- or 5-dimensional).
    emb_name = Embedding(MAX_TEXT, 50)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_category_main = Embedding(MAX_CATEGORY, 10)(category_main)
    emb_subcat_1 = Embedding(MAX_CATEGORY, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_CATEGORY, 10)(subcat_2)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    # Pooled convolution features for the name (convs1) and the description (convs2).
    convs1 = []
    convs2 = []
    
    # One Conv1D + global max pool per kernel size, for each of the two text inputs.
    for filter_length in [1,2]:
        cnn_layer1 = Conv1D(filters=50, kernel_size=filter_length, padding='same', activation='relu', strides=1) (emb_name)
        cnn_layer2 = Conv1D(filters=50, kernel_size=filter_length, padding='same', activation='relu', strides=1) (emb_item_desc)
        
        maxpool1 = GlobalMaxPooling1D() (cnn_layer1)
        maxpool2 = GlobalMaxPooling1D() (cnn_layer2)
        
        convs1.append(maxpool1)
        convs2.append(maxpool2)

    # Merge the per-kernel-size features into one tensor per text input.
    convs1 = concatenate(convs1)
    convs2 = concatenate(convs2)
    
    # Combine categorical embeddings, text features and the numeric input.
    main_l = concatenate([
            Flatten() (emb_category_main),
            Flatten() (emb_subcat_1),
            Flatten() (emb_subcat_2),
            Flatten() (emb_brand_name),
            Flatten() (emb_item_condition),
            convs1, 
            convs2, 
            num_vars
    ])
    main_l = Dropout(0.25)(Dense(128, activation='relu') (main_l)) # author's note: .25 = .435 (presumably dropout rate vs. resulting score — unverified)
    main_l = Dropout(0.1)(Dense(64, activation='relu') (main_l)) # author's note: .1
    
    # Untried option: , kernel_regularizer=keras.regularizers.l2(0.01)
    # Output: a single linear neuron that predicts the (scaled) price target.
    output = Dense(1, activation='linear') (main_l)

    model = Model([name, item_desc, brand_name
                   , category_main, subcat_1, subcat_2, item_condition, num_vars], output)
    model.compile(loss='mse', optimizer='adam')
    
    return model

In [None]:
# Training hyper-parameters: samples per gradient step and number of passes over the data.
BATCH_SIZE = 20000
epochs = 5

# Build the network and print its layer/parameter summary.
# NOTE(review): two cells define get_model; whichever ran last is the one called here.
model = get_model()
model.summary()

In [None]:
# Train on the training split against the scaled "target" column, evaluating on
# the validation split after every epoch.
# NOTE(review): get_callbacks() is defined earlier but no callbacks are passed here,
# so this run has no early stopping and no checkpointing.
model.fit(X_train, dtrain.target, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, dvalid.target)
          , verbose=1)

In [None]:
import math

# Predict on the validation split, then undo the target transforms in reverse
# order: first the MinMax scaling, then the log(price + 1) transform, whose
# inverse is exp(x) - 1 (not exp(x) + 1, which shifts every price up by 2).
val_preds = model.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
val_preds = np.exp(val_preds) - 1

#mean_absolute_error, mean_squared_log_error
# Score the back-transformed predictions against the real prices.
y_true = np.array(dvalid.price.values)
y_pred = val_preds[:,0]
v_rmsle = rmsle(y_true, y_pred)
print(" RMSLE error on dev test: "+str(v_rmsle))