In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import time
from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder, Normalizer
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

import sys
import os
import random
import numpy as np
from keras import backend as K

from nltk.corpus import stopwords
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
#!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z

In [None]:
# Maximum number of distinct brand names kept by cutting(); rarer brands become 'missing'.
NUM_BRANDS = 4500
# Maximum number of distinct values kept per category level (general_cat, subcat_1, subcat_2) by cutting().
NUM_CATEGORIES = 1250

In [None]:
def rmsle(y, y0):
    """Root mean squared logarithmic error between two equal-length sequences.

    Both inputs are passed through ``np.log1p`` before differencing, so they
    are expected to be on the original (non-log) scale.

    Raises:
        AssertionError: if ``y`` and ``y0`` differ in length.
    """
    assert len(y) == len(y0)
    log_diff = np.log1p(y) - np.log1p(y0)
    mean_squared = np.mean(np.power(log_diff, 2))
    return np.sqrt(mean_squared)

In [None]:
# Training data: the tab-separated file extracted into the working directory by the p7zip cell.
train_df = pd.read_csv("train.tsv", sep='\t')
# Test data: read directly from the zipped tab-separated file in the input directory.
test_df = pd.read_csv(
    "../input/mercari-price-suggestion-challenge/test_stg2.tsv.zip",
    sep='\t',
)
# test_df = pd.read_table("test.tsv")

In [None]:
def split_cat(text):
    """Split a '/'-delimited category path into exactly three levels.

    Args:
        text: Category path such as ``"Men/Tops/T-shirts"``. Any non-string
            value (e.g. NaN for a missing category) is treated as missing.

    Returns:
        A 3-tuple ``(general_cat, subcat_1, subcat_2)``. Paths with more than
        three levels are truncated to the first three; paths with fewer are
        padded with ``"missing"``.
    """
    if not isinstance(text, str):
        return ("missing", "missing", "missing")
    # Always yield exactly three levels: callers unpack zip(*...) into three
    # columns, which fails if any row has fewer than three parts.
    levels = text.split("/")[:3]
    levels.extend(["missing"] * (3 - len(levels)))
    return tuple(levels)

In [None]:
def handle_missing_inplace(dataset):
    """Fill missing category, brand and description values in ``dataset``.

    The frame is modified in place and nothing is returned. Category levels
    and ``brand_name`` are filled with ``'missing'``; ``item_description`` is
    filled with ``'No description yet'``.
    """
    fill_values = {
        'general_cat': 'missing',
        'subcat_1': 'missing',
        'subcat_2': 'missing',
        'brand_name': 'missing',
        'item_description': 'No description yet',
    }
    for column, value in fill_values.items():
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selected column: that is chained assignment, which pandas
        # warns about and which does not update the frame under Copy-on-Write.
        dataset[column] = dataset[column].fillna(value)

In [None]:
def cutting(dataset, num_brands=None, num_categories=None):
    """Replace infrequent brand and category values with ``'missing'`` in place.

    For each of ``brand_name``, ``general_cat``, ``subcat_1`` and ``subcat_2``
    the most frequent values (ignoring ``'missing'`` itself) are kept and all
    other values are overwritten with ``'missing'``.

    Args:
        dataset: DataFrame containing the four columns above; modified in place.
        num_brands: Number of brand names to keep. Defaults to the module-level
            ``NUM_BRANDS``.
        num_categories: Number of values to keep per category level. Defaults
            to the module-level ``NUM_CATEGORIES``.
    """
    # Resolve defaults lazily so the function still follows the notebook-level
    # constants when called as cutting(dataset).
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    limits = (
        ('brand_name', num_brands),
        ('general_cat', num_categories),
        ('subcat_1', num_categories),
        ('subcat_2', num_categories),
    )
    for column, limit in limits:
        # value_counts() sorts by descending frequency, so the first `limit`
        # index entries are the most popular values.
        counts = dataset[column].value_counts()
        popular = counts.loc[lambda x: x.index != 'missing'].index[:limit]
        dataset.loc[~dataset[column].isin(popular), column] = 'missing'

In [None]:
def to_categorical(dataset):
    """Convert the category-level and item-condition columns to pandas ``category`` dtype.

    The frame is modified in place; nothing is returned.
    """
    for column in ('general_cat', 'subcat_1', 'subcat_2', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')

In [None]:
# Define helpers for text normalization
# Import the NLTK corpus reader under an alias: the next line rebinds the name
# `stopwords` to a dict, and without the alias re-running this cell would fail
# because the dict has no .words() method.
from nltk.corpus import stopwords as nltk_stopwords
# Dict keyed by stop word (the value is unused); normalize_text tests membership against it.
stopwords = {x: 1 for x in nltk_stopwords.words('english')}
# Matches any run of characters other than ASCII letters and digits.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

In [None]:
# get name and description lengths
def wordCount(text):
    """Count the space-separated tokens in ``text``.

    Args:
        text: The string to measure. Non-string values (e.g. NaN for a missing
            field) and the placeholder ``'No description yet'`` count as 0.

    Returns:
        ``len(text.split(" "))`` for ordinary strings, otherwise 0. Because the
        split is on single spaces, consecutive spaces produce empty tokens that
        are included in the count.
    """
    # Explicit type check instead of a bare `except`, which would also swallow
    # unrelated errors such as KeyboardInterrupt.
    if not isinstance(text, str) or text == 'No description yet':
        return 0
    return len(text.split(" "))

In [None]:
def normalize_text(text):
    """Lower-case ``text``, strip non-alphanumerics and drop short or stop-word tokens.

    Every run of characters matched by ``non_alphanums`` is replaced with a
    single space; the result is lower-cased, stripped and split on spaces.
    Tokens of length <= 1 and tokens present in ``stopwords`` are discarded,
    and the remaining tokens are joined with single spaces.
    """
    cleaned = non_alphanums.sub(' ', text).lower().strip()
    kept_tokens = []
    for token in cleaned.split(" "):
        if len(token) > 1 and token not in stopwords:
            kept_tokens.append(token)
    return u" ".join(kept_tokens)


In [None]:
def normalize_dataset_text(dataset):
    """Apply ``normalize_text`` to the description and brand columns in place."""
    for column in ('item_description', 'brand_name'):
        dataset[column] = dataset[column].apply(normalize_text)

In [None]:
def delete_unseen(dataset):
    """Overwrite brand/category values absent from the training vocabularies with ``'missing'``.

    Uses the module-level sets ``all_brand``, ``all_general_cat``,
    ``all_subcat_1`` and ``all_subcat_2``. The frame is modified in place.
    """
    known_values = (
        ('brand_name', all_brand),
        ('general_cat', all_general_cat),
        ('subcat_1', all_subcat_1),
        ('subcat_2', all_subcat_2),
    )
    for column, seen in known_values:
        is_unseen = ~dataset[column].isin(seen)
        dataset.loc[is_unseen, column] = 'missing'

In [None]:
def text_length_feature(dataset, train=True):
    """Add normalized ``desc_len`` and ``name_len`` columns to ``dataset`` in place.

    The lengths are computed with ``wordCount`` on ``item_description`` and
    ``name``, then passed together through the module-level ``desc_normalizer``.

    Args:
        dataset: DataFrame with ``item_description`` and ``name`` columns.
        train: If true, ``desc_normalizer.fit_transform`` is used; otherwise
            ``desc_normalizer.transform``.
    """
    length_columns = ['desc_len', 'name_len']
    dataset['desc_len'] = dataset['item_description'].apply(wordCount)
    dataset['name_len'] = dataset['name'].apply(wordCount)
    # The two branches of the original differed only in which normalizer method was called.
    scale = desc_normalizer.fit_transform if train else desc_normalizer.transform
    dataset[length_columns] = scale(dataset[length_columns])

In [None]:
# Wall-clock reference; later cells print elapsed seconds as time.time() - start_time.
start_time = time.time()
from time import gmtime, strftime
# Log the current timestamp (gmtime, i.e. UTC) at the start of processing.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

In [None]:
# Hold out 30% of the labelled rows as a dev set; random_state makes the split repeatable.
# NOTE(review): this rebinds train_df to its own 70% subset, so re-running this cell
# without reloading the data splits the already-reduced frame again.
train_df, dev_df = train_test_split(train_df, random_state=200, train_size=0.70)

In [None]:
# Remove training rows priced below 1.0 (dev rows are left untouched).
low_price_index = train_df.loc[train_df.price < 1.0].index
train_df = train_df.drop(low_price_index)
# Targets are modelled on the log1p(price) scale.
train_y = np.log1p(train_df["price"])
dev_y = np.log1p(dev_df["price"])

In [None]:
# Expand the '/'-delimited category path into three level columns, then drop the raw column.
category_levels = train_df['category_name'].apply(split_cat)
train_df['general_cat'], train_df['subcat_1'], train_df['subcat_2'] = zip(*category_levels)
train_df = train_df.drop('category_name', axis=1)
print('[{}] Split categories completed.'.format(time.time() - start_time))

# Fill missing categories, brands and descriptions with placeholders.
handle_missing_inplace(train_df)
print('[{}] Handle missing completed.'.format(time.time() - start_time))

# Keep only the most frequent brands / category values; the rest become 'missing'.
cutting(train_df)
print('[{}] Cut completed.'.format(time.time() - start_time))

# Switch the category-level and condition columns to categorical dtype.
to_categorical(train_df)
print('[{}] Convert categorical completed'.format(time.time() - start_time))




In [None]:
# Preview the first rows after category splitting, missing-value filling, cutting and dtype conversion.
train_df.head(5)

In [None]:
# Normalizer applied to the [desc_len, name_len] pair; text_length_feature reads it as a global.
# NOTE(review): scikit-learn's Normalizer rescales each row (sample) to unit norm rather than
# scaling each column — confirm this is the intended treatment for the length features.
desc_normalizer = Normalizer()
# NOTE(review): name_normalizer is not referenced anywhere else in the visible notebook.
name_normalizer = Normalizer()
# Default train=True: the normalizer is fitted on the training split here.
text_length_feature(train_df)
print('[{}] Calculate length features'.format(time.time() - start_time))

# Lengths are computed above on the raw text; normalization below rewrites
# item_description and brand_name in place.
normalize_dataset_text(train_df)
print('[{}] Normalization text'.format(time.time() - start_time))

In [None]:
## get all categorical in train and replace missing value
# Vocabularies of values present in the training split; delete_unseen() maps
# anything outside these sets to 'missing' for the dev and test frames.
all_brand = set(train_df["brand_name"].values)
all_general_cat = set(train_df["general_cat"].values)
all_subcat_1 = set(train_df["subcat_1"].values)
all_subcat_2 = set(train_df["subcat_2"].values)

# One LabelEncoder per categorical feature, fitted on the training split and
# reused later to encode the dev and test frames.
le_brand = LabelEncoder()
train_df['encoded_brand_name'] = le_brand.fit_transform(train_df['brand_name'])

le_general_cat = LabelEncoder()
train_df['encoded_general_cat'] = le_general_cat.fit_transform(train_df['general_cat'])

le_subcat_1 = LabelEncoder()
train_df['encoded_subcat_1'] = le_subcat_1.fit_transform(train_df['subcat_1'])

le_subcat_2 = LabelEncoder()
train_df['encoded_subcat_2'] = le_subcat_2.fit_transform(train_df['subcat_2'])

In [None]:
# Fit one Keras Tokenizer per text field on the training split only; the same
# fitted tokenizers are reused for the dev and test frames in later cells.
print("Tokenizing item description")
tok_desc = Tokenizer()
tok_desc.fit_on_texts(train_df["item_description"].values)

print("Tokenizing name")
tok_name = Tokenizer()
tok_name.fit_on_texts(train_df["name"].values)

# Replace each text with its list of integer token ids (variable length; padded in get_rnn_data).
print("Transforming text to sequences...")
train_df['seq_item_description'] = tok_desc.texts_to_sequences(train_df["item_description"].values)
train_df['seq_name'] = tok_name.texts_to_sequences(train_df["name"].values)


In [None]:
## padding max length
# Fixed sequence lengths passed as `maxlen` to pad_sequences in get_rnn_data and
# used as the Input shapes in rnn_model.
MAX_NAME_SEQ = 15 #17
MAX_ITEM_DESC_SEQ = 50 #269

## embedding max length
# Despite the heading, these are vocabulary sizes used as the first argument
# (input dimension) of the Embedding layers in rnn_model.
# NOTE(review): the +1 presumably accounts for Tokenizer word indices starting at 1
# with 0 reserved for padding — confirm against the Keras Tokenizer docs.
MAX_DESC_TEXT = len(tok_desc.word_index) + 1
MAX_NAME_TEXT = len(tok_name.word_index) + 1
# LabelEncoder classes: one embedding row per encoded class.
MAX_BRAND = len(le_brand.classes_)
MAX_GENCAT = len(le_general_cat.classes_)
MAX_SUBCAT_1 = len(le_subcat_1.classes_)
MAX_SUBCAT_2 = len(le_subcat_2.classes_)
# Condition ids are fed to the embedding unencoded, so size it as largest id + 1.
MAX_CONDITION = max(train_df.item_condition_id) + 1

In [None]:
def get_rnn_data(dataset):
    """Assemble the dict of named input arrays expected by the RNN model.

    Token sequences are padded to ``MAX_NAME_SEQ`` / ``MAX_ITEM_DESC_SEQ``;
    the remaining columns are converted to NumPy arrays. Keys match the
    ``Input`` layer names used in ``rnn_model``.
    """
    features = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ),
        'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ),
    }
    # Single-bracket selection -> 1-D arrays of ids.
    for key, column in (('brand_name', 'encoded_brand_name'),
                        ('item_condition', 'item_condition_id')):
        features[key] = np.array(dataset[column])
    # Double-bracket selection -> 2-D arrays with one column.
    for key, column in (('num_vars', 'shipping'),
                        ('desc_len', 'desc_len'),
                        ('name_len', 'name_len')):
        features[key] = np.array(dataset[[column]])
    for key, column in (('general_cat', 'encoded_general_cat'),
                        ('subcat_1', 'encoded_subcat_1'),
                        ('subcat_2', 'encoded_subcat_2')):
        features[key] = np.array(dataset[column])
    return features

In [None]:
# Build the dict of model input arrays for the training split.
train_X = get_rnn_data(train_df)

In [None]:
## RNN Model
# Seeds NumPy's global RNG only.
# NOTE(review): the Keras backend may keep its own random state, so weight
# initialisation can still differ between runs — confirm if reproducibility matters.
np.random.seed(123)

def rnn_model(lr=0.001, decay=0.0):
    """Build and compile the price-regression network.

    Architecture, as defined below:
      * ``name`` and ``item_desc`` token sequences are embedded and each fed
        through its own GRU (8 and 16 units respectively).
      * ``brand_name``, the three category levels and ``item_condition`` are
        embedded and flattened.
      * Those are concatenated with the raw ``num_vars``, ``desc_len`` and
        ``name_len`` inputs, then passed through Dense(512) and Dense(96)
        ReLU layers (each followed by 5% dropout) to a single linear output.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        decay: Learning-rate decay passed to the Adam optimizer.

    Returns:
        A Keras ``Model`` compiled with mean-squared-error loss.
    """
    # Inputs
    # Input names must match the keys of the dict returned by get_rnn_data.
    name = Input(shape=[MAX_NAME_SEQ], name="name")
    item_desc = Input(shape=[MAX_ITEM_DESC_SEQ], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    general_cat = Input(shape=[1], name="general_cat")
    subcat_1 = Input(shape=[1], name="subcat_1")
    subcat_2 = Input(shape=[1], name="subcat_2")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[1], name="num_vars")
    desc_len = Input(shape=[1], name="desc_len")
    name_len = Input(shape=[1], name="name_len")

    # Embeddings layers (adjust outputs to help model)
    # First argument is the vocabulary size, second the embedding width.
    emb_name = Embedding(MAX_NAME_TEXT, 30)(name)
    emb_item_desc = Dropout(0.05) (Embedding(MAX_DESC_TEXT, 60)(item_desc))
    emb_brand_name = Embedding(MAX_BRAND, 20)(brand_name)
    emb_general_cat = Embedding(MAX_GENCAT, 5)(general_cat)
    emb_subcat_1 = Embedding(MAX_SUBCAT_1, 10)(subcat_1)
    emb_subcat_2 = Embedding(MAX_SUBCAT_2, 15)(subcat_2)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
#     emb_shipping = Embedding(2, 5)(num_vars)
    

    # rnn layers (GRUs are faster than LSTMs and speed is important here)
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_name)
    # main layers
    # Length-1 embeddings are flattened so they can be concatenated with the
    # GRU outputs and the raw numeric inputs.
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_item_condition)
        , Flatten() (emb_general_cat)
        , Flatten() (emb_subcat_1)
        , Flatten() (emb_subcat_2)
#         , Flatten() (emb_shipping)
        , num_vars
        , rnn_layer1
        , rnn_layer2
        , desc_len
        , name_len
    ])
    # (increasing the nodes or adding layers does not affect the time quite as much as the rnn layers)
    main_l = Dropout(0.05)(Dense(512,kernel_initializer='normal',activation='relu') (main_l))
    main_l = Dropout(0.05)(Dense(96,kernel_initializer='normal',activation='relu') (main_l))

    # the output layer.
    # Linear activation: the network regresses the log1p(price) target directly.
    output = Dense(1, activation="linear") (main_l)
    
    model = Model([name, item_desc, brand_name,
                   general_cat, subcat_1, subcat_2,
                   item_condition, num_vars, desc_len, name_len], output)

    # NOTE(review): `lr` and `decay` are the older Keras argument names; newer releases
    # expect `learning_rate` — confirm against the installed Keras version.
    optimizer = Adam(lr=lr, decay=decay)
    # (mean squared error loss function works as well as custom functions)  
    model.compile(loss = 'mse', optimizer = optimizer)

    return model

In [None]:
# Set hyper parameters for the model.
BATCH_SIZE = 512 * 4
epochs = 3

# Calculate learning rate decay.
def exp_decay(init, fin, steps):
    """Return the decay value (init/fin)**(1/(steps-1)) - 1 for the given schedule endpoints."""
    ratio = init / fin
    return ratio ** (1 / (steps - 1)) - 1

# Total number of optimizer steps: full batches per epoch times the number of epochs.
steps = int(len(train_X['name']) / BATCH_SIZE) * epochs
lr_init, lr_fin = 0.007, 0.002
lr_decay = exp_decay(lr_init, lr_fin, steps)


In [None]:
# Instantiate the network with the initial learning rate and decay computed in the previous cell.
model = rnn_model(lr=lr_init, decay=lr_decay)

In [None]:
# Render the layer graph (with tensor shapes and layer names) to model_plot.png.
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Train on the whole training split; no validation data is passed here — the dev split
# is scored separately in a later cell.
model.fit(train_X, train_y, epochs=epochs, batch_size=BATCH_SIZE, verbose=2)

In [None]:
# Run the held-out dev split through the same preprocessing steps as the training data.
dev_df['general_cat'], dev_df['subcat_1'], dev_df['subcat_2'] = \
        zip(*dev_df['category_name'].apply(lambda x: split_cat(x)))
#dev_df.drop('category_name', axis=1, inplace=True)
handle_missing_inplace(dev_df)
# NOTE(review): cutting() ranks values by their frequency within dev_df itself, not by the
# training frequencies; delete_unseen() below is what aligns values with the training vocabulary.
cutting(dev_df)
# train=False: only transform with the existing normalizer — evaluation data must not go
# through the fitting branch (the default train=True calls fit_transform).
text_length_feature(dev_df, train=False)
normalize_dataset_text(dev_df)
# Map brands/categories the encoders never saw to 'missing' so LabelEncoder.transform cannot fail.
delete_unseen(dev_df)
to_categorical(dev_df)

In [None]:
 ## RNN
# Encode the categorical columns with the encoders fitted on the training split.
for column, encoder in (('brand_name', le_brand),
                        ('general_cat', le_general_cat),
                        ('subcat_1', le_subcat_1),
                        ('subcat_2', le_subcat_2)):
    dev_df['encoded_' + column] = encoder.transform(dev_df[column])

# Tokenize the text columns with the tokenizers fitted on the training split.
dev_df['seq_item_description'] = tok_desc.texts_to_sequences(dev_df["item_description"].values)
dev_df['seq_name'] = tok_name.texts_to_sequences(dev_df["name"].values)

dev_X = get_rnn_data(dev_df)
preds_rnn = model.predict(dev_X)
# Targets and predictions are both on the log1p scale; convert back to prices before scoring.
dev_prices = np.expm1(dev_y)
dev_pred_prices = np.expm1(preds_rnn.flatten())
print("RNN dev RMSLE:", rmsle(dev_prices, dev_pred_prices))

In [None]:
# Run the test set through the same preprocessing steps as the training data.
test_df['general_cat'], test_df['subcat_1'], test_df['subcat_2'] = \
        zip(*test_df['category_name'].apply(lambda x: split_cat(x)))
test_df.drop('category_name', axis=1, inplace=True)
handle_missing_inplace(test_df)
# NOTE(review): cutting() ranks values by their frequency within test_df itself, not by the
# training frequencies; delete_unseen() below is what aligns values with the training vocabulary.
cutting(test_df)
# train=False: only transform with the existing normalizer — test data must not go
# through the fitting branch (the default train=True calls fit_transform).
text_length_feature(test_df, train=False)
normalize_dataset_text(test_df)
# Map brands/categories the encoders never saw to 'missing' so LabelEncoder.transform cannot fail.
delete_unseen(test_df)
to_categorical(test_df)

In [None]:
 ## RNN
# Encode the categorical columns with the encoders fitted on the training split.
for column, encoder in (('brand_name', le_brand),
                        ('general_cat', le_general_cat),
                        ('subcat_1', le_subcat_1),
                        ('subcat_2', le_subcat_2)):
    test_df['encoded_' + column] = encoder.transform(test_df[column])

# Tokenize the text columns with the tokenizers fitted on the training split.
test_df['seq_item_description'] = tok_desc.texts_to_sequences(test_df["item_description"].values)
test_df['seq_name'] = tok_name.texts_to_sequences(test_df["name"].values)

test_X = get_rnn_data(test_df)
preds_rnn_test = model.predict(test_X)

In [None]:
# The model was trained on log1p(price); expm1 maps the flattened predictions back to prices.
preds_rnn_test = np.expm1(preds_rnn_test.flatten())

In [None]:
# Copy the id column into an independent frame before adding "price": assigning a new
# column to a selection of test_df triggers pandas' SettingWithCopyWarning.
submission = test_df[["test_id"]].copy()
submission["price"] = preds_rnn_test

In [None]:
# Write the submission file to the working directory without the DataFrame index column.
submission.to_csv("submission.csv", index=False)