In [None]:
from multiprocessing.pool import ThreadPool

import re
import numpy as np
import pandas as pd

from scipy import sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Dense, Conv1D, GlobalMaxPool1D, GlobalAvgPool1D, Concatenate,\
Embedding, Lambda, Dropout, BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import optimizers, activations
import tensorflow as tf

In [None]:
# Vocabulary caps handed to CountVectorizer(max_features=...) for the
# name and description bag-of-words features.
MAX_FEATURES_NAME, MAX_FEATURES_DESC = 75000, 125000

# Keras Tokenizer vocabulary sizes; the same numbers size the Embedding layers.
MAX_TEXT_NAME, MAX_TEXT_DESC = 40000, 80000

# Token sequences are padded/truncated to these lengths by pad_sequences.
MAX_SEQ_NAME, MAX_SEQ_DESC = 20, 64

# Minimum document frequencies (not referenced by the code visible in this notebook).
MIN_DF_NAME, MIN_DF_DESC = 2, 80

# Arguments forwarded to model.fit.
EPOCHS = 3
BATCH_SIZE = 4096  # 512 * 8
VERBOSE = 1
VALID_SPLIT = 0.0

# Trained models are appended here and averaged at prediction time.
models = list()
# Worker pool on which the `thread` helpers below are executed.
pool = ThreadPool(4)

In [None]:
# For features
# Compiled once at import time: reformat_name runs once per row as a
# read_table converter, so the patterns should not be rebuilt on every call.
_NAME_REPLACE = (
    # Anything that is not a lowercase letter, digit, '.' or '"' becomes a space.
    (re.compile(r"[^a-z0-9\.\"]"), r" "),
    # Every run of letters is padded with spaces, detaching it from adjacent digits.
    (re.compile(r"([a-z]+)"), r" \1 "),
)


def reformat_name(text):
    """Lower-case ``text`` and build a whitespace-separated variant of it.

    Parameters
    ----------
    text : str
        Raw text of one cell.

    Returns
    -------
    tuple of (str, str)
        ``(lowered, spaced)``: ``lowered`` is ``text.lower()``; ``spaced`` is
        ``" " + lowered`` after applying the ``_NAME_REPLACE`` substitutions
        in order.
    """
    text = text.lower()
    text_2 = " " + text
    for regexp, substitution in _NAME_REPLACE:
        text_2 = regexp.sub(substitution, text_2)

    return text, text_2

def reformat_desc(text):
    """Return only the second element of ``reformat_name(text)``."""
    _, spaced = reformat_name(text)
    return spaced

def split_cat(text):
    """Split a ``"a/b/c"`` category path on ``"/"``.

    Parameters
    ----------
    text : str or any
        Category path. Values without a ``split`` method (e.g. ``None`` or a
        float NaN) are treated as missing.

    Returns
    -------
    list of str or tuple of str
        ``text.split("/")`` for strings, otherwise the fallback
        ``("Other", "Other", "Other")``.
    """
    try:
        return text.split("/")
    except AttributeError:
        # Only "not a string" is an expected failure here; a bare except would
        # also swallow KeyboardInterrupt/SystemExit.
        return ("Other", "Other", "Other")

def handel_df_inplace(df):
    """Derive the model's feature columns on ``df``, mutating it in place.

    Reads the columns ``name``, ``item_condition_id``, ``category_name``,
    ``brand_name``, ``shipping`` and ``item_description``. ``name`` and
    ``category_name`` are indexed element-wise with ``.str.get`` and then
    dropped; ``item_condition_id`` and ``shipping`` must already be
    categorical because ``.cat`` is used on them.

    Columns written: ``name_1``, ``name_2``, ``gencat_name``,
    ``subcat1_name``, ``subcat2_name``, ``desc_len`` (new) and
    ``item_condition_id``, ``brand_name``, ``shipping``,
    ``item_description`` (overwritten).

    Returns
    -------
    None
    """
    def _placeholder_to_missing(desc):
        # The read_table converter (reformat_desc) pads words with extra
        # spaces, so an exact match on 'no description yet' can never succeed
        # on converted text. Compare on whitespace-normalised text instead.
        if ' '.join(desc.split()) == 'no description yet':
            return 'missing'
        return desc

    df['name_1'] = df['name'].str.get(0).fillna('missing')
    df['name_2'] = df['name'].str.get(1).fillna('missing')
    df.drop('name', axis=1, inplace=True)
    df['item_condition_id'] = df['item_condition_id'].cat.add_categories(['missing']).fillna('missing')
    # An empty first path element is mapped to 'Other'.
    df['gencat_name'] = df['category_name'].str.get(0).replace('', 'Other').astype('category')
    df['subcat1_name'] = df['category_name'].str.get(1).fillna('Other').astype('category')
    df['subcat2_name'] = df['category_name'].str.get(2).fillna('Other').astype('category')
    df.drop('category_name', axis=1, inplace=True)
    df['brand_name'] = df['brand_name'].fillna('missing').astype('category')
    df['shipping'] = df['shipping'].cat.add_categories(['missing']).fillna('missing')

    description = df['item_description'].fillna('missing').apply(_placeholder_to_missing)
    df['item_description'] = description
    # Word count of the description; descriptions flagged 'missing' count as 0.
    df['desc_len'] = description.apply(lambda x: 0 if x == 'missing' else len(x.split())).astype('category')

    return None

# For models
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Both inputs are passed through ``np.log1p`` before the squared
    differences are averaged and square-rooted.
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared_gap = np.mean(log_gap ** 2)
    return mean_squared_gap ** 0.5

def pred_models(models, X, batch_size=2048, target_to_price_func=np.expm1):
    """Average the predictions of ``models`` on ``X`` and map them to prices.

    Each model's ``predict(X, batch_size=batch_size)`` output is summed and
    divided by the number of models; ``target_to_price_func`` is then applied
    to the averaged array (``np.expm1`` by default).
    """
    per_model = []
    for estimator in models:
        per_model.append(estimator.predict(X, batch_size=batch_size))

    averaged = sum(per_model) / len(per_model)
    return target_to_price_func(averaged)

def eval_models(models, X_valid, y_true, batch_size=2048, target_to_price_func=np.expm1):
    """Score the averaged ensemble prediction on ``X_valid`` with ``rmsle``.

    The first column of ``pred_models(...)`` is compared against ``y_true``.
    """
    prices = pred_models(
        models,
        X_valid,
        batch_size=batch_size,
        target_to_price_func=target_to_price_func,
    )
    first_column = prices[:, 0]

    return rmsle(y_true, first_column)

def exp_decay(init, fin, steps):
    """Return the rate ``r`` with ``(1 + r) ** (steps - 1) == init / fin``.

    Parameters
    ----------
    init, fin : float
        Starting and final values; ``fin`` must be non-zero.
    steps : int
        Number of steps the change is spread over.

    Returns
    -------
    float
        ``(init / fin) ** (1 / (steps - 1)) - 1``, or ``0.0`` when
        ``steps <= 1`` (there is no interval to spread a decay over, and
        ``steps == 1`` would otherwise divide by zero).
    """
    if steps <= 1:
        return 0.0
    return (init/fin)**(1/(steps-1)) - 1

def get_model_1(output_func=None):
    """Build and compile the dense network fed by the sparse ``vec`` input.

    The input width is read from the global ``X_train["vec"]``. The network is
    Dense(128, ELU) -> Dropout(0.1) -> Dense(64, ELU) -> Dense(64, linear),
    followed by a Lambda layer that reduces the 64 units to one output.

    Parameters
    ----------
    output_func : callable, optional
        Function wrapped in the final ``Lambda`` layer. When ``None``, the
        mean of ``tanh`` over axis 1 (kept as a column) times 4 plus 4 is used.

    Returns
    -------
    keras.models.Model
        Model compiled with MSE loss and Adam(beta_1=0.9, beta_2=0.9).
    """
    def elu_half(x):
        # ELU with alpha=0.5, shared by both hidden layers. The second layer
        # previously called activations.elu(x, ...) directly, with `x` undefined.
        return activations.elu(x, alpha=.5)

    vec = Input(shape=[X_train["vec"].shape[1]], name="vec", sparse=True)

    h_layer = Dropout(0.1) (Dense(128, activation=elu_half) (vec))
    h_layer = Dense(64, activation=elu_half) (h_layer)
    h_layer = Dense(64, activation=activations.linear) (h_layer)

    if output_func is None:
        output = Lambda(lambda x: K.mean(K.tanh(x), axis=1, keepdims=True)*4+4) (h_layer)
    else:
        output = Lambda(output_func) (h_layer)

    model = Model([vec], output)
    # `Adam` is the canonical class name; the lowercase alias is not available
    # in every Keras release.
    model.compile(loss="mse",  optimizer=optimizers.Adam(beta_1=0.9, beta_2=0.9))

    return model

def get_model_2(output_func=None):
    """Build and compile the three-input network (name, description, sub_vec).

    Input widths are read from the global ``X_train`` entries ``"name"``,
    ``"item_desc"`` and ``"sub_vec"``; the input layers carry the same names
    so a dict of arrays keyed that way can be fed to the model.

    Branches:
      * name: Embedding(MAX_TEXT_NAME, 96) -> global max pooling
      * item_desc: Embedding(MAX_TEXT_DESC, 64) -> Conv1D(3, 1) -> global average pooling
      * sub_vec (sparse): Dense(32, ELU) -> Dropout(0.1)
    The branches are concatenated and passed through Dense(64, ELU) and
    Dense(64) before the output Lambda layer.

    Parameters
    ----------
    output_func : callable, optional
        Function wrapped in the final ``Lambda`` layer. When ``None``, the
        mean of ``tanh`` over axis 1 (kept as a column) times 4 plus 4 is used.

    Returns
    -------
    keras.models.Model
        Model compiled with MSE loss and Adam(beta_1=0.9, beta_2=0.9).
    """
    def elu_half(x):
        # ELU with alpha=0.5.
        return activations.elu(x, alpha=.5)

    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    # Named "sub_vec" (not "vec") so it matches both the declared width and
    # the key under which the reduced matrix is stored in X_train.
    sub_vec = Input(shape=[X_train["sub_vec"].shape[1]], name="sub_vec", sparse=True)

    name_layer = Embedding(MAX_TEXT_NAME, 96) (name)
    name_layer = GlobalMaxPool1D() (name_layer)

    item_desc_layer = Embedding(MAX_TEXT_DESC, 64) (item_desc)
    item_desc_layer = Conv1D(3, 1) (item_desc_layer)
    item_desc_layer = GlobalAvgPool1D() (item_desc_layer)

    vec_layer = Dropout(.1) (Dense(32, activation=elu_half) (sub_vec))

    h_layer = Concatenate() ([
        name_layer,
        item_desc_layer,
        vec_layer
    ])

    h_layer = Dense(64, activation=elu_half) (h_layer)
    h_layer = Dense(64) (h_layer)

    if output_func is None:
        output = Lambda(lambda x: K.mean(K.tanh(x), axis=1, keepdims=True)*4+4) (h_layer)
    else:
        output = Lambda(output_func) (h_layer)

    # The third input is `sub_vec`; the previous `vec` was an undefined name here.
    model = Model([name, item_desc, sub_vec], output)
    model.compile(loss="mse",  optimizer=optimizers.Adam(beta_1=0.9, beta_2=0.9))

    return model

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one column of a DataFrame.

    Parameters
    ----------
    field : str
        Name of the column to select.
    dtype : str, optional
        When equal to ``'category'`` the column's integer category codes are
        returned as a 2-D column array; otherwise the column is returned
        unchanged.
    """

    def __init__(self, field, dtype=None):
        self.field = field
        self.dtype = dtype

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, dataframe):
        """Return the selected column (or its category codes, shape ``(n, 1)``)."""
        column = dataframe[self.field]
        if self.dtype == 'category':
            # Index the underlying ndarray: multi-dimensional indexing on a
            # Series (`codes[:, None]`) is rejected by current pandas.
            return column.cat.codes.values[:, None]
        return column

In [None]:
def thread():
    """Load train/test TSVs, split off a dev set and build the merged frame.

    Both files are read with the same dtypes and converters (``split_cat``,
    ``reformat_name``, ``reformat_desc``). 1% of the training rows are held
    out as dev data; training rows with ``price <= 0`` are dropped. Test
    brands not seen among the training brand categories are replaced with
    ``'missing'``. The three parts are concatenated in the order train, dev,
    test and passed through ``handel_df_inplace``.

    Returns
    -------
    tuple
        ``(merge, n_trains, n_devs)``: the merged frame and the row counts of
        its leading train and dev parts.
    """
    read_opts = dict(
        engine='c',
        dtype={'item_condition_id': 'category', 'shipping': 'category'},
        converters={'category_name': split_cat, 'name': reformat_name,
                    'item_description': reformat_desc},
    )
    train = pd.read_table('../input/train.tsv', **read_opts)
    test_ = pd.read_table('../input/test.tsv', **read_opts)

    train_df, dev_df = train_test_split(train, random_state=123, train_size=0.99)
    # Explicit copy: a column is assigned on the filtered frame right below.
    train_df = train_df.loc[train_df.price > 0].copy()
    train_df['brand_name'] = train_df['brand_name'].fillna('missing').astype('category')

    # Look the known brands up once instead of once per test row.
    known_brands = train_df['brand_name'].cat.categories
    test_brand = test_['brand_name']
    test_['brand_name'] = test_brand.where(test_brand.isin(known_brands), 'missing').astype('category')

    n_trains, n_devs = train_df.shape[0], dev_df.shape[0]
    merge = pd.concat([train_df, dev_df, test_])
    handel_df_inplace(merge)

    return merge, n_trains, n_devs

merge, n_trains, n_devs = pool.apply_async(thread).get()

def thread():
    """Vectorise ``merge`` into one sparse matrix and split it by data part.

    The FeatureUnion stacks, in this column order: name_1 token counts, TF-IDF
    weighted description token counts, then one-hot encodings of
    item_condition_id, shipping, gencat_name, subcat1_name, subcat2_name,
    brand_name and desc_len.

    Returns
    -------
    tuple
        ``(train_vec, dev_vec, test_vec, n_text_cols)`` where the first three
        are row slices of the stacked matrix and ``n_text_cols`` is the number
        of leading columns produced by the two text pipelines.
    """
    def one_hot(field, **ohe_kwargs):
        # (name, pipeline) entry: category codes of `field` -> OneHotEncoder.
        return (field, Pipeline([
            ('selector', ItemSelector(field=field, dtype='category')),
            ('ohe', OneHotEncoder(**ohe_kwargs))
        ]))

    union = FeatureUnion([
        ('name_1', Pipeline([
            ('selector', ItemSelector(field='name_1')),
            ('cv', CountVectorizer(max_features=MAX_FEATURES_NAME)),
        ])),
        ('item_description', Pipeline([
            ('selector', ItemSelector(field='item_description')),
            ('cv', CountVectorizer(max_features=MAX_FEATURES_DESC)),
            # TfidfTransformer takes no `max_features`; the vocabulary is
            # already capped by the CountVectorizer in front of it.
            ('tfidf', TfidfTransformer())
        ])),
        one_hot('item_condition_id'),
        one_hot('shipping'),
        one_hot('gencat_name'),
        one_hot('subcat1_name'),
        one_hot('subcat2_name'),
        one_hot('brand_name'),
        one_hot('desc_len', handle_unknown='ignore', dtype=np.int8),
    ], n_jobs=-1)
    vec = union.fit_transform(merge)

    # Width of the text part, taken from the fitted vocabularies rather than
    # from the MAX_FEATURES_* caps (a vocabulary may be smaller than its cap).
    fitted = dict(union.transformer_list)
    n_text_cols = sum(len(fitted[key].named_steps['cv'].vocabulary_)
                      for key in ('name_1', 'item_description'))

    train_vec = vec[:n_trains]
    dev_vec = vec[n_trains:n_trains+n_devs]
    test_vec = vec[n_trains+n_devs:]

    return train_vec, dev_vec, test_vec, n_text_cols

train_vec, dev_vec, test_vec, n_text_cols = pool.apply_async(thread).get()
# sub_vec keeps every row and drops the leading text columns, leaving the
# one-hot block. (The previous slices cut rows, not columns.)
train_sub_vec = train_vec[:, n_text_cols:]
dev_sub_vec = dev_vec[:, n_text_cols:]
test_sub_vec = test_vec[:, n_text_cols:]

# Empty frames that share the index of the train / dev / test slices of `merge`;
# the tokenised sequences are attached to them later.
dtrain = pd.DataFrame(index=merge.iloc[:n_trains].index)
# Training target: log1p of the price.
dtrain['target'] = np.log1p(merge.iloc[:n_trains]['price'])

ddev = pd.DataFrame(index=merge.iloc[n_trains:n_trains + n_devs].index)
# Dev prices are kept on the original scale.
ddev['price'] = merge.iloc[n_trains:n_trains + n_devs]['price']

dtest = pd.DataFrame(index=merge.iloc[n_trains + n_devs:].index)

def thread():
    """Fit the name/description tokenizers and store integer sequences.

    Both tokenizers are fitted on the full ``merge`` frame (``name_2`` and
    ``item_description``). The resulting sequences are written to the
    ``seq_name`` and ``seq_item_description`` columns of ``dtrain``, ``ddev``
    and ``dtest``.
    """
    name_tokenizer = Tokenizer(num_words=MAX_TEXT_NAME, filters='', lower=False)
    name_tokenizer.fit_on_texts(merge.name_2)
    desc_tokenizer = Tokenizer(num_words=MAX_TEXT_DESC, filters='', lower=False)
    desc_tokenizer.fit_on_texts(merge.item_description)

    dev_end = n_trains + n_devs
    parts = (
        (dtrain, merge[:n_trains]),
        (ddev, merge[n_trains:dev_end]),
        (dtest, merge[dev_end:]),
    )
    for frame, rows in parts:
        frame["seq_name"] = name_tokenizer.texts_to_sequences(rows.name_2)
        frame["seq_item_description"] = desc_tokenizer.texts_to_sequences(rows.item_description)

    return None

pool.apply_async(thread).get()

def _model_inputs(frame, vec, sub_vec):
    """Assemble the input dict shared by both models for one data part.

    ``frame`` supplies the ``seq_name`` / ``seq_item_description`` columns,
    which are padded to MAX_SEQ_NAME / MAX_SEQ_DESC; ``vec`` and ``sub_vec``
    are stored as given.
    """
    return {
        'name': pad_sequences(frame['seq_name'], maxlen=MAX_SEQ_NAME),
        'item_desc': pad_sequences(frame['seq_item_description'], maxlen=MAX_SEQ_DESC),
        'vec': vec,
        'sub_vec': sub_vec,
    }


X_train = _model_inputs(dtrain, train_vec, train_sub_vec)
X_dev = _model_inputs(ddev, dev_vec, dev_sub_vec)
X_test = _model_inputs(dtest, test_vec, test_sub_vec)

# Models

In [None]:
# Train one get_model_1 network per (initial lr, final lr, output function)
# triple and add each successfully trained model to the ensemble.
for lr_i, lr_f, func in [(0.025, 0.005, lambda x: K.mean(K.sigmoid(x), axis=1, keepdims=True)*8),
                         (0.025, 0.005, lambda x: K.mean(x/(1+K.abs(x)), axis=1, keepdims=True)*4+4),
                         (0.02, 0.002, lambda x: K.mean(x/(1+x**2)**0.5, axis=1, keepdims=True)*4+4),
                         (0.017, 0.0017, lambda x: K.mean(K.tanh(x), axis=1, keepdims=True)*4+4)]:
    # Number of optimizer steps = batches per epoch * epochs. X_train has no
    # 'vec_2' entry; the row count is taken from 'vec'.
    lr_decay = exp_decay(lr_i, lr_f, steps=int(X_train['vec'].shape[0]/BATCH_SIZE)*EPOCHS)

    model = get_model_1(func)
    model.optimizer = optimizers.Adam(beta_1=0.9, beta_2=0.9)
    # Start from the initial rate, as the second training loop does.
    model.optimizer.lr = lr_i
    # NOTE(review): some Keras releases only apply a decay passed to the
    # optimizer constructor -- confirm this assignment takes effect.
    model.optimizer.decay = lr_decay

    try:
        model.fit(X_train, dtrain.target,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  validation_split=VALID_SPLIT,
                  verbose=VERBOSE
                 )
        print('[RMSLE] {}'.format(eval_models([model], X_dev, ddev.price, batch_size=BATCH_SIZE)))
        models.append(model)
        print('[ENS RMSLE] {}'.format(eval_models(models, X_dev, ddev.price, batch_size=BATCH_SIZE)))
    except Exception as exc:
        # Best effort: keep training the remaining configurations, but report
        # what went wrong instead of hiding it.
        print('THIS IS STRANGE !!! {!r}'.format(exc))

In [None]:
# Train one get_model_2 network per (initial lr, final lr, output function)
# triple and add each successfully trained model to the ensemble.
for lr_i, lr_f, func in [(0.009, 0.001, lambda x: K.mean(x/(1+K.abs(x)), axis=1, keepdims=True)*4+4),
                         (0.009, 0.001, lambda x: K.mean(x/(1+x**2)**0.5, axis=1, keepdims=True)*4+4),
                         (0.009, 0.001, lambda x: K.mean(K.tanh(x), axis=1, keepdims=True)*4+4)]:
    # Number of optimizer steps = batches per epoch * epochs. X_train has no
    # 'vec_2' entry; the row count is taken from 'vec'.
    lr_decay = exp_decay(lr_i, lr_f, steps=int(X_train['vec'].shape[0]/BATCH_SIZE)*EPOCHS)

    model = get_model_2(func)
    model.optimizer = optimizers.Adam(beta_1=0.9, beta_2=0.9)
    model.optimizer.lr = lr_i
    # NOTE(review): some Keras releases only apply a decay passed to the
    # optimizer constructor -- confirm this assignment takes effect.
    model.optimizer.decay = lr_decay

    try:
        model.fit(X_train, dtrain.target,
                  epochs=EPOCHS,
                  batch_size=BATCH_SIZE,
                  validation_split=VALID_SPLIT,
                  verbose=VERBOSE
                 )
        print('[RMSLE] {}'.format(eval_models([model], X_dev, ddev.price, batch_size=BATCH_SIZE)))
        models.append(model)
        print('[ENS RMSLE] {}'.format(eval_models(models, X_dev, ddev.price, batch_size=BATCH_SIZE)))
    except Exception as exc:
        # Best effort: keep training the remaining configurations, but report
        # what went wrong instead of hiding it.
        print('THIS IS STRANGE !!! {!r}'.format(exc))

# Submission

In [None]:
# The test rows are everything after the train and dev parts of `merge`.
submission: pd.DataFrame = (
    merge.iloc[n_trains + n_devs:]
    .loc[:, ['test_id']]
    .astype(np.int64)
)
# Averaged prediction of every model collected in `models`.
submission['price'] = pred_models(models, X_test)
submission.to_csv("submission_hope_26.csv", index=False)