In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, CuDNNGRU, Dropout, SpatialDropout1D, CuDNNLSTM, Input, concatenate
from keras.models import Model
import keras.backend as K

import numpy as np
import pandas as pd

import gc 
from avito_functions import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def preprocessing(df_train, df_test, map_dict, add_features=None):
    """Stack train and test rows into one feature frame and split off the target.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``deal_probability`` column.
    df_test : pandas.DataFrame
        Test rows; they are stacked below the training rows.
    map_dict : dict
        Lookup tables: ``map_dict['salaries']`` is mapped over the ``region``
        column and ``map_dict['population']`` over the ``city`` column.
    add_features : pandas.DataFrame, optional
        Extra columns concatenated column-wise onto the stacked frame.
        NOTE(review): the concat aligns on index labels, so this presumably
        has to carry the same 0..n-1 index as the stacked frame -- confirm.

    Returns
    -------
    X : pandas.DataFrame
        Train rows followed by test rows, re-indexed 0..n-1, without the
        target and without the ``title``, ``item_id`` and ``user_id`` columns.
    y : numpy.ndarray
        Values of the ``deal_probability`` column of ``df_train``.
    category_features : list of str
        Names of the columns treated as categorical.
    """
    print('run preprocessing..')

    target = 'deal_probability'

    # Split off the labels, then stack train on top of test.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    y = df_train[target].values.squeeze()
    X = pd.concat([df_train.drop(columns=[target]), df_test])
    X.index = np.arange(X.shape[0])

    # Map additional information onto the rows.
    X['salaries'] = X['region'].map(map_dict['salaries'])
    X['population'] = X['city'].map(map_dict['population'])

    # Merge additional features (axis must be passed by keyword in pandas 2.x).
    if add_features is not None:
        X = pd.concat([X, add_features], axis=1)

    # Drop columns that are not used as features.
    X = X.drop(columns=['title', 'item_id', 'user_id'])

    category_features = ['region', 'city',
                         'parent_category_name', 'category_name',
                         'param_1', 'param_2', 'param_3',
                         'user_type', 'image_top_1']

    return X, y, category_features


# Numeric features: load the raw tables, engineer features, split, target-encode,
# impute and scale them for every data split.
# NOTE(review): pickle, load_fe, feature_engineering, validation_split,
# target_encoding, num_fillna and num_scaling are not defined or imported by name
# in this notebook -- presumably they come from `from avito_functions import *`.

# Split names, in the same order as the frames zipped with them at the end of the cell.
data_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']

print('Load df')
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

print('Load agg input')
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../input/map_dict.pkl', 'rb') as file:
    map_dict = pickle.load(file)
with open('../input/text_num_features_lemm.pkl', 'rb') as f:
    X_text_num = pickle.load(f)
sgd = load_fe('sgd2')
ext = load_fe('extra')

n_train = df_train.shape[0]
add_features = X_text_num
X, y, category_features = preprocessing(df_train, df_test, map_dict, add_features)
X, category_features = feature_engineering(X, category_features, factorize=True)

# Rows were stacked train-first, so the first n_train rows are the training set.
X, X_test = X[:n_train], X[n_train:]

(x_train, x_valid, x_holdout,
 y_train, y_valid, y_holdout,
 _, _, _) = validation_split(X, y)

# One target-encoding group per categorical feature, plus a few feature pairs.
te_groups = [[feature] for feature in category_features]
te_groups += [['price_log_cut', 'category_name'],
              ['price_log_cut', 'region'],
              ['price_log_cut', 'param_1'],
              ['region', 'parent_category_name']
             ]

# Each group is encoded twice: once for the train/valid/holdout split and once
# for the full-train/test pair.
for group in te_groups:
    x_train, x_valid, x_holdout = target_encoding(x_train, y_train, x_valid, group, x_holdout)
    X, X_test = target_encoding(X, y, X_test, group)

# Drop the raw categorical columns. This is done in place so the five names keep
# referring to the modified frames; `columns=` replaces the positional axis
# argument, which pandas 2.x no longer accepts. The print checks that every
# frame ends up with the same column layout as x_train.
for x in [x_train, x_valid, x_holdout, X, X_test]:
    x.drop(columns=category_features, inplace=True)
    print(x.shape, all(x.columns == x_train.columns))

## impute
print('impute numeric')
x_train, x_valid, x_holdout, _ = num_fillna(x_train, x_valid, x_holdout)
X, X_test, _ = num_fillna(X, X_test)

## scale
print('scale numeric')
x_train, x_valid, x_holdout, _ = num_scaling(x_train, x_valid, x_holdout)
X, X_test, _ = num_scaling(X, X_test)

# Attach the two pre-computed feature sets loaded above, split by split.
# They are added after scaling, so they are not imputed or scaled here.
for x, k in zip([x_train, x_valid, x_holdout, X, X_test], data_keys):
    x['sgd'] = sgd[k]
    x['ext'] = ext[k]

Load df
Load agg input
run preprocessing..
run feature engineering..
-- count fraction price_x_region__category_name_frac
-- count fraction price_x_region__param_1_frac
-- count fraction price_x_region__param_2_frac
-- count fraction price_x_region__image_top_1_frac
-- count fraction price_x_city__category_name_frac
-- count fraction price_x_city__param_1_frac
-- count fraction price_x_city__param_2_frac
-- count fraction price_x_city__image_top_1_frac
-- count fraction price_x_image_top_1__category_name_frac
-- count fraction price_x_image_top_1__param_1_frac
-- count fraction price_x_image_top_1__param_2_frac
-- count fraction price_x_population_groups__param_1_frac
-- combine factors: price_log_cut_x_parent_category_name
-- combine factors: price_log_cut_x_category_name
-- combine factors: price_log_cut_x_region
run validation splitting..
-- target encoding: ['region']


  return 1 / (1 + np.exp( (n - self.k) / self.f))


-- target encoding: ['region']
-- target encoding: ['city']
-- target encoding: ['city']
-- target encoding: ['parent_category_name']
-- target encoding: ['parent_category_name']
-- target encoding: ['category_name']
-- target encoding: ['category_name']
-- target encoding: ['param_1']
-- target encoding: ['param_1']
-- target encoding: ['param_2']
-- target encoding: ['param_2']
-- target encoding: ['param_3']
-- target encoding: ['param_3']
-- target encoding: ['user_type']
-- target encoding: ['user_type']
-- target encoding: ['image_top_1']
-- target encoding: ['image_top_1']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_parent_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_category_name']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_log_cut_x_region']
-- target encoding: ['price_exists']
-- target encoding: ['price_exists']
-- target encod

In [3]:
def preprocessing_valid(texts, max_words, maxlen):
    """Tokenize, pad and shuffle ``texts``, then cut them into three splits.

    A ``Tokenizer`` capped at ``max_words`` is fitted on ``texts``; the integer
    sequences are padded to ``maxlen`` and the rows are shuffled after reseeding
    NumPy's global RNG with 10101.

    The first ``n_train - 400000`` rows form the training split (``n_train`` is
    read from the notebook's global scope), the next 300000 rows the validation
    split, and every remaining row the hold-out split. That is 100000 rows when
    ``texts`` holds exactly ``n_train`` items.

    NOTE(review): the numeric features are split by ``validation_split``, which
    is not visible here -- presumably it uses the same permutation; confirm.

    Returns
    -------
    (dict, dict)
        The padded arrays under the keys 'train', 'valid' and 'holdo', and the
        tokenizer's ``word_index``.
    """
    tok = Tokenizer(num_words=max_words)
    tok.fit_on_texts(texts)
    padded = pad_sequences(tok.texts_to_sequences(texts), maxlen=maxlen)

    # Repeatable row permutation (this reseeds the global NumPy RNG).
    order = np.arange(padded.shape[0])
    np.random.seed(10101)
    np.random.shuffle(order)
    padded = padded[order]

    n_fit = n_train - 400000
    valid_end = n_fit + 300000

    splits = {
        'train': padded[:n_fit],
        'valid': padded[n_fit:valid_end],
        'holdo': padded[valid_end:],
    }
    return splits, tok.word_index

def preprocessing_fulltrain(texts, max_words, maxlen):
    """Tokenize and pad all ``texts``, then split them at the train/test boundary.

    A ``Tokenizer`` capped at ``max_words`` is fitted on every text and the
    sequences are padded to ``maxlen``. Rows are not shuffled. The boundary is
    ``n_train``, read from the notebook's global scope.

    Returns
    -------
    (ndarray, ndarray, dict)
        Padded rows ``[:n_train]``, padded rows ``[n_train:]`` and the
        tokenizer's ``word_index``.
    """
    tok = Tokenizer(num_words=max_words)
    tok.fit_on_texts(texts)
    padded = pad_sequences(tok.texts_to_sequences(texts), maxlen=maxlen)

    train_part = padded[:n_train]
    test_part = padded[n_train:]
    return train_part, test_part, tok.word_index

In [4]:
# df_train = pd.read_csv('../input/train.csv')
# labels = df_train['deal_probability'].values.squeeze()
# Number of rows in the full-train frame X; the text helpers read this global.
n_train = len(X)

# del df_train
# gc.collect()

In [5]:
# Load the pickled text-feature frame and keep its `description` column as a list.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../input/text_features_lemm.pkl', 'rb') as f:
    df_text = pickle.load(f)

texts = df_text['description'].tolist()
len(texts)

2011862

In [6]:
# Split the raw descriptions at the train/test boundary.
# Fix: the test part used to be assigned texts[:n_train], a second copy of the
# training slice.
X_text, X_test_text = texts[:n_train], texts[n_train:]

# Padded sequence length and tokenizer vocabulary cap.
# NOTE(review): the output recorded for this cell shows 60 columns, so it was
# produced with a different maxlen than the value set here -- confirm which is
# intended.
maxlen = 50
max_words = 1000

# Train/valid/holdout arrays come from a tokenizer fitted on the training texts only.
# From here on X_text is a dict keyed by split name.
X_text, word_index = preprocessing_valid(X_text, max_words, maxlen)
print('done')
# Full-train/test arrays come from a second tokenizer fitted on all texts, so
# they have their own word index.
X_text['fulltrain'], X_text['test'], word_index_full = preprocessing_fulltrain(texts, max_words, maxlen)
print('done')

X_text['train'].shape, X_text['valid'].shape, X_text['fulltrain'].shape

done
done


((1103424, 60), (300000, 60), (1503424, 60))

In [7]:
# Read the pre-trained word vectors into a {word: float32 vector} dict.
embedding_index = {}
# The context manager closes the file even if parsing raises part-way through.
with open('../input/wiki.ru.vec', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # The first line is skipped (presumably the .vec header -- confirm).
        if i == 0:
            continue
        values = line.split()
        if not values:
            # A blank line has no token to use as the word.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
        except ValueError:
            # Deliberate best effort: skip lines whose remaining fields are not
            # all numeric.
            pass
len(embedding_index)

1776030

In [8]:
# Embedding matrix for the train/valid/holdout tokenizer: row i holds the
# pre-trained vector of the word with index i. Rows stay zero for words without
# a vector; indices >= max_words are skipped.
embedding_dim = 300
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [9]:
def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared difference of the two tensors."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))
       
# def nn_simple():
    
#     model = Sequential()
#     model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
#     model.add(Flatten())
#     model.add(Dense(32, activation='relu'))
#     model.add(Dense(1, activation=None))
#     return model

       
# def nn_gru_simple():
    
#     model = Sequential()
#     model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
#     model.add(CuDNNGRU(128))
#     model.add(Dropout(0.2))
#     model.add(Dense(1, activation=None))
#     return model


# def nn_gru_simple():
    
#     model = Sequential()
#     model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
#     model.add(CuDNNGRU(128))
#     model.add(Dropout(0.2))
#     model.add(Dense(1, activation=None))
#     return model

# def nn_lstm_simple():
    
#     input_text = Input(shape=(maxlen,))
#     x = Embedding(max_words, embedding_dim, input_length=maxlen)(input_text)
#     x = CuDNNLSTM(64, return_sequences=True)(x)
#     x = Dropout(0.1)(x)
#     x = CuDNNLSTM(64)(x)
#     x = Dropout(0.1)(x)
#     x = Dense(1, activation=None)(x)
#     regr_proba = Dense(1, activation=None)(x)
#     model = Model(inputs=[input_text], outputs=[regr_proba])
#     model.layers[1].set_weights([embedding_matrix])
#     model.layers[1].trainable = False    
#     model.summary()
#     return model

In [10]:
# model = nn_lstm_simple()
# model.compile(optimizer='adam', loss=root_mean_squared_error)
# history = model.fit(X['train'], y['train'], epochs=1, batch_size=256, 
#                     validation_data=(X['valid'], y['valid']))

In [11]:
from keras import regularizers
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D

In [12]:
def nn_lstm_conv():
    """Build the two-input LSTM + Conv1D regression model and print its summary.

    The model takes a 58-wide numeric vector and a ``maxlen``-long token
    sequence. ``maxlen``, ``max_words``, ``embedding_dim`` and
    ``embedding_matrix`` are read from the notebook's global scope. The text
    embedding is initialised from ``embedding_matrix`` and frozen.

    Returns the uncompiled ``Model``.
    """
    # Numeric branch.
    # NOTE(review): the numeric input is fed through an Embedding layer, which
    # looks rows up by integer index -- confirm this is intended for these features.
    numeric_in = Input(shape=(58,))
    numeric_emb = Embedding(58, 100)(numeric_in)
    numeric_feat = GlobalMaxPooling1D()(numeric_emb)

    # Text branch: frozen pre-trained embedding -> LSTM -> conv/pool stack.
    text_in = Input(shape=(maxlen,))
    text_emb = Embedding(
        max_words,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(text_in)
    recurrent = CuDNNLSTM(64, return_sequences=True)(text_emb)
    conv_first = Conv1D(32, 5, activation='relu')(recurrent)
    pooled = MaxPooling1D(5)(conv_first)
    conv_second = Conv1D(32, 5, activation='relu')(pooled)
    text_max = GlobalMaxPooling1D()(conv_second)
    text_avg = GlobalAveragePooling1D()(conv_second)

    # Head.
    # NOTE(review): two single-unit linear Dense layers in a row -- the first
    # already reduces everything to one value; confirm this is intended.
    merged = concatenate([numeric_feat, text_max, text_avg])
    bottleneck = Dense(1, activation=None)(merged)
    regr_proba = Dense(1, activation=None)(bottleneck)

    model = Model(inputs=[numeric_in, text_in], outputs=[regr_proba])
    model.summary()
    return model

def nn_lstm_num():
    """Build the two-input bidirectional-LSTM regression model and print its summary.

    The model takes a 58-wide numeric vector and a ``maxlen``-long token
    sequence. ``maxlen``, ``max_words``, ``embedding_dim`` and
    ``embedding_matrix`` are read from the notebook's global scope. The text
    embedding is initialised from ``embedding_matrix`` and frozen.

    Returns the uncompiled ``Model``.
    """
    # Numeric branch.
    # NOTE(review): the numeric input is fed through an Embedding layer, which
    # looks rows up by integer index -- confirm this is intended for these features.
    numeric_in = Input(shape=(58,))
    numeric_emb = Embedding(58, 20)(numeric_in)
    numeric_flat = Flatten()(numeric_emb)
    numeric_feat = Dropout(0.2)(numeric_flat)

    # Text branch: frozen pre-trained embedding -> bidirectional LSTM.
    text_in = Input(shape=(maxlen,))
    text_emb = Embedding(
        max_words,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(text_in)
    text_feat = Bidirectional(CuDNNLSTM(32))(text_emb)

    # Head.
    # NOTE(review): two single-unit linear Dense layers in a row -- confirm intended.
    merged = concatenate([numeric_feat, text_feat])
    hidden = Dense(16, activation='tanh')(merged)
    hidden = Dropout(0.2)(hidden)
    bottleneck = Dense(1, activation=None)(hidden)
    regr_proba = Dense(1, activation=None)(bottleneck)

    model = Model(inputs=[numeric_in, text_in], outputs=[regr_proba])
    model.summary()
    return model

def nn_lstm_top():
    """Build the two-input LSTM regression model for 20 numeric inputs and print its summary.

    The model takes a 20-wide numeric vector and a ``maxlen``-long token
    sequence. ``maxlen``, ``max_words``, ``embedding_dim`` and
    ``embedding_matrix`` are read from the notebook's global scope. The text
    embedding is initialised from ``embedding_matrix`` and frozen.

    Returns the uncompiled ``Model``.
    """
    # Numeric branch.
    # NOTE(review): the numeric input is fed through an Embedding layer, which
    # looks rows up by integer index -- confirm this is intended for these features.
    numeric_in = Input(shape=(20,))
    numeric_emb = Embedding(20, 20)(numeric_in)
    numeric_flat = Flatten()(numeric_emb)
    numeric_feat = Dropout(0.2)(numeric_flat)

    # Text branch: frozen pre-trained embedding -> single LSTM.
    text_in = Input(shape=(maxlen,))
    text_emb = Embedding(
        max_words,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(text_in)
    text_feat = CuDNNLSTM(32)(text_emb)

    # Head.
    # NOTE(review): two single-unit linear Dense layers in a row -- confirm intended.
    merged = concatenate([numeric_feat, text_feat])
    hidden = Dense(16, activation='relu')(merged)
    hidden = Dropout(0.2)(hidden)
    bottleneck = Dense(1, activation=None)(hidden)
    regr_proba = Dense(1, activation=None)(bottleneck)

    model = Model(inputs=[numeric_in, text_in], outputs=[regr_proba])
    model.summary()
    return model

def nn_lstm_conv2():
    """Build the two-input bidirectional-LSTM + three-stage Conv1D regression model.

    The model takes a 58-wide numeric vector and a ``maxlen``-long token
    sequence. ``maxlen``, ``max_words``, ``embedding_dim`` and
    ``embedding_matrix`` are read from the notebook's global scope. The text
    embedding is initialised from ``embedding_matrix`` and frozen. Unlike the
    sibling builders, this one does not print the model summary.

    Returns the uncompiled ``Model``.
    """
    # Numeric branch.
    # NOTE(review): the numeric input is fed through an Embedding layer, which
    # looks rows up by integer index -- confirm this is intended for these features.
    numeric_in = Input(shape=(58,))
    numeric_emb = Embedding(58, 10)(numeric_in)
    numeric_feat = Flatten()(numeric_emb)

    # Text branch: frozen pre-trained embedding -> BiLSTM -> conv/pool stack
    # with kernel sizes 2, 3 and 4.
    text_in = Input(shape=(maxlen,))
    text_emb = Embedding(
        max_words,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )(text_in)
    recurrent = Bidirectional(CuDNNLSTM(64, return_sequences=True))(text_emb)
    stage = Conv1D(64, 2, activation='relu')(recurrent)
    stage = MaxPooling1D(2)(stage)
    stage = Conv1D(64, 3, activation='relu')(stage)
    stage = MaxPooling1D(3)(stage)
    stage = Conv1D(64, 4, activation='relu')(stage)
    text_max = GlobalMaxPooling1D()(stage)
    text_avg = GlobalAveragePooling1D()(stage)

    # Head.
    # NOTE(review): two single-unit linear Dense layers in a row -- confirm intended.
    merged = concatenate([numeric_feat, text_max, text_avg])
    hidden = Dense(64, activation='relu')(merged)
    bottleneck = Dense(1, activation=None)(hidden)
    regr_proba = Dense(1, activation=None)(bottleneck)

    return Model(inputs=[numeric_in, text_in], outputs=[regr_proba])

# Build one model instance from the current embedding_matrix, then run a
# garbage-collection pass (its return value is what the cell displays).
# No later uncommented cell in this notebook reads this `model`; the
# out-of-fold helpers build their own models per fold.
model = nn_lstm_conv2()
gc.collect()

16

In [13]:
# x_train.columns
# top_num = ['item_seq_number', 'salaries', 'population', 'price_log_cut',
#        'isn_log_cut', 'region_mean', 'city_mean', 'parent_category_name_mean',
#        'category_name_mean', 'param_1_mean', 'param_2_mean', 'param_3_mean',
#        'user_type_mean', 'image_top_1_mean',
#        'price_log_cut_x_parent_category_name_mean',
#        'price_log_cut_x_category_name_mean', 'price_log_cut_x_region_mean',
#        'population_groups_mean',
#        'sgd', 'ext']

# x_train.shape, X_text['train'].shape, y_train.shape

In [14]:
# model.load_weights("../input/conv_w.h5")
# print(rmse(y_valid, model.predict([x_valid, X_text['valid']])))
# print(rmse(y_holdout, model.predict([x_holdout, X_text['holdo']])))

In [15]:
# model.compile(optimizer='adam', loss=root_mean_squared_error)
# history = model.fit([x_train, X_text['train']], y_train,
#                     epochs=1, batch_size=512,
#                     validation_data=([x_valid, X_text['valid']], y_valid))

# print(rmse(y_valid, model.predict([x_valid, X_text['valid']])))
# print(rmse(y_holdout, model.predict([x_holdout, X_text['holdo']])))

In [16]:
# model.save_weights('../input/conv_w.h5')

In [17]:
# model.save('../input/conv_model.h5')

In [18]:
# del model
# gc.collect()

In [19]:
# a = np.zeros(10)
# idx = [1,2,3]
# b = np.ones((3,1))
# a[idx] = b.squeeze()

In [20]:
def oof_prediction_keras(nn, data, text_dict, y, nfolds=4):
    """Fit ``nn`` on K folds of the training split and collect its predictions.

    Parameters
    ----------
    nn : callable
        Zero-argument factory returning a fresh, uncompiled two-input model.
    data : list of ndarray
        Numeric arrays ``[train, valid]`` or ``[train, valid, holdout]``.
    text_dict : dict
        Text arrays under 'train', 'valid' and, when a holdout array is given,
        'holdo'.
    y : ndarray
        Targets for the training rows.
    nfolds : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    list of ndarray
        Out-of-fold predictions for the training rows, the fold-averaged
        validation predictions and, when a third array was given, the
        fold-averaged holdout predictions.

    ``KFold`` and ``rmse`` are resolved from the notebook's global scope. The
    mean and standard deviation of the per-fold errors are printed.
    """
    with_holdout = len(data) == 3

    oof_pred = np.zeros(data[0].shape[0])
    valid_sum = np.zeros(data[1].shape[0])
    fold_errors = np.zeros(nfolds)
    if with_holdout:
        holdo_sum = np.zeros(data[2].shape[0])

    for fold, (fit_idx, oof_idx) in enumerate(KFold(nfolds).split(data[0])):
        fit_inputs = [data[0][fit_idx], text_dict['train'][fit_idx]]
        oof_inputs = [data[0][oof_idx], text_dict['train'][oof_idx]]
        y_fit, y_oof = y[fit_idx], y[oof_idx]

        # A fresh model per fold, trained for three epochs.
        model = nn()
        model.compile(optimizer='adam', loss=root_mean_squared_error)
        model.fit(fit_inputs, y_fit, epochs=3, batch_size=512)

        # Held-out rows get this fold's prediction; the other splits accumulate
        # a running sum that is averaged after the loop.
        oof_raw = model.predict(oof_inputs)
        oof_pred[oof_idx] = oof_raw.squeeze()
        valid_sum += model.predict([data[1], text_dict['valid']]).squeeze()
        if with_holdout:
            holdo_sum += model.predict([data[2], text_dict['holdo']]).squeeze()
        fold_errors[fold] = rmse(y_oof, oof_raw)

    print("{:.5f}+-{:.5f}".format(fold_errors.mean(), fold_errors.std()))

    valid_sum /= nfolds
    results = [oof_pred, valid_sum]
    if with_holdout:
        holdo_sum /= nfolds
        results.append(holdo_sum)
    return results

In [21]:
# Out-of-fold run on the train split; validation and holdout predictions are
# averaged over the folds. X_text is the dict holding the 'train', 'valid' and
# 'holdo' text arrays. preds ends up as [train, valid, holdout].
data = [x_train.values, x_valid.values, x_holdout.values]
preds = oof_prediction_keras(nn_lstm_conv2, data, X_text, y_train)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.22844+-0.00075


In [22]:
def oof_prediction_keras(nn, data, text_data, y, nfolds=4):
    """Fit ``nn`` on K folds of the first array pair and collect its predictions.

    This definition shares its name with the dict-based, holdout-aware variant
    defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    nn : callable
        Zero-argument factory returning a fresh, uncompiled two-input model.
    data : sequence of ndarray
        Numeric arrays ``[fit, predict]``.
    text_data : sequence of ndarray
        Text arrays in the same ``[fit, predict]`` order.
    y : ndarray
        Targets for the rows of ``data[0]``.
    nfolds : int
        Number of folds passed to ``KFold``.

    Returns
    -------
    list of ndarray
        Out-of-fold predictions for ``data[0]`` and the fold-averaged
        predictions for ``data[1]``.

    ``KFold`` and ``rmse`` are resolved from the notebook's global scope. The
    mean and standard deviation of the per-fold errors are printed.
    """
    fit_num, other_num = data[0], data[1]
    fit_text, other_text = text_data[0], text_data[1]

    oof_pred = np.zeros(fit_num.shape[0])
    other_sum = np.zeros(other_num.shape[0])
    fold_errors = np.zeros(nfolds)

    for fold, (fit_idx, oof_idx) in enumerate(KFold(nfolds).split(fit_num)):
        fit_inputs = [fit_num[fit_idx], fit_text[fit_idx]]
        oof_inputs = [fit_num[oof_idx], fit_text[oof_idx]]
        y_fit, y_oof = y[fit_idx], y[oof_idx]

        # A fresh model per fold, trained for three epochs.
        model = nn()
        model.compile(optimizer='adam', loss=root_mean_squared_error)
        model.fit(fit_inputs, y_fit, epochs=3, batch_size=512)

        # Held-out rows get this fold's prediction; the second array
        # accumulates a running sum that is averaged after the loop.
        oof_raw = model.predict(oof_inputs)
        oof_pred[oof_idx] = oof_raw.squeeze()
        other_sum += model.predict([other_num, other_text]).squeeze()
        fold_errors[fold] = rmse(y_oof, oof_raw)

    print("{:.5f}+-{:.5f}".format(fold_errors.mean(), fold_errors.std()))

    other_sum /= nfolds
    return [oof_pred, other_sum]

# Rebuild the embedding matrix for the tokenizer fitted on all texts
# (word_index_full). This overwrites the global embedding_matrix that the model
# builders read. Rows stay zero for words without a pre-trained vector; indices
# >= max_words are skipped.
embedding_dim = 300
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index_full.items():
    if i >= max_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [23]:
# Bare string expression: the cell only echoes 'done' as its output.
'done'

'done'

In [24]:
# Full-train / test run: out-of-fold predictions on the full training set and
# fold-averaged predictions on the test set.
data = [X.values, X_test.values]
# preds[:3] keeps only the [train, valid, holdout] entries from the earlier run,
# so re-running this cell replaces the fulltrain/test entries. The previous
# `preds +=` appended a new pair on every run.
preds = preds[:3] + oof_prediction_keras(nn_lstm_conv2, data, [X_text['fulltrain'], X_text['test']], y)
print('done')

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.22911+-0.00146
done


In [25]:
# Map each prediction array to its split name and pickle the result.
pred_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']

# zip() silently truncates to the shorter input, so a wrong number of arrays
# would write an incomplete or misaligned file; fail loudly instead.
if len(preds) != len(pred_keys):
    raise ValueError(
        'expected {} prediction arrays ({}), got {}'.format(
            len(pred_keys), ', '.join(pred_keys), len(preds)))

d_preds = dict(zip(pred_keys, preds))

with open('../fe/rnn2.pkl', 'wb') as file:
    pickle.dump(obj=d_preds, file=file)

In [26]:
# # save for blending 
# blending = {}
# blending['valid'] = model.predict(x_valid).clip(0, 1)
# blending['holdout'] = model.predict(x_holdout).clip(0, 1)

# # TODO model fit full data
# blending['test'] = full_train_model.predict(X_test).clip(0, 1)
# with open('../blending/lg1.pkl', 'wb') as f: pickle.dump(obj=blending, file=f)