In [1]:
import pandas as pd
import numpy as np 
import lightgbm as lg

import gc 
#from copy import deepcopy
import pickle
import warnings
warnings.filterwarnings('ignore')

from avito_functions import *
from avito_classes import TargetEncoder

#from sklearn.model_selection import KFold
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, load_npz
from scipy import sparse
from nltk.corpus import stopwords
from sklearn.preprocessing import OneHotEncoder
import scipy 

import time 
import datetime
%matplotlib inline

import itertools

In [2]:
new_features = pd.read_csv('../input/new_feautures.csv').iloc[:, 1:]
# new_features.head()

In [3]:
# input 

data_keys = ['train', 'valid', 'holdout', 'fulltrain', 'test']

print('Load df')
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

print('Load agg input')
with open('../input/map_dict.pkl', 'rb') as file: map_dict = pickle.load(file)
#with open('../input/text_features.pkl', 'rb') as f: X_text = pickle.load(f)
# with open('../input/tfidf_1.pkl', 'rb') as file: tfidf_dict = pickle.load(file)

# with open('../input/text_num_features_clean.pkl', 'rb') as f: X_text_num = pickle.load(f)
with open('../input/text_num_features_lemm.pkl', 'rb') as f: X_text_num = pickle.load(f)

Load df
Load agg input


In [119]:
sgd = load_fe('sgd2')
ext = load_fe('extra')
# rnn = load_fe('rnn')

In [120]:
def preprocessing(df_train, df_test, map_dict, add_features=None):

    print('run preprocessing..')
    
    target = 'deal_probability'

    # get labels, merge 
    y = df_train[target].values.squeeze()
    X = df_train.drop([target], 1).append(df_test)
    X.index = np.arange(X.shape[0])

    # map additional information
    X['salaries'] = X.region.map(map_dict['salaries'])
    X['population'] = X.city.map(map_dict['population'])

    # merge additional features
    if not add_features is None:
        X = pd.concat([X, add_features], 1)
    
    # drop useless features 
    X = X.drop(['title', 'item_id'], 1)
   
    category_features = ['region', 'city', 
                         'parent_category_name', 'category_name', 
                         'param_1', 'param_2', 'param_3', 
                         'user_type', 'image_top_1', 'user_id']

    return X, y, category_features


def feature_engineering(X, category_features, factorize=False, price_bins=10):
    
    print('run feature engineering..')
    new_factors = []
    
    # numeric transformations 
    X['user_type_num'] = pd.factorize(X['user_type'])[0]
    X['price_log'] = np.log(X.price+0.0001)
    X['population_log'] = np.log(X.population+0.0001)
    X['isn_log'] = np.log(X.item_seq_number+0.0001)
    X['price_log_div_salaries'] = X['price_log'] / X['salaries']
    
    # bool
    X['price_exists'] = (~X.price.isnull()).astype(int)
    X['image_exists'] = (~X.image.isnull()).astype(int)
    X['descr_exists'] = (~X.description.isnull()).astype(int)
    
    X['population_groups'] = \
    ((X.population >= 1000000) * 1 + \
    ((X.population >= 500000) & (X.population < 1000000)) * 2 + \
    ((X.population >= 100000) & (X.population < 500000)) * 3 + \
    (X.population < 100000) * 4 - 1)     
    
    # date
    dt = pd.to_datetime(X.activation_date)
    X['weekday'] = dt.dt.weekday
    X['free_day'] = (dt.dt.dayofweek > 5).astype(int)  
    
    # groups, numeric 
    X = count_group_frac(X, 'price_log', ['region', 'category_name'])
    X = count_group_frac(X, 'price_log', ['region', 'param_1'])    
    X = count_group_frac(X, 'price_log', ['population_groups', 'category_name'])
    X = count_group_frac(X, 'price_log', ['population_groups', 'param_1'])
    X = count_group_frac(X, 'price_log', ['city', 'category_name'])
    X = count_group_frac(X, 'price_log', ['city', 'param_1'])
    
    # cutting
    X['price_log_cut'] = pd.cut(X['price_log'], bins=price_bins).cat.codes
    X['isn_log_cut'] = pd.cut(pd.Series(np.log(X.item_seq_number+0.0001)), 7).cat.codes

    # features
    new_factors += ['price_exists', 'image_exists', 'descr_exists']
    new_factors += ['weekday', 'free_day']
    
    X.drop(['activation_date', 'image', 'description'], axis=1, inplace=True)
    X.drop(['price', 'item_seq_number', 'population'], axis=1, inplace=True)
    
    category_features += new_factors
    category_features += ['population_groups']

    if factorize==True:
        for f in category_features:
            X[f] = pd.factorize(X[f])[0]
    if factorize=='pos':
        for f in category_features:
            X, _ = factorize_p(X, f)
    
    return X, category_features


In [125]:
add_features = pd.concat([X_text_num, new_features], axis=1)

In [127]:
# pipeline
n_train = df_train.shape[0]
X, y, category_features = preprocessing(df_train, df_test, map_dict, add_features)
category_features += ['Time_zone']
X, category_features = feature_engineering(X, category_features, factorize=True)

run preprocessing..
run feature engineering..
-- count fraction price_log_x_region__category_name_frac
-- count fraction price_log_x_region__param_1_frac
-- count fraction price_log_x_population_groups__category_name_frac
-- count fraction price_log_x_population_groups__param_1_frac
-- count fraction price_log_x_city__category_name_frac
-- count fraction price_log_x_city__param_1_frac


In [131]:
# X.salaries.hist()

In [132]:
# one hot encoding fit
# oh_encoder = OneHotEncoder(handle_unknown='ignore')
# oh_encoder.fit(X[category_features])

In [133]:
# split
X, X_test = X[:n_train], X[n_train:]

x_train, x_valid, x_holdout, \
y_train, y_valid, y_holdout, \
_, _, _ = validation_split(X, y)
    
del df_train, df_test
gc.collect()

run validation splitting..


1630

In [134]:
# # target encoding 
# te_groups = []
# te_groups += [[f] for f in category_features]

# te_groups += [['price_log_cut', 'category_name'], 
#               ['price_log_cut', 'region'],
#               ['price_log_cut', 'param_1'],
#               ['region', 'parent_category_name']
#              ]

# for group in te_groups:
#     x_train, x_valid, x_holdout = target_encoding(x_train, y_train, x_valid, group, x_holdout)
#     X, X_test = target_encoding(X, y, X_test, group)

In [135]:
# add artificial features
for x, k in zip([x_train, x_valid, x_holdout, X, X_test], data_keys):
    x['sgd'] = sgd[k]
    x['ext'] = ext[k]
#     x['rnn'] = rnn[k]

In [136]:
# def num_scaling(x_train, x_valid, x_holdout=None, num=None, mode='z', copy=True):
    
#     if mode=='z':
#         scaler = StandardScaler(copy=copy)
#     elif mode=='norm':
#         scaler = MinMaxScaler(feature_range=(0,1))
#     else:
#         pass
    
#     if num is None:
#         num = x_train.columns.tolist()
    
#     # fit 
#     x_train[num] = scaler.fit_transform(x_train[num])
#     # transform valid 
#     x_valid[num] = scaler.transform(x_valid[num])
    
#     # transform holdout
#     if x_holdout is None:
#         return x_train, x_valid, scaler
#     else:
#         x_holdout[num] = scaler.transform(x_holdout[num])
#         return x_train, x_valid, x_holdout, scaler

In [137]:
# num_ft = list(set(x_train.columns) - set(category_features))

# ## impute 
# print('impute numeric')
# x_train, x_valid, x_holdout, _ = num_fillna(x_train, x_valid, x_holdout, num=num_ft)
# X, X_test, _ = num_fillna(X, X_test, num=num_ft)

# ## scale
# print('scale numeric')
# x_train, x_valid, x_holdout, _ = num_scaling(x_train, x_valid, x_holdout, num=num_ft)
# X, X_test, _ = num_scaling(X, X_test, num=num_ft)

In [138]:
def num_auto_fe(X, fe_list):
    """
    X -- pandas dataframe
    """
    for f1, f2 in itertools.combinations(fe_list, 2):
        X[f1+'_x_'+f2] = X[f1] * X[f2]
        X[f1+'_div_'+f2] = X[f1] / X[f2]
    for f in fe_list:
        X['p2_'+f] = np.power(X[f],2)
        
    return X

num_fe = ['salaries', 'price_log', 'population_log', 'isn_log']

x_train = num_auto_fe(x_train, num_fe)
x_valid = num_auto_fe(x_valid, num_fe)
x_holdout = num_auto_fe(x_holdout, num_fe)
X = num_auto_fe(X, num_fe)
X_test = num_auto_fe(X_test, num_fe)

In [139]:
del x_holdout
gc.collect()

210

In [140]:
# def proj_num_on_cat(train_df, test_df, target_column, group_column):
#     """
#     :param train_df: train data frame
#     :param test_df:  test data frame
#     :param target_column: name of numerical feature
#     :param group_column: name of categorical feature
#     """
#     train_df['row_id'] = range(train_df.shape[0])
#     test_df['row_id'] = range(test_df.shape[0])
#     train_df['train'] = 1
#     test_df['train'] = 0
#     all_df = train_df[['row_id', 'train', target_column, group_column]].append(test_df[['row_id','train',
#                                                                                         target_column, group_column]])
#     grouped = all_df[[target_column, group_column]].groupby(group_column)
#     the_size = pd.DataFrame(grouped.size()).reset_index()
#     the_size.columns = [group_column, '%s_size' % target_column]
#     the_mean = pd.DataFrame(grouped.mean()).reset_index()
#     the_mean.columns = [group_column, '%s_mean' % target_column]
#     the_std = pd.DataFrame(grouped.std()).reset_index().fillna(0)
#     the_std.columns = [group_column, '%s_std' % target_column]
#     the_median = pd.DataFrame(grouped.median()).reset_index()
#     the_median.columns = [group_column, '%s_median' % target_column]
#     the_stats = pd.merge(the_size, the_mean)
#     the_stats = pd.merge(the_stats, the_std)
#     the_stats = pd.merge(the_stats, the_median)

#     the_max = pd.DataFrame(grouped.max()).reset_index()
#     the_max.columns = [group_column, '%s_max' % target_column]
#     the_min = pd.DataFrame(grouped.min()).reset_index()
#     the_min.columns = [group_column, '%s_min' % target_column]

#     the_stats = pd.merge(the_stats, the_max)
#     the_stats = pd.merge(the_stats, the_min)

#     all_df = pd.merge(all_df, the_stats, how='left')

#     selected_train = all_df[all_df['train'] == 1]
#     selected_test = all_df[all_df['train'] == 0]
#     selected_train.sort_values('row_id', inplace=True)
#     selected_test.sort_values('row_id', inplace=True)
#     selected_train.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)
#     selected_test.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)

#     selected_train, selected_test = np.array(selected_train), np.array(selected_test)
#     #print(selected_train.shape, selected_test.shape)
#     return selected_train, selected_test

# proj_num = ['salaries', 'price_log', 'population', 'isn_log', 'sgd', 'ext']
# proj_cat = ['image_top_1', 'city', 'param_1', 'region']

# # feature aggregation valid
# train_list, test_list = [], []
# for t in proj_num:
#     for g in proj_cat:
#         if t != g:
#             print('.', end='')
#             s_train, s_test = proj_num_on_cat(x_train, x_valid, target_column=t, group_column=g)
#             train_list.append(s_train)
#             test_list.append(s_test)
            
# train_proj = csr_matrix(np.hstack(train_list))
# valid_proj = csr_matrix(np.hstack(test_list))
# print('valid done')    

# # feature aggregation full
# train_list, test_list = [], []
# for t in proj_num:
#     for g in proj_cat:
#         if t != g:
#             print('.', end='')
#             s_train, s_test = proj_num_on_cat(X, X_test, target_column=t, group_column=g)
#             train_list.append(s_train)
#             test_list.append(s_test)
            
# fulltrain_proj = csr_matrix(np.hstack(train_list))
# test_proj = csr_matrix(np.hstack(test_list))
# print('fulltrain done')

In [141]:
# for x in [x_train, x_valid, X, X_test]:
#     x.drop(['row_id', 'train'], 1, inplace=True)

# print(x_train.shape, x_valid.shape, X.shape, X_test.shape)

# cols = x_train.columns.tolist() + ['proj'+str(i) for i in range(1, train_proj.shape[1]+1)]
cols = x_train.columns.tolist()
print(len(cols))
# train_proj, valid_proj, fulltrain_proj, test_proj

68


In [142]:
# x_train = hstack([x_train.values, train_proj])
# x_valid = hstack([x_valid.values, valid_proj])
# X = hstack([X.values, fulltrain_proj])
# X_test = hstack([X_test.values, test_proj])

print(x_train.shape, x_valid.shape, X.shape, X_test.shape)

(1103424, 68) (300000, 68) (1503424, 68) (508438, 68)


TFIDF

In [143]:
# russian_stop = set(stopwords.words('russian'))

# def get_col(col_name): return lambda x: x[col_name]
# def tfidf_pipeline(max_features=[20000, 10000, 10000], 
#                    min_df=10, max_df=.9, sub_tf=True, smooth_idf=False,
#                    ngram_range = (1, 2), stop=russian_stop
#                   ):

#     tfidf_param = {
#         "stop_words": stop,
#         "analyzer": 'word',
#         "token_pattern": r'\w{1,}',
#         "sublinear_tf": sub_tf,
#         "dtype": np.float32,
#         "norm": 'l2',
#         "min_df": min_df,
#         "max_df": max_df,
#         "smooth_idf": smooth_idf
#     }
    
#     vectorizer = FeatureUnion([
#             ('description',TfidfVectorizer(
#                 ngram_range=ngram_range,
#                 max_features=max_features[0],
#                 **tfidf_param,
#                 preprocessor=get_col('description'))),
#             ('text_feat',CountVectorizer(
#                 ngram_range=ngram_range,
#                 max_features=max_features[1],
#                 min_df=min_df,
#                 preprocessor=get_col('text_feat'))),
#             ('title',TfidfVectorizer(
#                 ngram_range=ngram_range,
#                 max_features=max_features[2],
#                 **tfidf_param,
#                 preprocessor=get_col('title')))
#         ])
    
#     # tfidf validation
#     start_vect=time.time()

#     train = vectorizer.fit_transform(x_train_text.to_dict('records'))
#     val = vectorizer.transform(x_valid_text.to_dict('records'))
#     tfvocab = vectorizer.get_feature_names()

#     print("Train shape:", train.shape)
#     print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))
#     start_vect=time.time()

#     cols = x_train.columns.tolist() + tfvocab
#     train = hstack([csr_matrix(x_train.values), 
#                                  train])
#     valid = hstack([csr_matrix(x_valid.values), 
#                                  val])
#     ds_train = lg.Dataset(train, y_train, feature_name=cols, categorical_feature=category_features)
#     ds_valid = lg.Dataset(valid, y_valid, feature_name=cols, categorical_feature=category_features)

#     model, evals_results = lg_train(lg_params, ds_train, ds_valid, 1000, 50,
#                                     verbose_eval=1000)

#     print("Train Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))
#     return evals_results['valid']['rmse'][model.best_iteration-1]

# lg_params = {   
#     'nthread': 4,
#     'objective': 'regression',
#     'metric': 'rmse',
#     'learning_rate': 0.2,
#     'num_leaves': 200, 
#     'subsample': 0.75, 
#     'colsample_bytree': 0.6,
#     'min_child_weight': 20,    
# }

# # TEXT pipeline

# with open('../input/text_features_clean.pkl', 'rb') as f: X_text = pickle.load(f)
# x_train_text, x_valid_text, x_holdout_text, \
# _, _, _, \
# _, _, _ = validation_split(X_text[:X.shape[0]], y)

# grid_params = [
#     {'min_df': 10, 'max_features': [100000, None, None]},
# ]
# grid_error = []
# for param in grid_params:
#     print("-- param:", param)
#     error = tfidf_pipeline(**param)
#     grid_error.append((param, error))
#     print("error:", error)

In [144]:
# grid_error

^ TFIDF grid ^

In [145]:


# stop = set(stopwords.words('russian'))
# min_df = 10
# max_df = .9
# smooth_idf = False
# sub_tf = True
# ngram_range = (1,2)
# max_features = [100000, None, None]

# tfidf_param = {
#     "stop_words": stop,
#     "analyzer": 'word',
#     "token_pattern": r'\w{1,}',
#     "sublinear_tf": sub_tf,
#     "dtype": np.float32,
#     "norm": 'l2',
#     "min_df": min_df,
#     "max_df": max_df,
#     "smooth_idf": smooth_idf
# }

# def get_col(col_name): return lambda x: x[col_name]

# vectorizer = FeatureUnion([
#         ('description',TfidfVectorizer(
#             ngram_range=ngram_range,
#             max_features=max_features[0],
#             **tfidf_param,
#             preprocessor=get_col('description'))),
#         ('text_feat',CountVectorizer(
#             ngram_range=ngram_range,
#             max_features=max_features[1],
#             min_df=min_df,
#             preprocessor=get_col('text_feat'))),
#         ('title',TfidfVectorizer(
#             ngram_range=ngram_range,
#             max_features=max_features[2],
#             **tfidf_param,
#             preprocessor=get_col('title')))
#     ])

In [146]:
# with open('../input/text_features_clean.pkl', 'rb') as f: X_text = pickle.load(f)
# with open('../input/text_features_lemm.pkl', 'rb') as f: X_text = pickle.load(f)
    
# x_train_text, x_valid_text, x_holdout_text, \
# _, _, _, \
# _, _, _ = validation_split(X_text[:X.shape[0]], y)

In [147]:
# # tfidf validation
# print('tfidf train')
# start_vect=time.time()

# x_train_text = vectorizer.fit_transform(x_train_text.to_dict('records'))
# x_valid_text = vectorizer.transform(x_valid_text.to_dict('records'))
# x_holdout_text = vectorizer.transform(x_holdout_text.to_dict('records'))
# tfvocab = vectorizer.get_feature_names()

# print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

# # tfidf full train
# print('tfidf full train')
# X_text_ = vectorizer.fit_transform(X_text[:X.shape[0]].to_dict('records'))
# X_test_text = vectorizer.transform(X_text[X.shape[0]:].to_dict('records'))
# tfvocab_full = vectorizer.get_feature_names()  

# X_text = X_text_

In [148]:
# tfidf_dict = {}
# tfidf_dict['train'] = x_train_text
# tfidf_dict['valid'] = x_valid_text
# tfidf_dict['holdout'] = x_holdout_text
# tfidf_dict['tfvocab'] = tfvocab

# tfidf_dict['fulltrain'] = X_text
# tfidf_dict['test'] = X_test_text
# tfidf_dict['tfvocab_full'] = tfvocab_full

# with open('../input/tfidf_2.pkl', 'wb') as file: pickle.dump(file=file, obj=tfidf_dict)


In [149]:
# load tfidf features 
    
with open('../input/tfidf_2.pkl', 'rb') as file: tfidf_dict=pickle.load(file=file)
    
x_train_text = tfidf_dict['train']
x_valid_text = tfidf_dict['valid']
x_holdout_text = tfidf_dict['holdout']
tfvocab = tfidf_dict['tfvocab']
cols_train = cols + tfvocab

x_train = hstack([csr_matrix(x_train.values), x_train_text])
x_valid = hstack([csr_matrix(x_valid.values), x_valid_text])
x_holdout = hstack([csr_matrix(x_holdout.values), x_holdout_text])

# x_train = hstack([x_train, x_train_text])
# x_valid = hstack([x_valid, x_valid_text])
# x_holdout = hstack([x_holdout, x_holdout_text])

del x_train_text, x_valid_text, x_holdout_text

### full 

X_text = tfidf_dict['fulltrain']
X_test_text = tfidf_dict['test']
tfvocab_full = tfidf_dict['tfvocab_full']

cols_fulltrain = cols + tfvocab_full

X = hstack([csr_matrix(X.values), X_text])
X_test = hstack([csr_matrix(X_test.values), X_test_text])

# X = hstack([X, X_text])
# X_test = hstack([X_test, X_test_text])
del X_text, X_test_text

# del tfidf_dict
gc.collect()

16

TRAIN LGBM

In [150]:
# def lg_pipeline(params):    
#     num_boost_rounds = 2000
#     early_stopping_rounds = 20
#     verbose = 50
#     model, evals_results = lg_train(params, ds_train, ds_valid, 
#                              num_boost_rounds, early_stopping_rounds, verbose_eval=verbose)
#     error = evals_results['valid']['rmse'][model.best_iteration-1]
#     h_error = rmse(y_holdout, model.predict(x_holdout))
#     return float(str(np.round(error, 5))), float(str(np.round(h_error, 5)))

# lg_params = {
#     'nthread': 2,
#     'objective': 'regression',
#     'metric': 'rmse',
#     'learning_rate': 0.2,
#     'num_leaves': 127,
#     'max_depth': 0,
#     'subsample': 0.95, 
#     'bagging_freq': 1,
#     'feature_fraction': 0.4,
    
#     'min_child_weight': 10,
#     'lambda_l1': 2,
    
#     'cat_l2': 20,
#     'cat_smooth': 50,
#     'min_data_per_group': 100
# }

# grid_params = [
#     {'lambda_l1':3},
# ]

# grid_error = []
# for param in grid_params:
#     start_vect=time.time()
#     print("-- param:", param)
#     # train lgbm
#     default = lg_params.copy()
#     default.update(**param)
#     error, h_error = lg_pipeline(default)
#     runtime = float(str(np.round((time.time() - start_vect)/60, 1))) 
#     # save & print
#     grid_error.append((param, error, h_error, runtime))
#     print("  error:", error)
#     print("h error:", h_error)
#     print("Train Runtime: %0.1f Minutes"%(runtime))
    

^ GRID SEARCH LGBM ^

In [194]:
assert X.shape[1] == X_test.shape[1]
assert x_train.shape[1] == x_valid.shape[1]
# assert x_train.shape[1] == x_holdout.shape[1]
assert X_test.shape[1] == X.shape[1]
print(x_train.shape)

# category_features = []

ds_train = lg.Dataset(x_train, y_train, feature_name=cols_train, categorical_feature=category_features)
ds_valid = lg.Dataset(x_valid, y_valid, feature_name=cols_train, categorical_feature=category_features)

# ds_train = lg.Dataset(x_train, y_train, feature_name=cols)
# ds_valid = lg.Dataset(x_valid, y_valid, feature_name=cols)


(1103424, 144630)


In [195]:
gc.collect()
str(datetime.datetime.now())

'2018-06-27 17:23:05.896776'

In [196]:
def lg_train(lg_params, xgtrain, xgvalid, num_boost_round, early_stopping_rounds, callbacks=None, verbose_eval=10):
    evals_results = {}
    bst = lg.train(lg_params,                      
                   xgtrain, 
                   valid_sets=[xgtrain, xgvalid], 
                   valid_names=['train', 'valid'], 
                   evals_result=evals_results, 
                   num_boost_round=num_boost_round,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose_eval=verbose_eval,
                   callbacks = callbacks
                  )
    return bst, evals_results

num_boost_rounds = 5000
# lr_decay = [0.02] * 10 + [0.2] * 90 + [0.1] * 600 + [0.05] * 1300 + [0.02] * 500
lr_decay = [0.025] * num_boost_rounds
callbacks = [lg.reset_parameter(learning_rate = lr_decay)]

In [203]:
lg_params = {
    'nthread': 4,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.025,
    'num_leaves': 250,
    'max_depth': 0,
    'subsample': 0.9, 
    'bagging_freq': 1,
    'feature_fraction': 0.3,
    
    'min_child_weight': 10,
    'lambda_l1': 2,
    
    'cat_l2': 10,
    'cat_smooth': 50,
    'min_data_per_group': 50,
    
    'seed': 0
}

# early_stopping_rounds = 50
# model, evals_results = lg_train(lg_params, ds_train, ds_valid, num_boost_rounds, early_stopping_rounds,
#                                 verbose_eval=50, callbacks=callbacks)

# print("bst.best_iteration: ", model.best_iteration)
# print(evals_results['valid']['rmse'][model.best_iteration-1])

# str(datetime.datetime.now())

In [204]:
# print("test error:", rmse(y_holdout, model.predict(x_holdout)))
# lg.plot_importance(model, figsize=(12, 15), max_num_features=50)

In [205]:
# del ds_train, ds_valid, x_train, x_valid
# gc.collect()

In [57]:
# save for blending 
# blending = {}
# blending['valid'] = model.predict(x_valid).clip(0, 1)
# blending['holdout'] = model.predict(x_holdout).clip(0, 1)

In [206]:
# train model on full data
print(str(datetime.datetime.now()))
lr_decay = [0.025] * 2500
callbacks = [lg.reset_parameter(learning_rate = lr_decay)]

ds_train_full = lg.Dataset(X, y, feature_name=cols_fulltrain, categorical_feature=category_features)
full_train_model = lg.train(lg_params, ds_train_full, num_boost_round=2500, 
                            verbose_eval=200, valid_sets=[ds_train_full], valid_names=['train'], 
                            callbacks=callbacks
                           )
str(datetime.datetime.now())

[200]	train's rmse: 0.214871
[400]	train's rmse: 0.210102
[600]	train's rmse: 0.206792
[800]	train's rmse: 0.204097
[1000]	train's rmse: 0.201598
[1200]	train's rmse: 0.199288
[1400]	train's rmse: 0.197171
[1600]	train's rmse: 0.195104
[1800]	train's rmse: 0.19316
[2000]	train's rmse: 0.191287
[2200]	train's rmse: 0.189559
[2400]	train's rmse: 0.187877


'2018-06-27 23:24:24.909337'

In [59]:
# blending['test'] = full_train_model.predict(X_test).clip(0, 1)
# with open('../blending/lg18.pkl', 'wb') as f: pickle.dump(obj=blending, file=f)

In [207]:
sub_name = 'lg22'

import os 
if not os.path.exists('../sub'):
    os.mkdir('../sub')

df_sample = pd.read_csv("../input/sample_submission.csv")
df_sample.deal_probability = full_train_model.predict(X_test).clip(0, 1)
df_sample.to_csv('../sub/' + sub_name + '.csv', index=False)

print('submitted', sub_name)
str(datetime.datetime.now())

submitted lg22


'2018-06-27 23:35:08.607499'