In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# NOTE(review): with no category or module filter this hides every warning for the
# rest of the kernel session, deprecation and pandas usage warnings included.
warnings.filterwarnings("ignore")

import os, gc
import seaborn as sns

# Show up to 200 rows / 200 columns when a DataFrame is rendered.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that stretches elements with the `.container` class to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Global seaborn theme: white grid, "deep" palette, fonts scaled by 2.
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif',
        font_scale=2, color_codes=True, rc=None)

In [2]:
# Input files, relative to the notebook's working directory.
train_test_path = 'data/train_test.csv'
submission_path = 'data/sample_submission.csv'

# `train_test` is the feature table; `sub_df` is the submission template that
# is copied and filled in when the predictions are saved.
train_test = pd.read_csv(train_test_path)
sub_df = pd.read_csv(submission_path)

(train_test.shape, sub_df.shape)

((488189, 62), (146765, 2))

In [3]:
# Number of leading rows of `train_test` treated as training data; rows from
# this position onwards are used as the test set.
train_len = 341424

# Label column and the member id, both restricted to the training rows.
# The member id is kept so it can serve as the grouping key for the CV split.
target = train_test['amount_spent_per_room_night_scaled'].iloc[:train_len]
memberids = train_test['memberid'].iloc[:train_len]

(target.shape, memberids.shape)

((341424,), (341424,))

In [4]:
from sklearn.model_selection import GroupKFold

# Grouped 5-fold split keyed on member id, so that no member appears on both
# sides of a split. Only the FIRST fold is used as the hold-out set; `next`
# takes it directly instead of looping and breaking.
group_kfold1 = GroupKFold(n_splits=5)
train_index, test_index = next(
    group_kfold1.split(train_test.iloc[:train_len], target, memberids)
)

# GroupKFold yields positional row numbers, so select with `.iloc` rather than
# label-based `[]`; this stays correct even if `target` loses its default index.
y_tr, y_val = target.iloc[train_index], target.iloc[test_index]

train_index.shape, test_index.shape, y_tr.shape, y_val.shape

((273139,), (68285,), (273139,), (68285,))

In [5]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

def model_lgb(X_train, y_train, X_valid, y_valid, test, features):
    """Fit a LightGBM regressor on one train/validation split and predict `test`.

    Parameters
    ----------
    X_train, X_valid, test : column-indexable frames; only `features` is used.
    y_train, y_valid : labels for `X_train` / `X_valid`.
    features : column names fed to the model.

    Returns
    -------
    (clf, oof, pred) : the object returned by `lgb.train`, its predictions on
        `X_valid[features]`, and its predictions on `test[features]`, both taken
        at `clf.best_iteration`.

    Prints `X_train.shape` and the validation RMSE multiplied by 100.
    """
    print(X_train.shape)

    # Raw values from hyper-parameter search; they are sanitised below exactly
    # as the search objective did (clamp to valid range, truncate to int).
    tuned = {
        'b_frac'       : 0.3677516358370858,
        'data_in_leaf' : 495.44417416221626,
        'f_frac'       : 0.5422060360159515,
        'hessian'      : 5.039378213231793,
        'l1'           : 1.0642598045225304,
        'l2'           : 2.564544963055539,
        'leaves'       : 89.62655396835916,
        'split_gain'   : 0.16542750189034394,
    }

    def _unit_interval(value):
        return max(min(value, 1), 0)

    # NOTE(review): LightGBM only applies `bagging_fraction` when `bagging_freq`
    # is non-zero, and no `bagging_freq` is set here -- confirm whether bagging
    # was meant to be active.
    lgb_params = {
        'feature_fraction'        : _unit_interval(tuned['f_frac']),
        'bagging_fraction'        : _unit_interval(tuned['b_frac']),
        'lambda_l1'               : max(tuned['l1'], 0),
        'lambda_l2'               : max(tuned['l2'], 0),
        'min_split_gain'          : tuned['split_gain'],
        'num_leaves'              : int(tuned['leaves']),
        'min_data_in_leaf'        : int(tuned['data_in_leaf']),
        'min_sum_hessian_in_leaf' : max(tuned['hessian'], 0),
    }
    # Fixed settings that were not part of the search.
    lgb_params.update({
        'max_bins'               : 63,
        'learning_rate'          : 0.01,
        'num_threads'            : 4,
        'metric'                 : 'rmse',
        'boost'                  : 'gbdt',
        'tree_learner'           : 'serial',
        'objective'              : 'root_mean_squared_error',
        'verbosity'              : 1,
    })

    dtrain = lgb.Dataset(X_train[features], label=y_train, feature_name=list(features))
    dvalid = lgb.Dataset(X_valid[features], label=y_valid, feature_name=list(features))

    # Up to 10000 rounds; stop once the last validation set has not improved
    # for 25 rounds, logging every 200 rounds.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
    # `lgb.train` in LightGBM 4.0 (replaced by callbacks) -- pin the version or
    # migrate before upgrading.
    clf = lgb.train(lgb_params, dtrain, num_boost_round=10000,
                    valid_sets=[dtrain, dvalid],
                    verbose_eval=200,
                    early_stopping_rounds=25)

    best_round = clf.best_iteration
    oof = clf.predict(X_valid[features], num_iteration=best_round)
    valid_mse = mean_squared_error(y_valid, oof)
    print("rmse: ", np.sqrt(valid_mse)*100)

    pred = clf.predict(test[features], num_iteration=best_round)

    return clf, oof, pred

In [6]:
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error

def model_cat(X_train, y_train, X_valid, y_valid, test, features):
    """Fit a CatBoost regressor on one train/validation split and predict `test`.

    Parameters
    ----------
    X_train, X_valid, test : column-indexable frames; only `features` is used.
    y_train, y_valid : labels for `X_train` / `X_valid`.
    features : column names fed to the model.

    Returns
    -------
    (clf, oof, pred) : the fitted `CatBoostRegressor`, its predictions on
        `X_valid[features]`, and its predictions on `test[features]`.

    Prints `X_train.shape` and the validation RMSE multiplied by 100.
    """
    print(X_train.shape)

    # Raw values from hyper-parameter search, sanitised below the same way the
    # search objective did. `r_str` is above 1, so the clamp pins it to 1.
    tuned = {
        'b_temp': 0.08307474720468191,
        'depth' : 7.596402546589758,
        'l2'    : 3.9791105400066655,
        'r_str' : 1.1206250787323229,
    }

    cat_params = {
        'random_strength'     : max(min(tuned['r_str'], 1), 0),
        'bagging_temperature' : max(tuned['b_temp'], 0),
        'l2_leaf_reg'         : max(tuned['l2'], 0),
        'depth'               : int(tuned['depth']),
    }
    # Fixed settings that were not part of the search.
    cat_params.update({
        'border_count'          : 63,
        'early_stopping_rounds' : 50,
        'random_seed'           : 1337,
        'task_type'             : 'CPU',
        'loss_function'         : "RMSE",
        'iterations'            : 10000,
        'learning_rate'         : 0.01,
        'thread_count'          : 4,
    })

    # All features are passed as plain numeric columns (no cat_features).
    train_pool = Pool(X_train[features], label=y_train)
    valid_pool = Pool(X_valid[features], label=y_valid)

    # Both pools are evaluated each round; in the training log the training
    # pool appears as "test" and the validation pool as "test1".
    clf = CatBoostRegressor(**cat_params)
    clf.fit(train_pool,
            eval_set=[train_pool, valid_pool],
            verbose=500,
            use_best_model=True)

    oof = clf.predict(X_valid[features])
    valid_mse = mean_squared_error(y_valid, oof)
    print("rmse: ", np.sqrt(valid_mse)*100)

    pred = clf.predict(test[features])

    return clf, oof, pred



In [7]:
# Feature columns excluded from modelling.
cols_to_drop = ['res_staResidence_median', 'tr_flag', 'reservationstatusid_code',
                'res_resortType_median', 'res_stResort_median', 'res_cluster_median']
# The label, the date columns and the member id are not model inputs either
# (label and member id were already copied out into `target` / `memberids`).
non_feature_cols = ['amount_spent_per_room_night_scaled',
                    'booking_date', 'checkin_date', 'checkout_date', 'memberid']

train_test = train_test.drop(cols_to_drop + non_feature_cols, axis=1)

# Replace every remaining column by its integer category codes
# (pandas assigns code -1 to missing values).
for feature_col in train_test.columns:
    train_test[feature_col] = train_test[feature_col].astype('category').cat.codes


In [8]:
# Positional row selection: the fold's index arrays give the training and
# validation frames; everything from `train_len` onwards is the test frame,
# re-indexed from zero.
train_, valid_ = train_test.iloc[train_index], train_test.iloc[test_index]
test_df = train_test.iloc[train_len:].reset_index(drop=True)

(train_.shape, valid_.shape, test_df.shape)

((273139, 51), (68285, 51), (146765, 51))

In [9]:
# LightGBM on the single hold-out fold, using every remaining column as a feature.
clf1, oof1, pred1 = model_lgb(
    X_train=train_, y_train=y_tr,
    X_valid=valid_, y_valid=y_val,
    test=test_df, features=train_.columns,
)

(273139, 51)
Training until validation scores don't improve for 25 rounds.
[200]	training's rmse: 0.988628	valid_1's rmse: 0.993064
[400]	training's rmse: 0.970306	valid_1's rmse: 0.979044
[600]	training's rmse: 0.962145	valid_1's rmse: 0.974981
[800]	training's rmse: 0.956503	valid_1's rmse: 0.973268
[1000]	training's rmse: 0.951924	valid_1's rmse: 0.972372
[1200]	training's rmse: 0.948081	valid_1's rmse: 0.971955
[1400]	training's rmse: 0.944721	valid_1's rmse: 0.971741
Early stopping, best iteration is:
[1547]	training's rmse: 0.94235	valid_1's rmse: 0.971603
rmse:  97.16025150314397


In [10]:
# CatBoost on the same hold-out fold and feature set as the LightGBM run.
clf2, oof2, pred2 = model_cat(
    X_train=train_, y_train=y_tr,
    X_valid=valid_, y_valid=y_val,
    test=test_df, features=train_.columns,
)
# Score noted from a previous run -- rmse:  97.10604730539535

(273139, 51)
0:	learn: 7.7196219	test: 7.7196219	test1: 7.7263838	best: 7.7263838 (0)	total: 86.8ms	remaining: 14m 28s
500:	learn: 0.9884650	test: 0.9884650	test1: 0.9912968	best: 0.9912968 (500)	total: 25.2s	remaining: 7m 58s
1000:	learn: 0.9742311	test: 0.9742311	test1: 0.9790774	best: 0.9790774 (1000)	total: 50.4s	remaining: 7m 32s
1500:	learn: 0.9686144	test: 0.9686144	test1: 0.9758477	best: 0.9758477 (1500)	total: 1m 18s	remaining: 7m 24s
2000:	learn: 0.9644463	test: 0.9644463	test1: 0.9742194	best: 0.9742194 (2000)	total: 1m 42s	remaining: 6m 49s
2500:	learn: 0.9608381	test: 0.9608381	test1: 0.9733244	best: 0.9733244 (2500)	total: 2m 7s	remaining: 6m 20s
3000:	learn: 0.9575685	test: 0.9575685	test1: 0.9727127	best: 0.9727127 (3000)	total: 2m 31s	remaining: 5m 53s
3500:	learn: 0.9546080	test: 0.9546080	test1: 0.9723188	best: 0.9723186 (3499)	total: 2m 55s	remaining: 5m 25s
4000:	learn: 0.9518666	test: 0.9518666	test1: 0.9720984	best: 0.9720984 (4000)	total: 3m 19s	remaining: 4m 58

In [11]:
# Force a garbage-collection pass after training; the displayed value is the
# number of unreachable objects the collector found.
gc.collect()

60

In [15]:
# One column per model: test-set predictions and hold-out (validation) predictions.
pred_all = pd.DataFrame({'pred1': pred1, 'pred2': pred2})
oof_all = pd.DataFrame({'oof1': oof1, 'oof2': oof2})

# Equal-weight blend scored on the hold-out fold, reported as RMSE * 100.
# Arguments follow sklearn's (y_true, y_pred) order.
blend_oof = oof_all.mean(axis=1)
print("Combined", 100*np.sqrt(mean_squared_error(y_val, blend_oof)))

sub_df_combine = sub_df.copy()
# Bracket assignment always writes the column; attribute assignment would only
# set a Python attribute if the column name were missing from the template.
# Passing raw values avoids index alignment and fails loudly on a length mismatch.
sub_df_combine['amount_spent_per_room_night_scaled'] = pred_all.mean(axis=1).values

sub_df_combine.to_csv('final_submissions.csv', index=False)


print("================")
print("Saved Prediction")
print("================")


Combined 97.11157945200361
Saved Prediction
