In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os, gc
import seaborn as sns

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [None]:
!ls data

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df  = pd.read_csv('data/test.csv')
sub_df   = pd.read_csv('data/sample_submission.csv')
info_df  = pd.read_csv('data/data_dict.csv')

train_df.shape, test_df.shape, sub_df.shape, info_df.shape

((341424, 24), (146765, 23), (146765, 2), (24, 2))

In [None]:
train_df.info()

In [5]:
train_df[['booking_date','checkin_date','checkout_date']]

Unnamed: 0,booking_date,checkin_date,checkout_date
0,05/04/18,05/04/18,06/04/18
1,23/01/15,11/04/15,16/04/15
2,28/01/15,01/02/15,05/02/15
3,02/05/15,11/06/15,16/06/15
4,02/09/15,14/12/15,19/12/15
5,01/12/15,12/01/16,13/01/16
6,20/01/16,20/01/16,21/01/16
7,22/08/16,20/10/16,25/10/16
8,10/09/16,18/09/16,24/09/16
9,03/08/16,21/11/16,24/11/16


In [15]:
len(set(train_df['booking_date'].unique())), len(set(test_df['booking_date'].unique())),\
len(set(train_df['booking_date'].unique()).intersection(set(test_df['booking_date'].unique())))

(1621, 1619, 1616)

In [16]:
len(set(train_df['checkin_date'].unique())), len(set(test_df['checkin_date'].unique())),\
len(set(train_df['checkin_date'].unique()).intersection(set(test_df['checkin_date'].unique())))

(1532, 1523, 1521)

In [4]:
train_df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,resort_type_code,room_type_booked_code,roomnights,season_holidayed_code,state_code_residence,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,05/04/18,05/04/18,06/04/18,3,1,2,0,46,3,3,3,1,2.0,7.0,3,3,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,C,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,23/01/15,11/04/15,16/04/15,1,1,2,0,46,3,3,4,5,2.0,7.0,5,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,A,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,28/01/15,01/02/15,05/02/15,1,1,2,0,47,1,5,4,4,2.0,7.0,1,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,E,A,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,02/05/15,11/06/15,16/06/15,1,1,2,2,46,2,2,3,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,02/09/15,14/12/15,19/12/15,1,1,2,0,46,2,2,4,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,7.059346


In [None]:
train_df.apply(lambda x: pd.Series.value_counts(x).shape[0])

# Important points:

- There are only `season_holidayed_code` & `state_code_residence` columns that contains `nan` values.
- All variables are categorical except `ids` (which are `hash code`) and `amount_spent_per_room_night_scaled` (which is `float` and `target`) 

In [None]:
# train_df.groupby('memberid')['amount_spent_per_room_night_scaled'].sum()

In [None]:
info_df

In [None]:
train_df.amount_spent_per_room_night_scaled.plot(kind='hist')

In [None]:
plt.plot(train_df.amount_spent_per_room_night_scaled, '-p')

In [None]:
print(len(set(train_df.memberid)), len(set(test_df.memberid)), \
len(set(train_df.memberid).intersection(set(test_df.memberid))))

print(len(set(train_df.reservation_id)), len(set(test_df.reservation_id)), \
len(set(train_df.reservation_id).intersection(set(test_df.reservation_id))))

In [None]:

import pandas as pd
import numpy as np
import os, gc
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from catboost import Pool, CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb


def train_lgb_model(X_train, y_train, X_valid, y_valid, features, param, X_test, num_round):
    """
    Args:
        X_train, X_valid: training and valid data
        y_train, y_valid: training and valid target
        X_test: test-data
        features: training features
    Return:
        oof-pred, test_preds model, model_imp
    """
    _train = lgb.Dataset(X_train[features], label=y_train, feature_name=list(features))
    _valid = lgb.Dataset(X_valid[features], label=y_valid,feature_name=list(features))
    
    clf = lgb.train(param, _train, num_round, 
                    valid_sets = [_train, _valid], 
                    verbose_eval=200, 
                    early_stopping_rounds = 25)                  
    
    oof = clf.predict(X_valid[features], num_iteration=clf.best_iteration)
    test_pred = clf.predict(X_test[features], num_iteration=clf.best_iteration)
    
    lgb_imp = pd.DataFrame(data=[clf.feature_name(), list(clf.feature_importance())]).T
    lgb_imp.columns = ['feature','imp']
    
    return oof, test_pred, clf, lgb_imp
    



def run_cv_lgb(train_df, target, test_df, leaves=None):

    param = {
        'bagging_freq'           : 5,
        'bagging_fraction'       : 0.33,
        'boost_from_average'     : 'false',
        'boost'                  : 'gbdt',
        'feature_fraction'       : 0.3,
        'learning_rate'          : 0.01,
        'max_depth'              : -1,
        'metric'                 : 'rmse',
        'min_data_in_leaf'       : 100,
#         'min_sum_hessian_in_leaf': 10.0,
        'num_leaves'             : 30,
        'num_threads'            : 4,
        'tree_learner'           : 'serial',
        'objective'              : 'root_mean_squared_error',
        'verbosity'              : 1,
    #     'lambda_l1'              : 0.001,
        'lambda_l2'              : 0.1
    }   
    if leaves is not None:
        param['num_leaves'] = leaves
        print("using leaves: ", param['num_leaves'])

    random_seed = 1234
    n_splits = 3
    num_round = 10000
    feature_imp = pd.DataFrame()
    
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    oof_lgb = np.zeros(len(train_df))
    predictions = np.zeros((len(test_df),n_splits))

    clfs = []
    
    for fold_, (train_index, valid_index) in enumerate(folds.split(train_df, target)):
        print(train_index.shape, valid_index.shape)
        print("Fold {}".format(fold_))
    
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        X_train, X_valid = train_df.iloc[train_index,:], train_df.iloc[valid_index,:]
        features = X_train.columns
        
#         X_train.drop(['disbursal_week','disbursal_day'], axis=1, inplace=True)
#         X_valid.drop(['disbursal_week','disbursal_day'], axis=1, inplace=True)

        num_round = 10000
        oof, test_pred, clf, lgb_imp = train_lgb_model(X_train, y_train, 
                                                       X_valid, y_valid, 
                                                       features, param, 
                                                       test_df, num_round)
        lgb_imp['fold'] = fold_
        feature_imp = pd.concat([feature_imp, lgb_imp], axis=0)
    
        oof_lgb[valid_index] = oof
        predictions[:,fold_] = test_pred
        clfs.append(clf)
        
        score = mean_squared_error(y_valid, oof)
        print( "  rmse = ", 100*np.sqrt(score) )
        print("="*60)
    
    feature_imp.imp = feature_imp.imp.astype('float')
    feature_imp = feature_imp.groupby(['feature'])['imp'].mean()
    feature_imp = pd.DataFrame(data=[feature_imp.index, feature_imp.values]).T
    feature_imp.columns=['feature','imp']
    feature_imp = feature_imp.sort_values(by='imp')

    return clfs, feature_imp, oof_lgb, predictions




In [None]:
import copy

train_df1 = copy.deepcopy(train_df)
test_df1 = copy.deepcopy(test_df)


In [None]:
train_df1.drop(['reservation_id', 'memberid'], axis=1, inplace=True)
test_df1.drop(['reservation_id', 'memberid'], axis=1, inplace=True)

target = train_df.amount_spent_per_room_night_scaled
train_df1.drop('amount_spent_per_room_night_scaled', axis=1, inplace=True)

train_df1 = train_df1.astype('object')
test_df1  = test_df1.astype('object')


In [None]:
complete_df = pd.concat([train_df1, test_df1], axis=0).reset_index(drop=True)

for col in complete_df.columns:
    complete_df[col] = complete_df[col].astype('category').cat.codes
    
del train_df1, test_df1
gc.collect()

train_df1 = complete_df.iloc[:train_df.shape[0]]
test_df1  = complete_df.iloc[train_df.shape[0]:]

gc.collect()

In [74]:
clfs_lgb, imp_lgb, oof_lgb, pred_lgb = run_cv_lgb(train_df1, target, 
                                                  test_df1, leaves=50)


using leaves:  50
(227616,) (113808,)
Fold 0
Training until validation scores don't improve for 25 rounds.
[200]	training's rmse: 1.46449	valid_1's rmse: 1.46804
[400]	training's rmse: 1.03143	valid_1's rmse: 1.0345
[600]	training's rmse: 1.0144	valid_1's rmse: 1.01885
[800]	training's rmse: 1.00885	valid_1's rmse: 1.01489
[1000]	training's rmse: 1.00497	valid_1's rmse: 1.01259
[1200]	training's rmse: 1.00098	valid_1's rmse: 1.01027
[1400]	training's rmse: 0.9973	valid_1's rmse: 1.00813
[1600]	training's rmse: 0.992678	valid_1's rmse: 1.00504
[1800]	training's rmse: 0.990019	valid_1's rmse: 1.0038
[2000]	training's rmse: 0.986463	valid_1's rmse: 1.00165
[2200]	training's rmse: 0.984166	valid_1's rmse: 1.00078
[2400]	training's rmse: 0.981085	valid_1's rmse: 0.99911
[2600]	training's rmse: 0.978644	valid_1's rmse: 0.998053
[2800]	training's rmse: 0.97589	valid_1's rmse: 0.996625
[3000]	training's rmse: 0.973695	valid_1's rmse: 0.995726
[3200]	training's rmse: 0.9711	valid_1's rmse: 0.99

In [65]:
from catboost import Pool, CatBoostClassifier, CatBoostRegressor

def train_cat_model(X_train, y_train, X_valid, y_valid, features, param, X_test, 
                    num_round):
    """
    Args:
        X_train, X_valid: training and valid data
        y_train, y_valid: training and valid target
        X_test: test-data
        features: training features
    Return:
        oof-pred, test_preds, model, model_imp
    """
    param['iterations'] = num_round
    
    _train = Pool(X_train[features], label=y_train)#, cat_features=cate_features_index)
    _valid = Pool(X_valid[features], label=y_valid)#, cat_features=cate_features_index)

    watchlist = [_train, _valid]
    clf = CatBoostRegressor(**param)
    clf.fit(_train, 
            eval_set=watchlist, 
            verbose=200,
            use_best_model=True)
        
    oof  = clf.predict(X_valid[features])
    test_pred  = clf.predict(X_test[features])

    cat_imp = pd.DataFrame(data=[clf.feature_names_, 
                                 list(clf.feature_importances_)]).T
    cat_imp.columns = ['feature','imp']
    
    return oof, test_pred, clf, cat_imp


def run_cv_cat(train_df, target, test_df, depth):

    params = {
        'loss_function'         : "RMSE", 
#         'eval_metric'           : "AUC",
        'random_strength'       : 1.5,
        'border_count'          : 128,
#         'scale_pos_weight'      : 3.507,
        'depth'                 : depth, 
        'early_stopping_rounds' : 50,
        'random_seed'           : 1337,
        'task_type'             : 'CPU', 
#         'subsample'             = 0.7, 
        'iterations'            : 10000, 
        'learning_rate'         : 0.09,
        'thread_count'          : 4
    }


    ##########################
    n_splits = 3
    random_seed = 1234
    feature_imp = pd.DataFrame()
    
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    oof_cat = np.zeros(len(train_df))
    predictions = np.zeros((len(test_df),n_splits))
    clfs = []
##########################
    for fold_, (train_index, valid_index) in enumerate(folds.split(train_df, target)):
        print(train_index.shape, valid_index.shape)
        print("Fold {}".format(fold_))
    
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        X_train, X_valid = train_df.iloc[train_index,:], train_df.iloc[valid_index,:]
        features = X_train.columns
        
        num_rounds = 10000
        oof, test_pred, clf, cat_imp = train_cat_model(X_train, y_train, 
                                                       X_valid, y_valid, 
                                                       features, params, 
                                                       test_df, num_rounds)
    
        oof_cat[valid_index] = oof
        predictions[:,fold_] = test_pred
        
        cat_imp['fold'] = fold_
        feature_imp = pd.concat([feature_imp, cat_imp], axis=0)
        clfs.append(clf)
        
        score = mean_squared_error(y_valid, oof)
        print( "  auc = ", 100*np.sqrt(score) )
        print("="*60)
    
    feature_imp.imp = feature_imp.imp.astype('float')
    feature_imp = feature_imp.groupby(['feature'])['imp'].mean()
    feature_imp = pd.DataFrame(data=[feature_imp.index, feature_imp.values]).T
    feature_imp.columns=['feature','imp']
    feature_imp = feature_imp.sort_values(by='imp')

    return clfs, feature_imp, oof_cat, predictions



In [75]:
clfs_cat, imp_cat, oof_cat, pred_cat = run_cv_cat(train_df1, target, test_df1, 4)


(227616,) (113808,)
Fold 0
0:	learn: 7.1087168	test: 7.1087168	test1: 7.1130523	best: 7.1130523 (0)	total: 20.8ms	remaining: 3m 27s
200:	learn: 1.0182283	test: 1.0182283	test1: 1.0183900	best: 1.0183900 (200)	total: 6.07s	remaining: 4m 56s
400:	learn: 1.0051033	test: 1.0051033	test1: 1.0070166	best: 1.0070166 (400)	total: 12.1s	remaining: 4m 49s
600:	learn: 0.9972679	test: 0.9972679	test1: 1.0008241	best: 1.0008216 (599)	total: 18s	remaining: 4m 41s
800:	learn: 0.9920203	test: 0.9920203	test1: 0.9968622	best: 0.9968622 (800)	total: 23.8s	remaining: 4m 33s
1000:	learn: 0.9881551	test: 0.9881551	test1: 0.9941965	best: 0.9941965 (1000)	total: 29.6s	remaining: 4m 26s
1200:	learn: 0.9847933	test: 0.9847933	test1: 0.9921452	best: 0.9921452 (1200)	total: 35.5s	remaining: 4m 20s
1400:	learn: 0.9824931	test: 0.9824931	test1: 0.9911621	best: 0.9911561 (1399)	total: 41.4s	remaining: 4m 14s
1600:	learn: 0.9803233	test: 0.9803233	test1: 0.9900051	best: 0.9900044 (1599)	total: 47.2s	remaining: 4m 7s

In [85]:

def train_xgb_model(X_train, y_train, X_valid, y_valid, features, param, X_test, 
                    num_round):
    """
    Args:
        X_train, X_valid: training and valid data
        y_train, y_valid: training and valid target
        X_test: test-data
        features: training features
    Return:
        oof-pred, test_preds, model, model_imp
    """
    _train = xgb.DMatrix(X_train[features], label=y_train, feature_names=list(features))
    _valid = xgb.DMatrix(X_valid[features], label=y_valid,feature_names=list(features))
    
    watchlist = [(_valid, 'valid')]
    clf = xgb.train(dtrain=_train, 
                    num_boost_round=num_round, 
                    evals=watchlist,
                    early_stopping_rounds=25, 
                    verbose_eval=200, 
                    params=param)
    
    valid_frame = xgb.DMatrix(X_valid[features],feature_names=list(features))
    oof  = clf.predict(valid_frame, ntree_limit=clf.best_ntree_limit)


    test_frame = xgb.DMatrix(X_test[features],feature_names=list(features))
    test_pred = clf.predict(test_frame, ntree_limit=clf.best_ntree_limit)

    
    xgb_imp = pd.DataFrame(data=[list(clf.get_fscore().keys()), 
                                 list(clf.get_fscore().values())]).T
    xgb_imp.columns = ['feature','imp']
    xgb_imp.imp = xgb_imp.imp.astype('float')
    
    return oof, test_pred, clf, xgb_imp


def run_cv_xgb(train_df, target, test_df, depth):

    features = train_df.columns
    params = {
        'eval_metric'     : 'rmse',
        'seed'            : 1337,
        'eta'             : 0.05,
        'subsample'       : 0.7,
        'colsample_bytree': 0.5,
        'silent'          : 1,
        'nthread'         : 4,
#         'Scale_pos_weight': 3.607,
#         'objective'       : 'reg:squarederror',
        'max_depth'       : depth,
        'alpha'           : 0.05
    }
    
    n_splits = 3
    random_seed = 1234
    feature_imp = pd.DataFrame()
    
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
    oof_xgb = np.zeros(len(train_df))
    predictions = np.zeros((len(test_df),n_splits))
    clfs = []
##########################
    for fold_, (train_index, valid_index) in enumerate(folds.split(train_df, target)):
        print(train_index.shape, valid_index.shape)
        print("Fold {}".format(fold_))
    
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        X_train, X_valid = train_df.iloc[train_index,:], train_df.iloc[valid_index,:]
        features = X_train.columns
        

        num_rounds = 10000
        oof, test_pred, clf, xgb_imp = train_xgb_model(X_train, y_train, 
                                                       X_valid, y_valid, 
                                                       features, params, 
                                                       test_df, num_rounds)
        
        xgb_imp['fold'] = fold_
        feature_imp = pd.concat([feature_imp, xgb_imp], axis=0)
    
        oof_xgb[valid_index] = oof
        predictions[:,fold_] = test_pred
        clfs.append(clf)
        
        score = mean_squared_error(y_valid, oof)
        print( "  auc = ", 100*np.sqrt(score) )
        print("="*60)
    
    feature_imp.imp = feature_imp.imp.astype('float')
    feature_imp = feature_imp.groupby(['feature'])['imp'].mean()
    feature_imp = pd.DataFrame(data=[feature_imp.index, feature_imp.values]).T
    feature_imp.columns=['feature','imp']
    feature_imp = feature_imp.sort_values(by='imp')


    return clfs, feature_imp, oof_xgb, predictions


In [89]:
clfs_xgb, imp_xgb, oof_xgb, pred_xgb = run_cv_xgb(train_df1, target, test_df1, 4)


(227616,) (113808,)
Fold 0
[0]	valid-rmse:6.94846
Will train until valid-rmse hasn't improved in 25 rounds.
[200]	valid-rmse:1.02016
[400]	valid-rmse:1.01442
[600]	valid-rmse:1.01005
[800]	valid-rmse:1.00453
[1000]	valid-rmse:0.999952
[1200]	valid-rmse:0.996541
[1400]	valid-rmse:0.994686
[1600]	valid-rmse:0.992758
[1800]	valid-rmse:0.991263
[2000]	valid-rmse:0.990132
[2200]	valid-rmse:0.989238
[2400]	valid-rmse:0.988517
[2600]	valid-rmse:0.987852
Stopping. Best iteration:
[2596]	valid-rmse:0.987844

  auc =  98.78438239224694
(227616,) (113808,)
Fold 1
[0]	valid-rmse:6.94318
Will train until valid-rmse hasn't improved in 25 rounds.
[200]	valid-rmse:1.02129
[400]	valid-rmse:1.01675
[600]	valid-rmse:1.01156
[800]	valid-rmse:1.00631
[1000]	valid-rmse:1.00209
[1200]	valid-rmse:0.998696
[1400]	valid-rmse:0.9965
[1600]	valid-rmse:0.994389
[1800]	valid-rmse:0.99262
[2000]	valid-rmse:0.991479
[2200]	valid-rmse:0.990637
[2400]	valid-rmse:0.989865
Stopping. Best iteration:
[2461]	valid-rmse:0.98

In [None]:
sns.pairplot(train_df1)

<seaborn.axisgrid.PairGrid at 0x7f43b8b77e10>