In [1]:
import gc
import time
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

In [2]:
%%time
# Directory holding the input CSVs; file names are appended to it verbatim.
PATH="../input/"
# Read the three tables in one pass: training rows, test rows and the sample submission.
train, test, subm_df = (
    pd.read_csv(PATH+csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

CPU times: user 3.52 s, sys: 288 ms, total: 3.8 s
Wall time: 3.85 s


In [3]:
# Name of the column the model is trained to predict.
targetcol = 'amount_spent_per_room_night_scaled'
# Stand-alone handle on the target values. The column itself stays inside
# `train`; it is listed in `exclude_cols` when the feature list is built.
target = train.loc[:, targetcol]

In [4]:
# Peek at the first rows to check the raw column layout (dates are still strings here).
train.head(n=5)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,resort_type_code,room_type_booked_code,roomnights,season_holidayed_code,state_code_residence,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,05/04/18,05/04/18,06/04/18,3,1,2,0,46,3,3,3,1,2.0,7.0,3,3,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,C,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,23/01/15,11/04/15,16/04/15,1,1,2,0,46,3,3,4,5,2.0,7.0,5,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,A,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,28/01/15,01/02/15,05/02/15,1,1,2,0,47,1,5,4,4,2.0,7.0,1,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,E,A,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,02/05/15,11/06/15,16/06/15,1,1,2,2,46,2,2,3,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,02/09/15,14/12/15,19/12/15,1,1,2,0,46,2,2,4,5,2.0,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,7.059346


In [5]:
#preprocessing
def preprocdate(data,coltemplate):
    """Add calendar features derived from the ``<coltemplate>_date`` column.

    The date column is parsed in place (``dd/mm/yy``) unless it already holds
    datetimes, then four integer columns are added to ``data``:
    ``<coltemplate>_month``, ``<coltemplate>_year``, ``<coltemplate>_dayofweek``
    (Monday=0) and ``<coltemplate>_weekend`` (1 for Saturday/Sunday, else 0).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``<coltemplate>_date``; it is modified in place.
    coltemplate : str
        Column prefix, e.g. ``'checkin'`` for ``'checkin_date'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    col = coltemplate+'_date'

    # Parse whenever the column is not datetime yet. Testing for "not datetime"
    # instead of "dtype is object" also covers string columns whose dtype is not
    # reported as 'object', which would otherwise reach `.dt` unparsed and fail.
    if not pd.api.types.is_datetime64_any_dtype(data[col]):
        print('Date type conversion start')
        data[col]=pd.to_datetime(data[col],format='%d/%m/%y')
        print('Date type conversion complete')

    dates = data[col].dt
    data[coltemplate+'_month']=dates.month
    data[coltemplate+'_year']=dates.year
    data[coltemplate+'_dayofweek']=dates.dayofweek
    # dayofweek is 0..6 starting on Monday, so 5 and 6 are Saturday and Sunday.
    data[coltemplate+'_weekend'] = (data[coltemplate+'_dayofweek'] >= 5).astype('int')

    return data

def onehotenc(data,cols):
    """One-hot encode ``cols`` and append the indicator columns to ``data``.

    For every distinct value ``v`` of a column ``c`` in ``cols`` a float column
    named ``c_v`` (1.0 where ``data[c] == v``, else 0.0) is added to ``data``.
    The source columns are kept.

    The encoding is done with ``pandas.get_dummies`` rather than
    ``sklearn.preprocessing.OneHotEncoder``: the encoder arguments previously
    used here (``n_values``, ``categorical_features``, ``sparse``) and
    ``get_feature_names`` are not accepted by current scikit-learn releases.
    Missing values get all-zero indicators instead of a category of their own.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the indicator columns added.
    """
    cols = list(cols)
    if not cols:
        # Nothing to encode; leave the frame untouched.
        return data

    dummies = pd.get_dummies(data[cols], columns=cols, prefix=cols,
                             prefix_sep='_', dtype=float)

    for enc_col in dummies.columns:
        # Assign positionally (plain array) so a non-unique index on `data`
        # cannot trigger label alignment.
        data[enc_col] = dummies[enc_col].values

    return data
    
#factorize string data
def preproc(train,test):
    """Apply the shared preprocessing to train and test together.

    Both frames are stacked so that categorical codes are consistent across
    them, then:

    * ``member_age_buckets``, ``cluster_code``, ``reservationstatusid_code`` and
      ``resort_id`` are replaced by integer codes from ``pd.factorize``
      (missing values become -1);
    * ``checkin_date``, ``checkout_date`` and ``booking_date`` are parsed and
      expanded into month / year / dayofweek / weekend columns by
      ``preprocdate``.

    The caller's frames are not modified: the ``istrain`` marker used to split
    the stacked frame again is added to temporary copies only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames containing the columns listed above.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Preprocessed train and test frames, each an independent object that
        keeps its original index labels. Because the frames are stacked with
        ``pd.concat``, each result carries the union of both inputs' columns
        (columns missing from one input are NaN there).
    """
    # Tag the origin on copies so the inputs stay untouched.
    combined = pd.concat([train.assign(istrain=1), test.assign(istrain=0)])

    fact_cols = ['member_age_buckets','cluster_code','reservationstatusid_code','resort_id']

    print('Factorize Start...')
    for col in fact_cols:
        print('col:',col)
        combined[col],indexer=pd.factorize(combined[col])

    print('Date Preproc Start...')
    date_cols = ['checkin','checkout','booking']
    for col in date_cols:
        print('col:',col)
        combined = preprocdate(combined,col)

    # drop() returns new frames, so later column assignments on the results do
    # not write into slices of `combined`.
    train = combined[combined['istrain']==1].drop('istrain',axis=1)
    test = combined[combined['istrain']==0].drop('istrain',axis=1)

    del combined; gc.collect()

    return train,test

In [6]:
# Rebind both frames to their preprocessed versions (factorized categoricals,
# parsed dates and the derived month/year/dayofweek/weekend columns).
train,test=preproc(train,test)

Factorize Start...
col: member_age_buckets
col: cluster_code
col: reservationstatusid_code
col: resort_id
Date Preproc Start...
col: checkin
Date type conversion start
Date type conversion complete
col: checkout
Date type conversion start
Date type conversion complete
col: booking
Date type conversion start
Date type conversion complete


In [7]:
def getenccolname(colname,cols_agg):
    """Build the name of the target-encoding column for ``colname``.

    ``cols_agg`` is probed with ``in`` for each aggregation tag below, in
    order, and the first hit selects the prefix. With a string argument this is
    a substring test, with a list it is a membership test. If no tag matches
    (e.g. ``'mean'``) the plain ``targetenc_`` prefix is used.

    Returns the prefixed column name as a string.
    """
    # Order matters: the first matching tag wins.
    tag_prefixes = (
        ('var', "targetvarenc_"),
        ('std', "targetstdenc_"),
        ('sum', "targetsumenc_"),
        ('min', "targetminenc_"),
        ('max', "targetmaxenc_"),
        ('median', "targetmedianenc_"),
        ('count', "targetcountenc_"),
        ('iqmean', "targetiqmeanenc_"),
    )

    for tag, prefix in tag_prefixes:
        if tag in cols_agg:
            return prefix+colname

    return "targetenc_"+colname

def droptargetenccols(train, val,test):
    """Remove previously generated target-encoding columns from all three frames.

    Any column whose name contains ``'targetenc'`` or ``'targetstdenc'`` is
    dropped **in place**. The list of columns is computed separately for each
    frame, so a frame that lacks some (or all) of the encoding columns present
    in another one is handled without raising ``KeyError``.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        Frames to clean; each is modified in place.

    Returns
    -------
    tuple
        ``(train, val, test)`` - the same objects that were passed in.
    """
    for frame in (train, val, test):
        stale_cols = [col for col in frame.columns
                      if ('targetenc' in col) or ('targetstdenc' in col)]
        frame.drop(stale_cols,axis=1,inplace=True)

    return train, val,test

In [8]:
def impact_coding(data, feature, target, n_folds=20, n_inner_folds=10):
    """Nested-CV mean target encoding ("impact coding") of one column.

    ``data`` is split into ``n_folds`` outer folds. For every outer fold the
    in-fold rows are split again into ``n_inner_folds`` inner folds; the
    per-category mean of ``target`` is computed on each inner training part and
    the inner means are averaged into one category -> value map. That map
    encodes the outer out-of-fold rows, so no row is encoded with its own
    target. Categories missing from a part fall back to the target mean of the
    current outer in-fold rows.

    Note: this notebook cell defines ``impact_coding`` twice with the same
    behaviour; ``encode_target_cv`` binds this first definition as its default
    ``impact_coder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``feature`` and ``target``.
    feature : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric target column.
    n_folds, n_inner_folds : int
        Number of outer / inner ``KFold`` splits (both shuffled, seed 8888).

    Returns
    -------
    impact_coded : pandas.Series
        Out-of-fold encoding for every row of ``data``, indexed by the rows'
        own index labels (rows appear in fold order, not in frame order).
    mapping : pandas.Series
        Category -> mean of all inner-fold means, for encoding unseen frames.
    default : float
        Global mean of ``target``, for categories absent from ``mapping``.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=8888)
    print('col:',feature)

    # Pieces are collected in lists and concatenated once; Series.append is
    # not available in pandas >= 2.
    impact_parts = []
    oof_mean_cv = None

    for split, (infold, oof) in enumerate(kf.split(data[feature], data[target])):
        # Fold indices are positional, hence .iloc throughout.
        tr_outer = data.iloc[infold]
        oof_default_inner_mean = tr_outer[target].mean()

        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True, random_state=8888)
        inner_means = [
            tr_outer.iloc[infold_inner].groupby(by=feature)[target].mean()
            for infold_inner, _ in kf_inner.split(tr_outer, tr_outer[target])
        ]

        # One column per inner fold, with labels unique across outer folds.
        # (Joining with integer suffixes yields duplicate labels once a second
        # outer fold is merged, which recent pandas versions reject.)
        inner_oof_mean_cv = pd.concat(
            inner_means, axis=1, sort=True,
            keys=['fold{}_{}'.format(split, i) for i in range(len(inner_means))],
        ).fillna(oof_default_inner_mean)

        # Mean over the inner folds: the map applied to this fold's OOF rows.
        inner_oof_mean_cv_map = inner_oof_mean_cv.mean(axis=1)

        # Accumulate the global mapping; gaps opened by categories that are
        # new (or missing) in this fold get this fold's local mean.
        if oof_mean_cv is None:
            oof_mean_cv = inner_oof_mean_cv
        else:
            oof_mean_cv = pd.concat([oof_mean_cv, inner_oof_mean_cv], axis=1, sort=True)
        oof_mean_cv = oof_mean_cv.fillna(oof_default_inner_mean)

        feature_mean = (data.iloc[oof][feature]
                        .map(inner_oof_mean_cv_map)
                        .fillna(oof_default_inner_mean))
        impact_parts.append(feature_mean)

    impact_coded = pd.concat(impact_parts)
    oof_default_mean = data[target].mean()  # global mean used as the final fallback
    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean



def encode_target_cv(tr, val, test, targetcolname, categ_variables, impact_coder=impact_coding,
                    n_folds=20, n_inner_folds=10):
    """Target-encode each column in ``categ_variables`` independently.

    For every column the out-of-fold encoding returned by ``impact_coder`` is
    stored on a copy of ``tr``; ``val`` and ``test`` are encoded in place with
    the returned category map, falling back to the returned default for
    categories the map does not contain. Output columns are named by
    ``getenccolname(column, 'mean')``.

    Parameters
    ----------
    tr, val, test : pandas.DataFrame
        Training, validation and test frames. ``tr`` is copied; ``val`` and
        ``test`` receive the new columns directly.
    targetcolname : str
        Target column in ``tr``.
    categ_variables : iterable of str
        Columns to encode.
    impact_coder : callable
        Called as ``impact_coder(frame, column, targetcolname, n_folds=...,
        n_inner_folds=...)`` and expected to return
        ``(encoded_series, category_map, default_value)``.
    n_folds, n_inner_folds : int
        Forwarded to ``impact_coder``.

    Returns
    -------
    tuple
        ``(encoded copy of tr, val, test)``.
    """
    encoded_train = tr.copy()

    category_maps = {}
    fallbacks = {}
    for cat_col in categ_variables:
        out_col = getenccolname(cat_col,'mean')

        oof_codes, mapping, fallback = impact_coder(encoded_train, cat_col,
                                                    targetcolname,
                                                    n_folds=n_folds,
                                                    n_inner_folds=n_inner_folds)
        encoded_train.loc[:, out_col] = oof_codes
        category_maps[cat_col] = mapping
        fallbacks[cat_col] = fallback

        # Unseen categories map to NaN and are replaced by the fallback value.
        val.loc[:, out_col] = val[cat_col].map(mapping).fillna(fallback)
        test.loc[:, out_col] = test[cat_col].map(mapping).fillna(fallback)

    return encoded_train,val,test


def impact_coding(data, feature, target, n_folds=20, n_inner_folds=10):
    """Nested-CV mean target encoding ("impact coding") of one column.

    ``data`` is split into ``n_folds`` outer folds. For every outer fold the
    in-fold rows are split again into ``n_inner_folds`` inner folds; the
    per-category mean of ``target`` is computed on each inner training part and
    the inner means are averaged into one category -> value map. That map
    encodes the outer out-of-fold rows, so no row is encoded with its own
    target. Categories missing from a part fall back to the target mean of the
    current outer in-fold rows.

    Note: this is the second definition of ``impact_coding`` in this notebook
    cell (same behaviour as the one above it); it is the one that stays bound
    to the name afterwards and the one ``encode_target_cv_group`` uses as its
    default ``impact_coder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``feature`` and ``target``.
    feature : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric target column.
    n_folds, n_inner_folds : int
        Number of outer / inner ``KFold`` splits (both shuffled, seed 8888).

    Returns
    -------
    impact_coded : pandas.Series
        Out-of-fold encoding for every row of ``data``, indexed by the rows'
        own index labels (rows appear in fold order, not in frame order).
    mapping : pandas.Series
        Category -> mean of all inner-fold means, for encoding unseen frames.
    default : float
        Global mean of ``target``, for categories absent from ``mapping``.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=8888)
    print('col:',feature)

    # Pieces are collected in lists and concatenated once; Series.append is
    # not available in pandas >= 2.
    impact_parts = []
    oof_mean_cv = None

    for split, (infold, oof) in enumerate(kf.split(data[feature], data[target])):
        # Fold indices are positional, hence .iloc throughout.
        tr_outer = data.iloc[infold]
        oof_default_inner_mean = tr_outer[target].mean()

        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True, random_state=8888)
        inner_means = [
            tr_outer.iloc[infold_inner].groupby(by=feature)[target].mean()
            for infold_inner, _ in kf_inner.split(tr_outer, tr_outer[target])
        ]

        # One column per inner fold, with labels unique across outer folds.
        # (Joining with integer suffixes yields duplicate labels once a second
        # outer fold is merged, which recent pandas versions reject.)
        inner_oof_mean_cv = pd.concat(
            inner_means, axis=1, sort=True,
            keys=['fold{}_{}'.format(split, i) for i in range(len(inner_means))],
        ).fillna(oof_default_inner_mean)

        # Mean over the inner folds: the map applied to this fold's OOF rows.
        inner_oof_mean_cv_map = inner_oof_mean_cv.mean(axis=1)

        # Accumulate the global mapping; gaps opened by categories that are
        # new (or missing) in this fold get this fold's local mean.
        if oof_mean_cv is None:
            oof_mean_cv = inner_oof_mean_cv
        else:
            oof_mean_cv = pd.concat([oof_mean_cv, inner_oof_mean_cv], axis=1, sort=True)
        oof_mean_cv = oof_mean_cv.fillna(oof_default_inner_mean)

        feature_mean = (data.iloc[oof][feature]
                        .map(inner_oof_mean_cv_map)
                        .fillna(oof_default_inner_mean))
        impact_parts.append(feature_mean)

    impact_coded = pd.concat(impact_parts)
    oof_default_mean = data[target].mean()  # global mean used as the final fallback
    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean



def encode_target_cv_group(tr, val, test, targetcolname, categ_variables, impact_coder=impact_coding,
                    n_folds=20, n_inner_folds=10):
    """Target-encode the *combination* of ``categ_variables`` as one group.

    The values of all columns in ``categ_variables`` are joined (as strings,
    separated by ``'_'``) into a single composite key per row. That key is
    impact-coded once and the result is stored in ``'targetenc_groupmean'``:
    out-of-fold on a copy of ``tr``, and via the learned key -> value map (with
    the returned default for unseen keys) on ``val`` and ``test`` in place.

    Previously the whole column list was handed to ``impact_coder`` once per
    column while ``val``/``test`` were mapped by a single column, so the keys of
    the learned map never matched the values being looked up.

    Parameters
    ----------
    tr, val, test : pandas.DataFrame
        Training, validation and test frames. ``tr`` is copied; ``val`` and
        ``test`` receive the new column directly.
    targetcolname : str
        Target column in ``tr``.
    categ_variables : str or iterable of str
        Column(s) that together define the group.
    impact_coder : callable
        Called as ``impact_coder(frame, column, targetcolname, n_folds=...,
        n_inner_folds=...)`` and expected to return
        ``(encoded_series, category_map, default_value)``.
    n_folds, n_inner_folds : int
        Forwarded to ``impact_coder``.

    Returns
    -------
    tuple
        ``(encoded copy of tr, val, test)``.
    """
    enccol_mean = 'targetenc_groupmean'
    key_col = '__targetenc_groupkey__'  # temporary column, removed before returning

    if isinstance(categ_variables, str):
        group_cols = [categ_variables]
    else:
        group_cols = list(categ_variables)

    def _group_key(frame):
        """Return the '_'-joined string key of ``group_cols`` for every row."""
        parts = [frame[c].astype('str') for c in group_cols]
        if len(parts) == 1:
            return parts[0]
        return parts[0].str.cat(parts[1:], sep='_')

    train_target = tr.copy()
    if not group_cols:
        # No grouping columns: nothing to encode.
        return train_target,val,test

    train_target[key_col] = _group_key(train_target)
    impact_coded, code_map, default_value = impact_coder(train_target, key_col,
                                                         targetcolname,
                                                         n_folds=n_folds,
                                                         n_inner_folds=n_inner_folds)
    train_target.loc[:, enccol_mean] = impact_coded
    del train_target[key_col]

    val.loc[:, enccol_mean] = _group_key(val).map(code_map).fillna(default_value)
    test.loc[:, enccol_mean] = _group_key(test).map(code_map).fillna(default_value)

    return train_target,val,test

In [9]:
def gen_targetencode(train,target,test,n_splits,catcolnames,targetcol):
    """Generate per-fold target-encoding features and write them to CSV files.

    ``train`` is split with a shuffled ``KFold`` (seed 4590). For each fold the
    training part is encoded out-of-fold with ``encode_target_cv`` (5 outer / 2
    inner folds), the validation part and ``test`` are encoded with the learned
    maps, and row-wise mean / std / min / max over the resulting ``targetenc``
    columns are added. All columns whose name contains ``'targetenc'`` are then
    saved to ``train_targetenc_feats<k>.csv``, ``val_targetenc_feats<k>.csv``
    and ``test_targetenc_feats<k>.csv`` in the working directory, indexed by
    the original row labels.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``catcolnames``; ``train`` must also contain
        ``targetcol``. Existing encoding columns are dropped from ``test`` in
        place on the first fold.
    target : pandas.Series
        Only passed to ``KFold.split`` alongside ``train``.
    n_splits : int
        Number of folds.
    catcolnames : list of str
        Columns to target-encode.
    targetcol : str
        Name of the target column inside ``train``.

    Returns
    -------
    None
    """
    started = time.time()
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=4590)

    # The encoders work on reset indexes; remember the labels to restore them.
    test_index = test.index
    banner = '******************************************************'

    for fold_, (trn_idx, val_idx) in enumerate(splitter.split(train.values, target.values)):
        print(banner)
        print("FOLD  ---  {}".format(fold_))
        print(banner)

        tr, val = train.iloc[trn_idx], train.iloc[val_idx]
        tr_index, val_index = tr.index, val.index

        # Clear encoding columns left over from a previous fold / run.
        tr,val,test = droptargetenccols(tr,val,test)

        tr,val,test = encode_target_cv(tr.reset_index(drop=True), val.reset_index(drop=True),
                                       test.reset_index(drop=True),
                                       targetcol, catcolnames,
                                       n_folds=5, n_inner_folds=2
                                      )

        tr.index, val.index, test.index = tr_index, val_index, test_index

        # Row-wise summaries over the per-column encodings. The column list is
        # fixed up front so the summaries do not feed into each other.
        targetenccols = [col for col in tr.columns if 'targetenc' in col]
        for stat in ('mean', 'std', 'min', 'max'):
            for frame in (tr, val, test):
                frame['targetenc_'+stat] = getattr(frame[targetenccols], stat)(axis=1)

        enc_cols = [col for col in tr.columns if 'targetenc' in col]
        print('enc cols:',enc_cols)
        print('save encoding feats...')

        # One file per split and fold.
        for prefix, frame in (('train', tr), ('val', val), ('test', test)):
            frame[enc_cols].to_csv(prefix+'_targetenc_feats'+str(fold_)+'.csv')

    print('Target Enc Execution Time:',time.time()-started)

In [10]:
# %%time
# #generate group column for target encoding
# for df in [train,test]:
#     df['groupcol'] = df['resort_id'].astype('str').str.cat([df['persontravellingid'].astype('str'),
#                             df['main_product_code'].astype('str'),
#                             df['room_type_booked_code'].astype('str'),
#                             df['state_code_residence'].astype('str')],
#                             sep='_')

In [11]:
# print(train['groupcol'].nunique())
# print(test['groupcol'].nunique())

# test_val = test['groupcol'].unique()
# train_val = train['groupcol'].unique()

# test_m_train = set(test_val).difference(train_val)
# print(len(test_m_train))
# print(test[test['groupcol'].isin(list(test_m_train))].shape)

In [12]:
# temp=train['groupcol'].value_counts()
# print(temp.head(10))
# print(temp.tail(5000))

In [13]:
# n_splits = 5
# catcolnames= ['groupcol']
# # catcolnames= ['resort_id', 'state_code_residence','state_code_resort']
# # othercatcols =['channel_code','main_product_code','resort_region_code',
# #               'resort_type_code','room_type_booked_code','season_holidayed_code',
# #               'member_age_buckets','cluster_code','reservationstatusid_code']
# # catcolnames +=othercatcols
# gen_targetencode(train,target,test,n_splits,catcolnames,targetcol)

In [14]:
def getenc(n_folds=5, path=''):
    """Load the per-fold target-encoding feature files written to CSV.

    For each fold ``i`` in ``range(n_folds)`` the files
    ``train_targetenc_feats<i>.csv``, ``val_targetenc_feats<i>.csv`` and
    ``test_targetenc_feats<i>.csv`` are read from ``path`` with their first
    column as the index.

    Parameters
    ----------
    n_folds : int, default 5
        Number of folds to read; must match the number of folds that were saved.
    path : str, default ''
        Directory containing the files ('' means the working directory).

    Returns
    -------
    (list, list, list)
        ``tr_encs, val_encs, test_encs`` - one DataFrame per fold in each list.
    """
    tr_encs = []
    val_encs = []
    test_encs = []

    for i in range(n_folds):
        for prefix, bucket in (('train', tr_encs), ('val', val_encs), ('test', test_encs)):
            fname = os.path.join(path, prefix+'_targetenc_feats'+str(i)+'.csv')
            bucket.append(pd.read_csv(fname,index_col=0))
        print('read complete for:',i)

    return tr_encs,val_encs,test_encs

In [15]:
# tr_encs, val_encs,test_encs = getenc()
# print(tr_encs[0].shape)
# print(val_encs[0].shape)
# print(test_encs[0].shape)

In [16]:
def runlgb(train,test,target,param,cur_features,n_splits=None,num_round=None):
    """Train LightGBM with K-fold CV and return importances and predictions.

    ``train`` is split with a shuffled ``KFold`` (seed 4590). A booster is
    trained per fold on ``cur_features`` with early stopping on the held-out
    part (patience 200 rounds, log every 500). Out-of-fold predictions are
    collected for ``train`` and the per-fold ``test`` predictions are averaged.
    The per-fold best validation scores and the overall out-of-fold RMSE are
    printed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column in ``cur_features``.
    target : pandas.Series
        Training target, positionally aligned with ``train``.
    param : dict
        LightGBM parameters; ``param['metric']`` is used to look up the best
        validation score of each fold.
    cur_features : list of str
        Feature columns fed to the model.
    n_splits : int, optional
        Number of CV folds. When omitted, the notebook-level global
        ``n_splits`` is used (the behaviour before this parameter existed).
    num_round : int, optional
        Maximum number of boosting rounds. When omitted, the notebook-level
        global ``num_round`` is used.

    Returns
    -------
    fold_importance_df : pandas.DataFrame
        Columns ``feature`` and ``importance`` (mean of
        ``Booster.feature_importance()`` over the folds).
    predictions : numpy.ndarray
        Fold-averaged predictions for ``test``.
    oof : numpy.ndarray
        Out-of-fold predictions for ``train``.
    """
    # Backward compatibility: earlier callers set these as notebook globals
    # instead of passing them in.
    if n_splits is None:
        n_splits = globals()['n_splits']
    if num_round is None:
        num_round = globals()['num_round']

    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    valid_scores =[]
    importance = np.zeros(len(cur_features))

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=4590)

    # KFold only needs the number of rows, so the frame is passed directly
    # rather than materialising `train.values`.
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
        print()
        print("fold n°{}".format(fold_))

        tr = train.iloc[trn_idx]
        val = train.iloc[val_idx]
        y_val = target.iloc[val_idx]
        y_tr = target.iloc[trn_idx]

        trn_data = lgb.Dataset(tr[cur_features], label=y_tr)
        val_data = lgb.Dataset(val[cur_features], label=y_val)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect the
        # equivalent callbacks instead - confirm against the installed version.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [val_data],
                        verbose_eval=500, early_stopping_rounds = 200)

        # Score the held-out rows with the best iteration found by early stopping.
        oof[val_idx] = clf.predict(val[cur_features], num_iteration=clf.best_iteration)

        importance += clf.feature_importance() / n_splits
        valid_scores.append(clf.best_score['valid_0'][param['metric']])
        predictions += clf.predict(test[cur_features], num_iteration=clf.best_iteration) / n_splits

    fold_importance_df = pd.DataFrame({"feature": list(cur_features),
                                       "importance": importance})

    print('valid scores:',valid_scores)
    print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

    return fold_importance_df,predictions,oof

In [17]:
# param = {'objective':'regression',
#          'num_leaves': 31,
#          'min_data_in_leaf': 25,
#          'max_depth': 7,
#          'learning_rate': 0.01,
#          'lambda_l1':0.13,
#          "boosting": "gbdt",
#          "feature_fraction":0.85,
#          'bagging_freq':8,
#          "bagging_fraction": 0.9 ,
#          "metric": 'rmse',
#          "verbosity": -1,
#          'n_estimators': 10000,
#          "random_state": 2333}

In [18]:
# param = {'colsample_bytree': 0.7435507072475522,
#    'min_child_samples': 200,
#    'num_leaves': 33,
#    'reg_alpha': 0.30232162973796833,
#    'reg_lambda': 0.2679669294245453,
#    'subsample': 0.807559171733078,
#    'subsample_for_bin': 130000,
#    'learning_rate': 0.01,
#    'boosting': 'gbdt',
#    'bagging_seed': 2018,
#    'bagging_freq': 2,
#    'min_data_in_bin': 100,
#    'n_estimators': 10000,
#    'objective': 'regression',
#    'metric': 'rmse',
#    'random_state': 2333,
#    'max_depth': 15,
#    'scale_pos_weight': 1}

In [19]:
# LightGBM parameter set used for the final run.
# `n_estimators` is overwritten in the training cell before the model is fitted.
param = dict(
    colsample_bytree=0.6003677943112755,
    min_child_samples=200,
    num_leaves=37,
    reg_alpha=0.4673020868826969,
    reg_lambda=0.327996673259906,
    subsample=0.9327220260448092,
    subsample_for_bin=80000,
    learning_rate=0.01,
    boosting='gbdt',
    bagging_seed=2018,
    bagging_freq=2,
    min_data_in_bin=100,
    n_estimators=10000,
    objective='regression',
    metric='rmse',
    random_state=2333,
    max_depth=15,
)

In [20]:
# #Model category cols
# # cat_cols = ['booking_type_code', 'channel_code', 'cluster_code', 'main_product_code', 
# #             'member_age_buckets','persontravellingid', 'reservationstatusid_code', 'resort_id', 
# #             'resort_region_code', 'resort_type_code', 'room_type_booked_code',
# #            'season_holidayed_code', 'state_code_residence', 'state_code_resort',
# # #            'booking_month', 'booking_year'
# #            ]

# cat_cols = ['resort_id']

# for col in cat_cols:
#     for df in [train,test]:
#         df[col]=df[col].astype('category')

In [21]:
# train.dtypes

In [22]:
# train['memberid'].head(30)

In [23]:
def fill_fewvisitmembers(col,data,max_visits=5):
    """Overwrite ``col`` with its global mean for members with few rows.

    Rows whose ``member_size`` is at most ``max_visits`` get ``data[col]``
    replaced by the mean of ``col`` over the whole frame (computed before any
    value is replaced). ``data`` is modified in place.

    Parameters
    ----------
    col : str
        Column to overwrite.
    data : pandas.DataFrame
        Frame containing ``col`` and ``member_size``.
    max_visits : int, default 5
        Largest ``member_size`` that is still replaced by the global mean.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    global_mean = data[col].mean()
    few_visits = data['member_size']<=max_visits
    data.loc[few_visits,col]=global_mean
    return data

# Member-history features, computed separately inside each frame.
for df in [train,test]:
    # Sort a copy by check-in date so cumcount() numbers a member's bookings in
    # chronological order; results are assigned back to `df` by index label,
    # so `df` itself keeps its row order.
    cur_df = df.sort_values(['checkin_date'])
    by_member = cur_df.groupby(['memberid'])
    by_member_resort = cur_df.groupby(['memberid','resort_id'])

    # 0-based running count of earlier bookings by the member (overall / at this resort).
    df['member_repeat'] = by_member['checkin_date'].cumcount()
    df['member_resort_repeat'] = by_member_resort['checkin_date'].cumcount()

    # Per-member and per-member-and-resort averages of the stay size columns.
    for value_col in ('roomnights', 'numberofadults', 'total_pax'):
        df['member_'+value_col+'_mean'] = by_member[value_col].transform('mean')
        df['member_resort_'+value_col+'_mean'] = by_member_resort[value_col].transform('mean')



In [24]:
# print(train['member_size'].describe())
# print(train['member_roomnights_std'].describe())
# # print(train['member_roomnights_std'].describe())
# # global_mean = train['member_roomnights_std'].mean()
# # print('global mean shape:',train[train['member_roomnights_std']==global_mean].shape)

# # print(train[train['member_roomnights_std'].isnull()].shape)
# print(train.loc[train['member_size']==1,'member_roomnights_std'].shape)

In [25]:
# train[['memberid','member_roomnights_mean','roomnights','member_resort_roomnights_mean','resort_id']].head(20)

In [26]:
# Columns that must not be fed to the model: identifiers, raw dates, the target
# and optional helper columns (names that are absent from `train` are harmless).
exclude_cols = (
    ['reservation_id','memberid']
    + ['booking_date', 'checkin_date', 'checkout_date']
    + ['groupcol', 'group_count', 'state_same']
    + ['member_roomnights_std','member_numberofadults_std','member_total_pax_std']
    + ['member_size']
    + [targetcol]
)

# Every remaining column of `train`, in frame order, is a model feature.
features = train.columns[~train.columns.isin(exclude_cols)].tolist()
print('Length of features:',len(features))
print()
print(features)

Length of features: 38

['booking_type_code', 'channel_code', 'cluster_code', 'main_product_code', 'member_age_buckets', 'numberofadults', 'numberofchildren', 'persontravellingid', 'reservationstatusid_code', 'resort_id', 'resort_region_code', 'resort_type_code', 'room_type_booked_code', 'roomnights', 'season_holidayed_code', 'state_code_residence', 'state_code_resort', 'total_pax', 'checkin_month', 'checkin_year', 'checkin_dayofweek', 'checkin_weekend', 'checkout_month', 'checkout_year', 'checkout_dayofweek', 'checkout_weekend', 'booking_month', 'booking_year', 'booking_dayofweek', 'booking_weekend', 'member_repeat', 'member_resort_repeat', 'member_roomnights_mean', 'member_resort_roomnights_mean', 'member_numberofadults_mean', 'member_resort_numberofadults_mean', 'member_total_pax_mean', 'member_resort_total_pax_mean']


In [27]:
# corr_feats = features.copy()
# corr_feats +=[targetcol]
# corr_feats.remove('targetenc_groupcol')
# train[corr_feats].corr()

In [28]:
%%time
# CV settings read by runlgb as notebook-level globals.
n_splits=5
num_round = 8000
# Keep the parameter dict in sync with the round limit passed to lgb.train.
param['n_estimators'] = num_round
fold_importance_df,predictions,oof = runlgb(train,test,target,param,features)


fold n°0
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's rmse: 1.01213
[1000]	valid_0's rmse: 0.995839
[1500]	valid_0's rmse: 0.990182
[2000]	valid_0's rmse: 0.987472
[2500]	valid_0's rmse: 0.985744
[3000]	valid_0's rmse: 0.984387
[3500]	valid_0's rmse: 0.983302
[4000]	valid_0's rmse: 0.982511
[4500]	valid_0's rmse: 0.981959
[5000]	valid_0's rmse: 0.98152
[5500]	valid_0's rmse: 0.981319
[6000]	valid_0's rmse: 0.981163
[6500]	valid_0's rmse: 0.980921
[7000]	valid_0's rmse: 0.980738
[7500]	valid_0's rmse: 0.980642
[8000]	valid_0's rmse: 0.980564
Did not meet early stopping. Best iteration is:
[7987]	valid_0's rmse: 0.980562

fold n°1
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's rmse: 1.01457
[1000]	valid_0's rmse: 0.996124
[1500]	valid_0's rmse: 0.990772
[2000]	valid_0's rmse: 0.988042
[2500]	valid_0's rmse: 0.986613
[3000]	valid_0's rmse: 0.985663
[3500]	valid_0's rmse: 0.984747
[4000]	valid_0's rmse: 0.983986
[4500]	vali

In [29]:
# Add each feature's share of the total importance and list the table alphabetically.
fold_importance_df = (
    fold_importance_df
    .assign(ratio=lambda imp: imp['importance'] / imp['importance'].sum())
    .sort_values('feature',ascending=True)
)
fold_importance_df

Unnamed: 0,feature,importance,ratio
28,booking_dayofweek,7469.0,0.026052
26,booking_month,9158.4,0.031944
0,booking_type_code,1145.8,0.003996
29,booking_weekend,413.8,0.001443
27,booking_year,4442.4,0.015495
1,channel_code,4821.6,0.016818
20,checkin_dayofweek,10899.8,0.038018
18,checkin_month,7770.2,0.027102
21,checkin_weekend,661.2,0.002306
19,checkin_year,4183.8,0.014593


In [30]:
# Submission file: one row per test reservation with the fold-averaged prediction.
sub_df = pd.DataFrame({
    "reservation_id": test["reservation_id"].values,
    targetcol: predictions,
})
sub_df.to_csv("submission_membergroupcount.csv", index=False)
# Keep the out-of-fold predictions on disk as well.
np.save('oof_membergroupcount.npy',oof)