In [1]:
#basic tools 
import os
import numpy as np
import pandas as pd
import pickle

#tuning hyperparameters
from bayes_opt import BayesianOptimization  

#building models
import lightgbm as lgbm
from sklearn.model_selection import train_test_split

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
df_train_validation = pd.read_csv("final_train_val.csv", low_memory=False, index_col="id")
df_test = pd.read_csv("final_test.csv", low_memory=False, index_col="id")

  mask |= (ar1 == a)


In [4]:
df_train_validation = reduce_mem_usage(df_train_validation) 
df_test = reduce_mem_usage(df_test) 

Mem. usage decreased to 174.72 Mb (73.8% reduction)
Mem. usage decreased to 30.90 Mb (73.8% reduction)


In [5]:
df_train_validation

Unnamed: 0_level_0,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,TAXI_OUT,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,...,AIRLINE_NK,AIRLINE_OO,AIRLINE_UA,AIRLINE_US,AIRLINE_VX,AIRLINE_WN,scaled_DEPARTURE_TIME,scaled_WHEELS_OFF,scaled_SCHEDULED_TIME,scaled_DISTANCE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,4,98,5,21.0,430,-22.0,61.187500,-150.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.251953,-1.628906,0.921387,1.161133
1,1,1,4,2336,10,12.0,750,-9.0,33.937500,-118.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.626953,-1.629883,1.763672,2.443359
2,1,1,4,840,20,16.0,806,5.0,37.625000,-122.3750,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.607422,-1.605469,1.831055,2.394531
3,1,1,4,258,20,15.0,805,-9.0,33.937500,-118.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.610352,-1.610352,1.820312,2.460938
4,1,1,4,135,25,11.0,320,-21.0,47.437500,-122.3125,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.599609,-1.604492,1.258789,1.161133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,6,30,2,1180,2359,11.0,600,-8.0,33.937500,-118.4375,...,0.0,0.0,1.0,0.0,0.0,0.0,1.256836,-1.635742,1.326172,1.591797
2818549,6,30,2,1192,2359,11.0,520,79.0,33.937500,-118.4375,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.471680,-1.475586,0.876465,1.061523
2818550,6,30,2,1480,2359,8.0,608,107.0,45.593750,-122.6250,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.443359,-1.402344,1.416016,1.708984
2818551,6,30,2,1637,2359,12.0,609,9.0,47.437500,-122.3125,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.614258,-1.617188,1.426758,1.780273


In [6]:
# Split: for LGBM
df_train, df_validation = train_test_split(df_train_validation, test_size=0.20, random_state = 42)
X_train, y_train = df_train.drop("ARRIVAL_DELAY", axis=1), df_train["ARRIVAL_DELAY"]
X_val, y_val = df_validation.drop("ARRIVAL_DELAY", axis=1), df_validation["ARRIVAL_DELAY"]
X_test = df_test

In [13]:
def bayes_parameter_opt_lgbm(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6, output_process=False):
    # prepare data
    train_data = lgbm.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgbm_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf, min_sum_hessian_in_leaf, subsample):
        params = {'application':'regression_l2', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params['num_leaves'] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        
        cv_result = lgbm.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval=200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbmBO = BayesianOptimization(lgbm_eval, {'learning_rate': (0.001, 1.0),
                                            'num_leaves': (2, 2**10),
                                            'feature_fraction': (0.1, 1),
                                            'bagging_fraction': (0.1, 1),
                                            'max_depth': (2, 10),
                                            'max_bin':(10,200),
                                            'min_data_in_leaf': (10, 200),
                                            'min_sum_hessian_in_leaf':(0,400),
                                            'subsample': (0.01, 1.0)}, random_state=200)

    
    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    
    lgbmBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbmBO.res)):
        model_auc.append(lgbmBO.res[model]['target'])
    
    # return best parameters
    return lgbmBO.res[pd.Series(model_auc).idxmax()]['target'],lgbmBO.res[pd.Series(model_auc).idxmax()]['params']

In [14]:
opt_params = bayes_parameter_opt_lgbm(X_train, y_train, init_round=5, opt_round=10, n_folds=3, random_seed=6)
print(opt_params)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 998
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 998
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 998
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361




[LightGBM] [Info] Start training from score 5.988886
| [0m 1       [0m | [0m 0.8864  [0m | [0m 0.9529  [0m | [0m 0.2539  [0m | [0m 2.973   [0m | [0m 91.38   [0m | [0m 76.89   [0m | [0m 10.54   [0m | [0m 143.0   [0m | [0m 364.1   [0m | [0m 0.4615  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 459
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 459
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 459
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29




[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [0m 2       [0m | [0m 0.8823  [0m | [0m 0.9836  [0m | [0m 0.8306  [0m | [0m 4.93    [0m | [0m 185.4   [0m | [0m 31.76   [0m | [0m 170.8   [0m | [0m 48.49   [0m | [0m 314.7   [0m | [0m 0.258   [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1127
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1127
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1127
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
[LightGBM] [Info] Start training from score 5.991501




[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [95m 3       [0m | [95m 0.9128  [0m | [95m 0.1864  [0m | [95m 0.8991  [0m | [95m 4.13    [0m | [95m 108.5   [0m | [95m 87.6    [0m | [95m 119.9   [0m | [95m 180.0   [0m | [95m 272.7   [0m | [95m 0.4252  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 986
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 986
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 986
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29




[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [0m 4       [0m | [0m 0.8754  [0m | [0m 0.6764  [0m | [0m 0.5978  [0m | [0m 0.5281  [0m | [0m 193.5   [0m | [0m 76.35   [0m | [0m 139.1   [0m | [0m 131.7   [0m | [0m 12.53   [0m | [0m 0.8056  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.






You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29




[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [0m 5       [0m | [0m 0.8221  [0m | [0m 0.9755  [0m | [0m 0.3046  [0m | [0m 3.246   [0m | [0m 60.46   [0m | [0m 20.39   [0m | [0m 89.62   [0m | [0m 266.4   [0m | [0m 207.9   [0m | [0m 0.8559  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.






You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 567
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 567
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 567
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361




[LightGBM] [Info] Start training from score 5.988886
| [0m 6       [0m | [0m 0.8604  [0m | [0m 0.9346  [0m | [0m 0.284   [0m | [0m 4.531   [0m | [0m 154.8   [0m | [0m 41.24   [0m | [0m 146.6   [0m | [0m 130.3   [0m | [0m 64.73   [0m | [0m 0.04065 [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.






You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 555
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29




[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [0m 7       [0m | [0m 0.8923  [0m | [0m 0.5922  [0m | [0m 0.4519  [0m | [0m 4.294   [0m | [0m 189.0   [0m | [0m 39.93   [0m | [0m 118.1   [0m | [0m 174.9   [0m | [0m 255.3   [0m | [0m 0.8433  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.






You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29




[LightGBM] [Info] Start training from score 5.991501
[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886
| [0m 8       [0m | [0m 0.9109  [0m | [0m 0.3203  [0m | [0m 0.7826  [0m | [0m 4.301   [0m | [0m 138.7   [0m | [0m 81.32   [0m | [0m 114.5   [0m | [0m 147.4   [0m | [0m 300.1   [0m | [0m 0.57    [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 1503228, number of used features: 29
[LightGBM] [Info] Start training from score 5.991501




[LightGBM] [Info] Start training from score 5.989361
[LightGBM] [Info] Start training from score 5.988886


KeyboardInterrupt: 

In [10]:
opt_params

(0.8962938098165422,
 {'bagging_fraction': 0.8,
  'feature_fraction': 0.95,
  'learning_rate': 1.0,
  'max_bin': 90.0,
  'max_depth': 30.0,
  'min_data_in_leaf': 20.0,
  'min_sum_hessian_in_leaf': 100.0,
  'num_leaves': 200.0,
  'subsample': 0.01})

In [25]:
with open('opt_params.pickle', 'wb') as f:
    pickle.dump(opt_params, f)