In [3]:
#basic tools 
import os
import numpy as np
import pandas as pd
import pickle

#tuning hyperparameters
from bayes_opt import BayesianOptimization  

#building models
import lightgbm as lgbm
from sklearn.model_selection import train_test_split

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
df_train_validation = pd.read_csv("final_train_val.csv", low_memory=False, index_col="id")
df_test = pd.read_csv("final_test.csv", low_memory=False, index_col="id")

  mask |= (ar1 == a)


In [6]:
df_train_validation = reduce_mem_usage(df_train_validation) 
df_test = reduce_mem_usage(df_test) 

Mem. usage decreased to 174.72 Mb (73.8% reduction)
Mem. usage decreased to 30.90 Mb (73.8% reduction)


In [7]:
df_train_validation

Unnamed: 0_level_0,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,TAXI_OUT,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,LATITUDE_origin,LONGITUDE_origin,...,AIRLINE_NK,AIRLINE_OO,AIRLINE_UA,AIRLINE_US,AIRLINE_VX,AIRLINE_WN,scaled_DEPARTURE_TIME,scaled_WHEELS_OFF,scaled_SCHEDULED_TIME,scaled_DISTANCE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,4,98,5,21.0,430,-22.0,61.187500,-150.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.251953,-1.628906,0.921387,1.161133
1,1,1,4,2336,10,12.0,750,-9.0,33.937500,-118.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.626953,-1.629883,1.763672,2.443359
2,1,1,4,840,20,16.0,806,5.0,37.625000,-122.3750,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.607422,-1.605469,1.831055,2.394531
3,1,1,4,258,20,15.0,805,-9.0,33.937500,-118.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.610352,-1.610352,1.820312,2.460938
4,1,1,4,135,25,11.0,320,-21.0,47.437500,-122.3125,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.599609,-1.604492,1.258789,1.161133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,6,30,2,1180,2359,11.0,600,-8.0,33.937500,-118.4375,...,0.0,0.0,1.0,0.0,0.0,0.0,1.256836,-1.635742,1.326172,1.591797
2818549,6,30,2,1192,2359,11.0,520,79.0,33.937500,-118.4375,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.471680,-1.475586,0.876465,1.061523
2818550,6,30,2,1480,2359,8.0,608,107.0,45.593750,-122.6250,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.443359,-1.402344,1.416016,1.708984
2818551,6,30,2,1637,2359,12.0,609,9.0,47.437500,-122.3125,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.614258,-1.617188,1.426758,1.780273


In [8]:
# Split: for LGBM
df_train, df_validation = train_test_split(df_train_validation, test_size=0.20, random_state = 42)
X_train, y_train = df_train.drop("ARRIVAL_DELAY", axis=1), df_train["ARRIVAL_DELAY"]
X_val, y_val = df_validation.drop("ARRIVAL_DELAY", axis=1), df_validation["ARRIVAL_DELAY"]
X_test = df_test

In [41]:
def bayes_parameter_opt_lgbm(X, y, init_round=15, opt_round=25, n_folds=10, random_seed=6, output_process=False):
    # prepare data
    train_data = lgbm.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgbm_eval(learning_rate, num_leaves, num_iterations, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf, min_sum_hessian_in_leaf):
        params = {'application':'regression_l2', 'metric':'mse', 'early_stopping_round': 3, 'verbosity': -1}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params['num_leaves'] = int(round(num_leaves))
        params['num_iterations'] = int(round(num_iterations))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        
        cv_result = lgbm.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, metrics=['l2'])
        return max(cv_result['l2-mean'])
     
    lgbmBO = BayesianOptimization(lgbm_eval, {
        'learning_rate': (0.01, 1.0),
        'num_leaves': (4, 800),
        'num_iterations': (10, 400),
        'feature_fraction': (0.1, 1.0),
        'bagging_fraction': (0.1, 1.0),
        'max_depth': (2, 10),
        'max_bin':(10,200),
        'min_data_in_leaf': (10, 400),
        'min_sum_hessian_in_leaf':(0,400),
    }, random_state=4242)

    
    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    
    lgbmBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_mse=[]
    for model in range(len(lgbmBO.res)):
        model_mse.append(lgbmBO.res[model]['target'])
    
    # return best parameters
    return lgbmBO.res[pd.Series(model_mse).idxmax()]['target'],lgbmBO.res[pd.Series(model_mse).idxmax()]['params']

In [42]:
opt_params = bayes_parameter_opt_lgbm(X_train, y_train, init_round=100, opt_round=100, n_folds=10, random_seed=6)
print(opt_params)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_it... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [18]:
?BayesianOptimization.maximize

[0;31mSignature:[0m
[0mBayesianOptimization[0m[0;34m.[0m[0mmaximize[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minit_points[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_iter[0m[0;34m=[0m[0;36m25[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0macq[0m[0;34m=[0m[0;34m'ucb'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkappa[0m[0;34m=[0m[0;36m2.576[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkappa_decay[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkappa_decay_delay[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mxi[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mgp_params[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Mazimize your function
[0;31mFile:[0m      ~/anaconda3/envs/flight/lib/python3.8/site-packages/bayes_opt/bayesian_optimization.py
[0;31mType

In [10]:
opt_params

(0.8962938098165422,
 {'bagging_fraction': 0.8,
  'feature_fraction': 0.95,
  'learning_rate': 1.0,
  'max_bin': 90.0,
  'max_depth': 30.0,
  'min_data_in_leaf': 20.0,
  'min_sum_hessian_in_leaf': 100.0,
  'num_leaves': 200.0,
  'subsample': 0.01})

In [25]:
with open('opt_params.pickle', 'wb') as f:
    pickle.dump(opt_params, f)