In [None]:
# !pip install xgboost

In [None]:
!pip install hyperopt

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
# from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import joblib

In [None]:
# Display the dataframe
pd.set_option('display.max_columns', 100)  # or 1000
pd.set_option('display.max_rows', 100)  # or 1000
pd.set_option('display.max_colwidth', 100)  # or 199

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Labelled rows (features + ARRIVAL_DELAY), indexed by the "id" column.
df_train_validation = pd.read_csv(
    "final2_train_val.csv",
    index_col="id",
    low_memory=False,
)
# Rows to predict, read with the same options.
df_test = pd.read_csv(
    "final2_test.csv",
    index_col="id",
    low_memory=False,
)

In [None]:
# Features are every column except the target; the target is ARRIVAL_DELAY.
X_train_val = df_train_validation.drop(columns="ARRIVAL_DELAY")
y_train_val = df_train_validation["ARRIVAL_DELAY"]
# The test frame is used as the feature matrix unchanged.
X_test = df_test

In [None]:
# Show the column labels of the labelled frame (ARRIVAL_DELAY included).
df_train_validation.columns

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
def missingValuesInfo(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name with two columns:
    ``Total`` (number of null cells) and ``Percent`` (that count as a
    percentage of ``len(df)``, rounded to two decimals). Rows are ordered by
    descending null count; columns without nulls are omitted.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = (null_counts / len(df) * 100).round(2)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    has_nulls = summary['Total'] > 0
    return summary.loc[has_nulls]

# Report which columns of the labelled frame contain nulls, and how many.
missingValuesInfo(df_train_validation)

In [None]:
# Hold out 20% of the labelled rows as a validation set for XGBoost
# (fixed random_state so the split is repeatable).
df_train, df_validation = train_test_split(
    df_train_validation,
    test_size=0.20,
    random_state=42,
)
X_train = df_train.drop(columns="ARRIVAL_DELAY")
y_train = df_train["ARRIVAL_DELAY"]
X_val = df_validation.drop(columns="ARRIVAL_DELAY")
y_val = df_validation["ARRIVAL_DELAY"]

In [None]:
# The test frame is used as the feature matrix as-is (no column is dropped).
X_test = df_test

## XGBoost Regressor

#### Without tuning: using preprocessing version 2

In [None]:
# Re-read the labelled data and rebuild the feature/target pair for this section.
df_train_validation = pd.read_csv(
    "final2_train_val.csv",
    index_col="id",
    low_memory=False,
)
X_train_val = df_train_validation.drop(columns="ARRIVAL_DELAY")
y_train_val = df_train_validation["ARRIVAL_DELAY"]

# Baseline: XGBoost with library defaults, fitted on all labelled rows.
xg_reg = xgb.XGBRegressor().fit(X_train_val, y_train_val)
joblib.dump(xg_reg, "xgreg-without-tuning.pkl")

In [None]:
X_test = pd.read_csv("final2_test.csv", low_memory=False, index_col="id")
eval_pred = xg_reg.predict(X_test)
# Key the predictions by the test set's own ids. Without an explicit index the
# frame is numbered 0..n-1 and those positions are what gets written as "id".
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

In [None]:
import matplotlib.pyplot as plt

# Pair each importance score with its feature name.
ft_importances = pd.Series(
    data=xg_reg.feature_importances_,
    index=X_train_val.columns,
)
print(ft_importances)

# Horizontal bar chart of the ten highest-scoring features.
ft_importances.nlargest(n=10).plot.barh()
plt.show()

### Hyperparameter tuning

In [None]:
# Hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Hyperopt search space: sampled distributions for the tuned parameters plus
# two fixed entries (number of trees and seed).
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

def objective(space):
    """Hyperopt objective: fit an XGBoost regressor for one sampled point and score it.

    Parameters
    ----------
    space : dict
        One sample from the search space. Must provide ``n_estimators``,
        ``max_depth``, ``gamma``, ``reg_alpha``, ``min_child_weight`` and
        ``colsample_bytree``; ``reg_lambda`` and ``seed`` are used when present.

    Returns
    -------
    dict
        ``{'loss': <validation MSE>, 'status': STATUS_OK}`` for ``fmin`` to minimise.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace.
    """
    reg = xgb.XGBRegressor(n_estimators=space['n_estimators'],
                           max_depth=int(space['max_depth']),
                           gamma=space['gamma'],
                           reg_alpha=int(space['reg_alpha']),
                           # Sampled but previously never forwarded to the model.
                           reg_lambda=space.get('reg_lambda'),
                           min_child_weight=int(space['min_child_weight']),
                           # Must stay a float: int() truncated every sample in
                           # [0.5, 1) to 0, which is not a usable column fraction.
                           colsample_bytree=float(space['colsample_bytree']),
                           random_state=space.get('seed'))

    # The validation pair is listed last in the evaluation sets.
    eval_set = [(X_train, y_train), (X_val, y_val)]

    # NOTE(review): recent xgboost releases expect `eval_metric` and
    # `early_stopping_rounds` on the constructor rather than on fit() --
    # confirm against the installed version.
    reg.fit(X_train, y_train, eval_set=eval_set, eval_metric='rmse',
            early_stopping_rounds=10, verbose=False)
    val_pred = reg.predict(X_val)
    mse = mean_squared_error(y_val, val_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Run 100 TPE-guided evaluations of `objective`, recording each one in `trials`.
trials = Trials()
best_hyperparams = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

# Print the result once, under its heading (it used to be printed twice).
# NOTE(review): quniform-sampled entries presumably come back as floats --
# cast them before reusing the dict as model arguments.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Hard-coded hyperparameters for the tuned model.
best_hyperparams = dict(
    colsample_bytree=0.8443443748974826,
    gamma=8.265472386809305,
    max_depth=14,
    min_child_weight=10,
    reg_alpha=164.0,
    reg_lambda=0.4112998083884938,
)

# Fit on the training split and report MSE on the validation split.
xg_reg = xgb.XGBRegressor(**best_hyperparams).fit(X_train, y_train)
val_pred = xg_reg.predict(X_val)
mse = mean_squared_error(y_val, val_pred)
print(mse)

# 100.04

In [None]:
# Same hard-coded hyperparameters, now used to refit on every labelled row.
best_hyperparams = dict(
    colsample_bytree=0.8443443748974826,
    gamma=8.265472386809305,
    max_depth=14,
    min_child_weight=10,
    reg_alpha=164.0,
    reg_lambda=0.4112998083884938,
)
X_train_val = df_train_validation.drop(columns="ARRIVAL_DELAY")
y_train_val = df_train_validation["ARRIVAL_DELAY"]
xg_reg = xgb.XGBRegressor(**best_hyperparams)
xg_reg.fit(X_train_val, y_train_val)

In [None]:
# Serialize the current `xg_reg` object to xgreg.pkl.
joblib.dump(xg_reg, "xgreg.pkl")

In [None]:
# Predict on the test features with the current `xg_reg`.
eval_pred = xg_reg.predict(X_test)

In [None]:
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

## Gradient Boosting Regressor

In [None]:
# Fresh 80/20 split for this model (random_state=0).
df_train, df_validation = train_test_split(df_train_validation, test_size=0.20, random_state = 0)
X_train, y_train = df_train.drop("ARRIVAL_DELAY", axis=1), df_train["ARRIVAL_DELAY"]
X_val, y_val = df_validation.drop("ARRIVAL_DELAY", axis=1), df_validation["ARRIVAL_DELAY"]

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
# Score the model fitted in this cell; the previous code predicted with
# `xg_reg`, so the printed MSE was not the gradient-boosting model's.
val_pred = gbr.predict(X_val)
mse = mean_squared_error(y_val, val_pred)
print(mse)

In [None]:
# Refit the same estimator object on X_train_val / y_train_val, then serialize it.
gbr.fit(X_train_val, y_train_val)
joblib.dump(gbr, "gradient_boosting_regressor.pkl")

In [None]:
eval_pred = gbr.predict(X_test)
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

## Light GBM

In [None]:
# Light GBM
# NOTE(review): `LGBMRegressor` is only imported in a commented-out line of the
# imports cell; that import has to be enabled for this cell to run.
lgbm = LGBMRegressor()
# Fit and score the LightGBM model itself; the previous code fitted and
# evaluated `xg_reg` here, leaving `lgbm` untrained.
lgbm.fit(X_train, y_train)
val_pred = lgbm.predict(X_val)
mse = mean_squared_error(y_val, val_pred)
print(mse)
# 176.54  (recorded while this cell still scored `xg_reg` -- re-run to refresh)

In [None]:
# NOTE(review): no test-set prediction is computed in this section before the
# export, so `eval_pred` still holds an earlier cell's output -- confirm this
# write is intended.
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

In [None]:
# NOTE(review): `grid` is not defined anywhere in the visible notebook and the
# fit below is commented out; this cell presumably relies on a search object
# from a removed cell -- restore its definition or delete the cell.
# grid.fit(np.array(X_train_val), np.array(y_train_val))
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

In [None]:
# NOTE(review): depends on `grid`, which no visible cell defines -- confirm
# where it is supposed to come from.
# Serialize `grid`, then predict on the test features passed as a NumPy array.
joblib.dump(grid, "stackingCV.pkl")
eval_pred = grid.predict(np.array(X_test))

In [None]:
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

## CatBoostRegressor

In [None]:
# !pip install catboost

In [None]:
import catboost as cb

In [None]:
# Labelled and test data from the "final4" files, indexed by "id".
flights_train_validation = pd.read_csv(
    "final4_train_val.csv",
    index_col="id",
    low_memory=False,
)
flights_test = pd.read_csv(
    "final4_test.csv",
    index_col="id",
    low_memory=False,
)

In [None]:
# Features are every column except the target; the target is ARRIVAL_DELAY.
X_train_val = flights_train_validation.drop(columns="ARRIVAL_DELAY")
y_train_val = flights_train_validation["ARRIVAL_DELAY"]
# The test frame is used as the feature matrix unchanged.
X_test = flights_test

In [None]:
# Preview the first rows of the test features.
X_test.head()

In [None]:
# CatBoost with an RMSE loss, fitted on all labelled rows.
cbr = cb.CatBoostRegressor(loss_function="RMSE")
cbr.fit(X_train_val, y_train_val)
# Predict once; the earlier extra predict() call discarded its result.
eval_pred = cbr.predict(X_test)
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

In [None]:
# Serialize the current `cbr` object to catboost.pkl.
joblib.dump(cbr, "catboost.pkl")

In [None]:
import matplotlib.pyplot as plt

# Pair each importance score with its feature name.
ft_importances = pd.Series(
    data=cbr.feature_importances_,
    index=X_train_val.columns,
)

# Horizontal bar chart of the ten highest-scoring features.
ft_importances.nlargest(n=10).plot.barh()
plt.show()

In [None]:
# List every feature's importance, highest first.
ft_importances.sort_values(ascending=False)

In [None]:
# Hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Hyperopt search space: sampled distributions for the tuned parameters plus
# two fixed entries (number of trees and seed).
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

def objective(space):
    """Hyperopt objective: fit a CatBoost regressor for one sampled point and score it.

    Parameters
    ----------
    space : dict
        One sample from the search space. Uses ``n_estimators``, ``max_depth``,
        ``reg_lambda``, ``colsample_bytree`` and ``seed``. The ``gamma``,
        ``reg_alpha`` and ``min_child_weight`` entries are XGBoost parameter
        names and are not forwarded to CatBoost.

    Returns
    -------
    dict
        ``{'loss': <validation MSE>, 'status': STATUS_OK}`` for ``fmin`` to minimise.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook's
    global namespace.
    """
    # NOTE(review): CatBoost documents a maximum tree depth of 16, while the
    # search space samples up to 18 -- clamp here; confirm the limit.
    depth = min(int(space['max_depth']), 16)

    # Build a new estimator from the class. The previous code called the
    # already-fitted `cbr` instance, which is not callable, and passed
    # XGBoost-only keyword names.
    reg = cb.CatBoostRegressor(iterations=space['n_estimators'],
                               depth=depth,
                               l2_leaf_reg=space['reg_lambda'],
                               # Column-sampling fraction; kept as a float (int()
                               # would truncate every sample in [0.5, 1) to 0).
                               rsm=float(space['colsample_bytree']),
                               random_seed=space['seed'],
                               # The metric is a constructor option, not a fit() one.
                               eval_metric='RMSE')

    # Early stopping monitors the validation split only.
    reg.fit(X_train, y_train, eval_set=(X_val, y_val),
            early_stopping_rounds=10, verbose=False)
    val_pred = reg.predict(X_val)
    mse = mean_squared_error(y_val, val_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Run 100 TPE-guided evaluations of `objective`, recording each one in `trials`.
trials = Trials()
best_hyperparams = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

# Print the result once, under its heading (it used to be printed twice).
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Build the tuned regressor from the class. The previous code called the
# `cb` module itself and rebound the name `cb`, and then scored `xg_reg`
# instead of the CatBoost model.
# The search-space keys use XGBoost names, so map the ones CatBoost has an
# equivalent for. NOTE(review): depth is clamped to 16, CatBoost's documented
# maximum -- confirm.
cbr_tuned = cb.CatBoostRegressor(
    depth=min(int(best_hyperparams['max_depth']), 16),
    l2_leaf_reg=best_hyperparams['reg_lambda'],
    rsm=float(best_hyperparams['colsample_bytree']),
)
cbr_tuned.fit(X_train,y_train)
val_pred = cbr_tuned.predict(X_val)
mse = mean_squared_error(y_val, val_pred)
print(mse)

## Stacking CV Regressor

In [None]:
# !pip install mlxtend  

In [None]:
# Stacking CV Regressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from mlxtend.regressor import StackingCVRegressor

In [None]:
# Re-read the "final4" files and rebuild the feature/target pair for stacking.
flights_train_validation = pd.read_csv(
    "final4_train_val.csv",
    index_col="id",
    low_memory=False,
)
flights_test = pd.read_csv(
    "final4_test.csv",
    index_col="id",
    low_memory=False,
)
X_train_val = flights_train_validation.drop(columns="ARRIVAL_DELAY")
y_train_val = flights_train_validation["ARRIVAL_DELAY"]
X_test = flights_test

In [None]:
# Preview the first rows of the test features.
X_test.head()

In [None]:
# AdaBoost with library defaults, fitted on all labelled rows and serialized.
ada = AdaBoostRegressor().fit(X_train_val, y_train_val)
joblib.dump(ada, "adaboost.pkl")

In [None]:
eval_pred = ada.predict(X_test)
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

In [None]:
# Default XGBoost on the "final4" features, serialized for reuse in the stack.
xgboost = xgb.XGBRegressor()
xgboost.fit(X_train_val, y_train_val)
joblib.dump(xgboost, "xgboost.pkl")
eval_pred = xgboost.predict(X_test)
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

In [None]:
# Restore the two serialized base models from disk.
catboost = joblib.load(filename="catboost.pkl")
xgboost = joblib.load(filename="xgboost.pkl")

In [None]:
# `ridge` and `lasso` were referenced without ever being created, so the cell
# raised NameError; instantiate the imported linear models with their defaults.
ridge = Ridge()
lasso = Lasso()

# Stack the four base regressors; the CatBoost model also serves as the
# meta-regressor and sees the original features alongside base predictions.
stack = StackingCVRegressor(regressors=(xgboost, catboost, ridge, lasso), meta_regressor=catboost, use_features_in_secondary=True)
stack.fit(X_train_val, y_train_val)

In [None]:
# Serialize the current `stack` object to stackingcv.pkl.
joblib.dump(stack, "stackingcv.pkl")

In [None]:
eval_pred = stack.predict(X_test)
# Key the predictions by the test set's own ids instead of a fresh 0..n-1 range.
pd.DataFrame(eval_pred, columns=['ARRIVAL_DELAY'], index=X_test.index).to_csv("flight_result.csv", index_label='id')

## Calculating CV errors

In [None]:
# Restore the serialized CatBoost model from catboost.pkl.
catboost = joblib.load("catboost.pkl")

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Five shuffled folds with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Negated mean absolute error per fold, computed on all available cores.
scores = cross_val_score(
    estimator=catboost,
    X=X_train_val,
    y=y_train_val,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)
print(scores)