In [1]:
from pipeline_helpers import *
### Hyperparameter search grids: candidate values tried for each model during tuning.
### Each grid is paired with its estimator in the `regressors` list.

# Random forest grid.
rf_params = dict(
    max_depth=[3, 5, 8, None],
    max_features=[3, 5, 7],
    n_estimators=[100, 200, 500, 1000],
    min_samples_split=[2, 5, 10],
)

# Gradient boosting grid.
gbm_params = dict(
    learning_rate=[0.001, 0.01, 0.1, 0.05],
    n_estimators=[100, 200, 500, 1000],
    max_depth=[3, 5, 8, None],
)

# Extra-trees grid (same candidate values as the random forest grid, kept as its own object).
et_params = dict(
    max_depth=[3, 5, 8, None],
    max_features=[3, 5, 7],
    n_estimators=[100, 200, 500, 1000],
    min_samples_split=[2, 5, 10],
)

### Ensemble candidates: one (label, unfitted estimator, search grid) tuple per model.

regressors = [
    (label, estimator_cls(), grid)
    for label, estimator_cls, grid in (
        ("RF", RandomForestRegressor, rf_params),
        ("GBM", GradientBoostingRegressor, gbm_params),
        ("ET", ExtraTreesRegressor, et_params),
    )
]


In [2]:
def main(data_path="Data_Train.xlsx", model_path="voting_reg.pkl"):
    """Run the training pipeline end to end and persist its result.

    Steps, in order: read the Excel workbook at ``data_path``, turn it into
    ``X``/``y`` with ``flight_data_prep``, run ``base_models`` on them, run
    ``hyperparameter_optimization``, hand its result to ``voting_regressor``,
    and dump whatever that returns to ``model_path`` with ``joblib``.

    Args:
        data_path: Excel file to read. Defaults to "Data_Train.xlsx", the
            path that used to be hard-coded here.
        model_path: Destination of the joblib dump. Defaults to
            "voting_reg.pkl", the path that used to be hard-coded here.

    Returns:
        The object returned by ``voting_regressor`` (the same object that
        was written to ``model_path``).
    """
    df = pd.read_excel(data_path)
    X, y = flight_data_prep(df)

    # Called for its side effects only; its return value is not used.
    base_models(X, y)

    best_models = hyperparameter_optimization(X, y)
    voting_reg = voting_regressor(X=X, y=y, best_models=best_models)

    # Persist the result so later cells can reload it without re-running the pipeline.
    joblib.dump(voting_reg, model_path)
    return voting_reg

In [3]:
# Run the whole pipeline when this code executes as the top-level program
# (which includes running it as a notebook cell), but not when it is imported.
# The value returned by main() is discarded here; a later cell reads the
# dumped object back from "voting_reg.pkl".
if __name__ == "__main__":
    print("Process Started")
    main()

Process Started
Observations: 10683
Variables: 11
cat_cols: 5
num_cols: 1
cat_but_car: 5
num_but_cat: 0
Base Models:
r2: 0.5814 (LR
r2: 0.6086 (KNN
r2: 0.1898 (SVR
r2: 0.6985 (CART
r2: 0.8055 (RF
r2: 0.6423 (AdaBoost
r2: 0.7748 (ET
r2: 0.6275 (AB
r2: 0.7693 (GBM
Hyperparameter Optimization
############# RF#############
r2 (Before): 0.8051
r2 (After): 0.8378
RF best params: {'max_depth': None, 'max_features': 7, 'min_samples_split': 10, 'n_estimators': 1000} 


############# GBM#############
r2 (Before): 0.7693
r2 (After): 0.8371
GBM best params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 1000} 


############# ET#############
r2 (Before): 0.7745
r2 (After): 0.8242
ET best params: {'max_depth': None, 'max_features': 7, 'min_samples_split': 10, 'n_estimators': 1000} 


Voting Regressor
Voting Regressor Scores 
 2922748.364242098 0.841092415041534 1144.417883184342


In [4]:
# Reload the object that main() dumped to "voting_reg.pkl".
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
model = joblib.load("voting_reg.pkl")

In [5]:
# Rebuild X/y from the training workbook and hold out 20% of the rows for the
# metric cells that follow.
# random_state pins the shuffle so the hold-out rows -- and every metric
# computed on them -- are identical on each Restart & Run All.
# NOTE(review): this is the same file main() used to produce voting_reg.pkl.
# If voting_regressor fits on all rows, these "test" rows were seen during
# training and the scores below are optimistic -- confirm.
df = pd.read_excel("Data_Train.xlsx")
X, y = flight_data_prep(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Observations: 10683
Variables: 11
cat_cols: 5
num_cols: 1
cat_but_car: 5
num_but_cat: 0


In [6]:
# Predict on the held-out rows with the reloaded model.
y_pred = model.predict(X_test)

In [7]:
# Rebind y_pred to a NumPy array built from the predictions.
y_pred = np.array(y_pred)

In [8]:
# Rebind y_test to a NumPy array as well; the object returned by
# train_test_split is no longer reachable under this name afterwards.
y_test = np.array(y_test)

In [10]:
# Mean absolute error of the predictions on the held-out rows
# (presumably sklearn.metrics.mean_absolute_error via the star import -- confirm).
mean_absolute_error(y_test, y_pred)

864.9101734423405

In [12]:
# Mean absolute percentage error on the held-out rows
# (presumably sklearn.metrics.mean_absolute_percentage_error, which reports a
# fraction rather than a percentage -- confirm).
mean_absolute_percentage_error(y_test, y_pred)

0.09667413892624654