In [6]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [7]:
df = pd.read_csv("../Data/flight_dep_tree.csv")
df.shape
df.head(5)

Unnamed: 0,MONTH,DAY_OF_MONTH_sin,DAY_OF_MONTH_cos,DAY_OF_WEEK_sin,DAY_OF_WEEK_cos,CRS_ELAPSED_TIME,DISTANCE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyRelativeHumidity,HourlyVisibility,HourlyWindSpeed,CRS_DEP_TIME_sin,CRS_DEP_TIME_cos,OP_UNIQUE_CARRIER,ORIGIN,DEST,DEP_DELAY
0,4,0.201299,0.97953,0.974928,-0.222521,168.0,1020.0,33.0,36.0,89.0,5.0,10.0,0.990389,-0.138309,0,1,62,-5.0
1,4,0.201299,0.97953,0.974928,-0.222521,160.0,1020.0,33.0,45.0,63.0,10.0,10.0,-0.785317,-0.619094,0,1,62,15.0
2,4,0.201299,0.97953,0.974928,-0.222521,299.0,2279.0,37.0,43.0,80.0,10.0,6.0,0.939991,-0.3412,0,4,14,-3.0
3,4,0.201299,0.97953,0.974928,-0.222521,300.0,2279.0,41.0,49.0,74.0,10.0,6.0,-0.460974,-0.887413,0,4,14,-3.0
4,4,0.201299,0.97953,0.974928,-0.222521,300.0,2279.0,41.0,45.0,86.0,10.0,7.0,-0.5,0.866025,0,4,14,-6.0


In [8]:
X = df.drop(columns=["DEP_DELAY"])
y = df["DEP_DELAY"].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (36124, 17), Test set: (9031, 17)


In [10]:
param_grid = {
    'n_estimators': [300, 500],
    'learning_rate': [0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_xgb = GridSearchCV(
    estimator=XGBRegressor(
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ),
    param_grid=param_grid,
    cv=3,                 
    scoring='r2',          
    n_jobs=-1,             
    verbose=2
)

In [11]:
print("\n Searching for the best hyperparameters for XGBoost using cross-validation (cv=3)")
grid_xgb.fit(X_train, y_train)

print("Best Parameters:", grid_xgb.best_params_)
print(f"Best CV R² Score: {grid_xgb.best_score_:.4f}")


 Searching for the best hyperparameters for XGBoost using cross-validation (cv=3)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.8}
Best CV R² Score: 0.1777


In [14]:
best_xgb = grid_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\n XGBoost Performance:")
print(f"MAE   = {mae:.3f}")
print(f"RMSE  = {rmse:.3f}")

print("Predictions:", y_pred[:11])
print("Actuals:    ", y_test[:11])


 XGBoost Performance:
MAE   = 6.975
RMSE  = 9.479
Predictions: [-4.1887813 -2.5609708 -7.267175   6.8950973  4.7411766  3.5760415
 -2.2124557 -1.5085824  1.4044187  0.3992294  1.1024661]
Actuals:     [  3.  -8.  -9.   5.   7.  -4. -10.  -6.   0.  -2.  29.]


In [13]:
print("\n Optimal model parameters:")
print(best_xgb.get_params())


 Optimal model parameters:
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'feature_weights': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.05, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 500, 'n_jobs': -1, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0}
