In [54]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    FunctionTransformer,
    PowerTransformer,
    StandardScaler,
    OrdinalEncoder
)
import warnings
from category_encoders import CountEncoder
from sklearn.compose import ColumnTransformer
from feature_engine.encoding import RareLabelEncoder,MeanEncoder,CountFrequencyEncoder
from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import optuna


## 2.Display Settings


In [2]:
pd.set_option('display.max_columns',None)
sklearn.set_config(transform_output="pandas")
warnings.filterwarnings('ignore')

## 3.Read Data

In [3]:
train_path=r"C:\Users\arpit\Desktop\Flight SageMaker\data\train.csv"
test_path=r"C:\Users\arpit\Desktop\Flight SageMaker\data\test.csv"
val_path=r"C:\Users\arpit\Desktop\Flight SageMaker\data\val.csv"

train=pd.read_csv(train_path)
test=pd.read_csv(test_path)
val=pd.read_csv(val_path)

In [4]:
print(train.shape,val.shape,test.shape)

(8369, 10) (2093, 10) (2093, 10)


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8369 entries, 0 to 8368
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          8369 non-null   object 
 1   date_of_journey  8369 non-null   object 
 2   source           8369 non-null   object 
 3   destination      8369 non-null   object 
 4   dep_time         8369 non-null   object 
 5   arrival_time     8369 non-null   object 
 6   duration         8369 non-null   int64  
 7   total_stops      8368 non-null   float64
 8   additional_info  8369 non-null   object 
 9   price            8369 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 654.0+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093 entries, 0 to 2092
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          2093 non-null   object 
 1   date_of_journey  2093 non-null   object 
 2   source           2093 non-null   object 
 3   destination      2093 non-null   object 
 4   dep_time         2093 non-null   object 
 5   arrival_time     2093 non-null   object 
 6   duration         2093 non-null   int64  
 7   total_stops      2093 non-null   float64
 8   additional_info  2093 non-null   object 
 9   price            2093 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 163.6+ KB


In [7]:
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093 entries, 0 to 2092
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          2093 non-null   object 
 1   date_of_journey  2093 non-null   object 
 2   source           2093 non-null   object 
 3   destination      2093 non-null   object 
 4   dep_time         2093 non-null   object 
 5   arrival_time     2093 non-null   object 
 6   duration         2093 non-null   int64  
 7   total_stops      2093 non-null   float64
 8   additional_info  2093 non-null   object 
 9   price            2093 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 163.6+ KB


In [8]:
x_train=train.drop(columns='price')
y_train=train['price']
x_train.shape,y_train.shape

((8369, 9), (8369,))

In [9]:
x_test=test.drop(columns='price')
y_test=test['price']
x_test.shape,y_test.shape

((2093, 9), (2093,))

In [10]:
x_val=val.drop(columns='price')
y_val=val['price']
x_val.shape,y_val.shape

((2093, 9), (2093,))

## 4.Transformation operations

In [11]:
## airline
air_transformer=Pipeline(
    steps=[
    ('SimpleImputer',SimpleImputer(strategy='most_frequent')),
    ('RareLabelEncoder',RareLabelEncoder(tol=0.1,replace_with='Others',n_categories=2)),
    ('OneHotEncoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])

## date of journey
features_to_extract=['month','week','day_of_week','day_of_year']
doj_transformer=Pipeline(steps=[
    ('doj_transformer',DatetimeFeatures(features_to_extract=features_to_extract,yearfirst=True,format='mixed')),
    ('scaler',MinMaxScaler())
]
)

#source/destination
loc_pipe1=Pipeline([
    ('grouper',RareLabelEncoder(tol=0.1,replace_with='others',n_categories=2)),
    ('mean_encoder',MeanEncoder()),
    ('powertransfomer',PowerTransformer())
    
])

def is_north(df):
    columns=df.columns.tolist()
    north_cities=['Delhi','Kolkata','Mumbai','New Delhi']
    return (
        df.assign(**{
            f"{col}_is_north":df.loc[:,col].isin(north_cities).astype('int')
            for col in columns
        }).drop(columns=columns)
    )

location_transformer=FeatureUnion(transformer_list=[
    ('loc_pipe1',loc_pipe1),
    ('loc_pipe2',FunctionTransformer(func=is_north))
])

#dep time/arrival time
time_pipe1=Pipeline(steps=[
    ('dt',DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaler',MinMaxScaler())
])

def part_of_day(df,morning=4,afternoon=12,evening=16,night=20):
    columns=df.columns.tolist()
    X_temp=df.assign(**{
        col:pd.to_datetime(df.loc[:,col]).dt.hour
        for col in columns
    })
    
    return X_temp.assign(**{
        f"{col}_part_of_day":np.select([
            X_temp.loc[:,col].between(morning,afternoon,inclusive='left'),
            X_temp.loc[:,col].between(afternoon,evening,inclusive='left'),
            X_temp.loc[:,col].between(evening,night,inclusive='left')
        ],['morning','afternoon','evening'],default='night')
        for col in columns
    }).drop(columns=columns)

time_pipe2=Pipeline(steps=[
    ('part_of_day',FunctionTransformer(func=part_of_day)),
    ('count_encoder',CountFrequencyEncoder()),
    ('scaler',MinMaxScaler())
    
])

time_transformer=FeatureUnion(
    transformer_list=[
        ('time_pipe1',time_pipe1),
        ('time_pipe2',time_pipe2)
    ]
)

#duration
class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None,percentiles=[0.25,0.5,0.75],gamma=0.1):
        self.variables=variables
        self.percentiles=percentiles
        self.gamma=gamma
        
        
    def fit(self,X,y=None):
        if not self.variables:
            self.variables=X.select_dtypes(include='number').columns.tolist()
            
        self.reference_values_={
            col:(
                X.loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
               )
                for col in self.variables
            
        }
        
        return self
    
    def transform(self,X):
        objects=[]
        for col in self.variables:
            columns=[f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj=pd.DataFrame(
                data=rbf_kernel(X.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns=columns
            )
            
        objects.append(obj)
        
        return pd.concat(objects,axis=1)
    
def duration_category(df,short=180,medium=400):
    return (
       df.assign(
           duration_cat=np.select([df.duration.lt(short),df.duration.between(short,medium,inclusive='left')],
                                  ['short','medium'],
                                  default="long")).drop(columns='duration')
    )

def is_over(df, value=1000):
    return (
        df
        .assign(**{
         f"duration_over_{value}": df.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
       )
duration_pipe1 = Pipeline(steps=[
   ("rbf", RBFPercentileSimilarity()),
   ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
   ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
   ("imputer", SimpleImputer(strategy="median")),
   ("union", duration_union)
])


#total stops
def is_direct(X):
     return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))
    

#is_direct(x_train)

total_stops_transformer = Pipeline(steps=[
   ("imputer", SimpleImputer(strategy="most_frequent")),
   ("", FunctionTransformer(func=is_direct))
])
#additional info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union)
])


#column transformer
column_transformer=ColumnTransformer([
    ('air_transfomer',air_transformer,['airline']),
    ('doj_transformer',doj_transformer,['date_of_journey']),
    ('location_transformer',location_transformer,['source','destination']),
    ('time_transformer',time_transformer,['dep_time','arrival_time']),
    ("duration_transformer", duration_transformer, ["duration"]),
    ("total_stops_transformer", total_stops_transformer, ["total_stops"]),
    ("info_transformer", info_transformer, ["additional_info"])
],remainder='passthrough',verbose_feature_names_out=False)

#column_transformer.fit_transform(x_train,y_train)

## 3.Selector

In [12]:
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",threshold=0.1
) 

## 4.Preprocessing


In [13]:
preprocessor = Pipeline(steps=[
    ("ct", column_transformer),
    ("selector", selector)
])

x_train_trf=preprocessor.fit_transform(x_train, y_train)
x_test_trf=preprocessor.transform(x_test)
x_val_trf=preprocessor.transform(x_val)

In [14]:
x_train_trf.shape,x_test_trf.shape,x_val_trf.shape

((8369, 13), (2093, 13), (2093, 13))

## 5.Model Training

In [63]:
def objective(trial):
    dtrain = xgb.DMatrix(x_train_trf, label=y_train)
    dvalid = xgb.DMatrix(x_val_trf, label=y_val)

    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    rmse = RMSE(y_val, pred_labels)
    return rmse


if __name__ == "__main__":
    
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-05-19 14:13:27,896] A new study created in memory with name: no-name-3704be26-3b2f-4749-9092-4c3c1df66248
[I 2024-05-19 14:13:27,999] Trial 0 finished with value: 4496.320084198855 and parameters: {'booster': 'gbtree', 'lambda': 0.07655179611101101, 'alpha': 1.9327069548313567e-06, 'subsample': 0.5037517370616182, 'colsample_bytree': 0.47809800045088224, 'max_depth': 5, 'min_child_weight': 9, 'eta': 4.820111415818835e-07, 'gamma': 1.160340280202669e-06, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 4496.320084198855.
[I 2024-05-19 14:13:28,087] Trial 1 finished with value: 2765.3593993801396 and parameters: {'booster': 'dart', 'lambda': 7.389772942730484e-07, 'alpha': 2.0117105191942696e-08, 'subsample': 0.7585293033714327, 'colsample_bytree': 0.3368154055063155, 'max_depth': 3, 'min_child_weight': 4, 'eta': 0.9234694420802826, 'gamma': 0.41933503250973675, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.6330231196152

[I 2024-05-19 14:13:31,047] Trial 17 finished with value: 4495.570861871066 and parameters: {'booster': 'dart', 'lambda': 0.0006093685858438665, 'alpha': 3.697940665524794e-05, 'subsample': 0.7668378882253508, 'colsample_bytree': 0.6054408831415405, 'max_depth': 5, 'min_child_weight': 8, 'eta': 2.5224765452040234e-05, 'gamma': 1.159221977645605e-08, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 1.3826611391150732e-05, 'skip_drop': 0.07535200756845625}. Best is trial 10 with value: 2187.934370551898.
[I 2024-05-19 14:13:31,312] Trial 18 finished with value: 2618.748603704214 and parameters: {'booster': 'gbtree', 'lambda': 1.6176070451606887e-06, 'alpha': 3.3296186229344256e-07, 'subsample': 0.9360363651414383, 'colsample_bytree': 0.8667780123836493, 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.10162016651607808, 'gamma': 2.110266437637212e-06, 'grow_policy': 'lossguide'}. Best is trial 10 with value: 2187.934370551898.
[I 2024-05-19 14:

[I 2024-05-19 14:13:35,821] Trial 34 finished with value: 4390.406845660801 and parameters: {'booster': 'dart', 'lambda': 0.00022590053059298422, 'alpha': 3.4095357612863575e-08, 'subsample': 0.53248538891505, 'colsample_bytree': 0.5760045424862456, 'max_depth': 9, 'min_child_weight': 5, 'eta': 0.0035448205995106106, 'gamma': 5.942415981346811e-05, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 7.980560374835705e-05, 'skip_drop': 0.18626963484609815}. Best is trial 10 with value: 2187.934370551898.
[I 2024-05-19 14:13:36,081] Trial 35 finished with value: 2282.520631004317 and parameters: {'booster': 'dart', 'lambda': 0.0015357172377505805, 'alpha': 1.011421872387403e-08, 'subsample': 0.8363268508875668, 'colsample_bytree': 0.812746421629884, 'max_depth': 5, 'min_child_weight': 8, 'eta': 0.3016566771608511, 'gamma': 9.696749311770864e-07, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 0.0015

Number of finished trials:  50
Best trial:
  Value: 2144.819020494194
  Params: 
    booster: dart
    lambda: 0.03055725813344619
    alpha: 0.0001915783377500749
    subsample: 0.7816118720235875
    colsample_bytree: 0.8987040027983191
    max_depth: 9
    min_child_weight: 9
    eta: 0.342002066254546
    gamma: 5.100733979095529e-07
    grow_policy: depthwise
    sample_type: uniform
    normalize_type: forest
    rate_drop: 0.00010345308237607177
    skip_drop: 0.7467517850031523


In [51]:
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(x_train_trf, y_train)

In [62]:
preds = best_model.predict(x_val_trf)
rmse = (mean_squared_error(y_val, preds, squared=False))
print(f"RMSE of the best model for validation: {rmse}")

RMSE of the best model for validation: 2309.158669924696


In [61]:
preds = best_model.predict(x_test_trf)
rmse = (mean_squared_error(y_test, preds, squared=False))
print(f"RMSE of the best model for test: {rmse}")

RMSE of the best model for test: 2309.158669924696
