# 1. Import Libraries

In [80]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [3]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.8.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.1/357.1 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.8.0


In [83]:
import os 

import boto3

import pickle 

import numpy as np, pandas as pd

import xgboost as xgb

import sklearn

from sklearn.impute import SimpleImputer

from sklearn.base import (
    BaseEstimator,
    TransformerMixin
)

from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import rbf_kernel

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import (
    Pipeline, 
    FeatureUnion
)
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder, 
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance

from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    StandardScaler,
    PowerTransformer,
    FunctionTransformer
)

from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)


# 2. Display Settings

In [5]:
pd.set_option("display.max_columns", None)

sklearn.set_config(transform_output = "pandas") # by default sklearn gives numpy array, changing it give pandas dataframe

In [20]:


def apply_custom_css():
    from IPython.core.display import HTML
    custom_css = """
    <style>
    /* Reduce padding around code cells */
    div.input_area {
        padding: 0.1em;
        border: 1px solid #cccccc;
        border-radius: 5px;
    }

    /* Reduce padding around markdown cells */
    div.text_cell_render {
        padding: 0.1em;
        border: 1px solid #cccccc;
        border-radius: 5px;
    }

    /* Reduce padding around output cells */
    div.output_area {
        padding: 0.1em;
        border: 1px solid #cccccc;
        border-radius: 5px;
    }

    /* Adjust margins and padding for overall notebook layout */
    div#notebook-container {
        width: 99%;
        margin-left: auto;
        margin-right: auto;
        padding: 0;
    }

    /* Remove unnecessary padding around header */
    div#header-container {
        padding: 0.1em;
        border-bottom: 1px solid #cccccc;
    }

    /* Remove extra padding and margin around cells */
    div.cell {
        margin: 0.2em 0;
        padding: 0.2em;
    }

    /* Remove extra padding around the main container */
    div#site {
        padding: 0;
    }

    /* Reduce size of input and output prompt areas */
    div.prompt {
        min-width: 2em;
        padding: 0.1em;
        font-size: 0.75em;
    }
    </style>
    """
    display(HTML(custom_css))

# Apply the custom CSS
apply_custom_css()

# 3. Reading the data

In [6]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-09-03,Banglore,New Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-12-06,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-12-03,Banglore,New Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6691,Air India,2019-01-05,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6692,Jet Airways,2019-01-06,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6693,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [8]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Delhi,Cochin,20:25:00,01:30:00,305,1.0,No Info,5054
1,Multiple Carriers,2019-12-06,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,9646
2,Jet Airways,2019-12-03,Banglore,New Delhi,22:55:00,15:15:00,980,1.0,In-flight meal not included,11087
3,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587
4,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,04:25:00,1760,2.0,No Info,16704
...,...,...,...,...,...,...,...,...,...,...
1669,Spicejet,2019-01-05,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
1670,Indigo,2019-01-05,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5069
1671,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35:00,425,2.0,In-flight meal not included,15544
1672,Jet Airways,2019-12-06,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,3210


In [9]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-03,Banglore,New Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Indigo,2019-06-27,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
4,Indigo,2019-06-05,Kolkata,Banglore,15:15:00,17:45:00,150,0.0,No Info,4804
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Multiple Carriers,2019-06-27,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7155
2090,Jet Airways,2019-03-06,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


# 3. Preprocessing Operations

In [14]:
# airline

air_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("grouper", RareLabelEncoder(tol= 0.1, replace_with= "Other", n_categories = 2)),
    ("encoder", OneHotEncoder(sparse_output= False, handle_unknown= "ignore"))
])

# air_transformer.fit_transform(X_train.loc[:, ["airline"]])#.airline.value_counts()

# date_of_journey

feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_tranformer = Pipeline(steps= [
    ("dt", DatetimeFeatures(features_to_extract = feature_to_extract, yearfirst= True, format= "mixed")),
    ("scaler", MinMaxScaler())


])

# doj_tranformer.fit_transform(X_train.loc[:, ["date_of_journey"]])

# source & destination

# location_subset = X_train.loc[:, ['source', 'destination']]

location_pipe1 = Pipeline(steps = [
    ("grouper", RareLabelEncoder (tol= 0.1, replace_with= "Other", n_categories = 2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
    

])

# location_pipe1.fit_transform(location_subset, y_train)

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:,col].isin(north_cities).astype(int)
            for col in columns
            
        })
        .drop(columns = columns)

    )

# FunctionTransformer(func= is_north).fit_transform(location_subset)

location_transformer = FeatureUnion(transformer_list = [
    ("part1", location_pipe1), 
    ("part2", FunctionTransformer(func= is_north))

])

# location_transformer.fit_transform(location_subset, y_train)

# dep_time and arrival_time

# time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]

time_pipe1 = Pipeline(steps = [
    ("dt", DatetimeFeatures(features_to_extract = ["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

# time_pipe1.fit_transform(time_subset)

def part_of_day(X, morning = 4, noon = 12, eve = 16, night = 20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_data": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive = "left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive = "left"),
                 X_temp.loc[:, col].between(eve, night, inclusive = "left")
                ],
                ["morning", "afternoon", "evening"],
                default = 'night'
            )
            for col in columns
        })
        .drop(columns = columns)
    )

# FunctionTransformer(func = part_of_day).fit_transform(time_subset)

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func= part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
    
])

time_transformer = FeatureUnion(transformer_list = [
    ("part1", time_pipe1),
    ("part2", time_pipe2),
    
])

# time_transformer.fit_transform(time_subset)

# duration

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    
    def __init__(self, variables = None, percentiles = [0.25, 0.5, 0.75], gamma = 0.1):
        self.variables = variables
        self .percentiles = percentiles
        self.gamma = gamma
        
        
    def fit(self, X, y = None):
        if not self.variables:
            self.variables = X.select_dtypes(include = "number").columns.to_list()
            
        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
            
        }
        
        return self
    
    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data = rbf_kernel(X.loc[:, [col]], Y = self.reference_values_[col], gamma = self.gamma),
                columns = columns
            )
            objects.append(obj)
            
        return pd.concat(objects, axis = 1)
    
    

# RBFPercentileSimilarity().fit_transform(X_train.loc[:, ["duration"]])

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                        X.duration.between(short, med, inclusive="left")],
                                       ["short", "medium"],
                                       default="long"))
        .drop(columns="duration")
    )



def is_over(X, value=1000):
    return (
        X
        .assign(**{
        f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps = [
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps = [
    ("cat", FunctionTransformer(func = duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])
duration_union = FeatureUnion(transformer_list= [
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
     ("part3", FunctionTransformer(func = is_over)),
     ("part4", StandardScaler())
    
])

duration_transformer = Pipeline(steps = [
    ("outlier", Winsorizer(capping_method="iqr", fold= 1.5)),
    ("imputer", SimpleImputer(strategy= "median")),
    ("union", duration_union)
    
])

# duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

# duration_union.fit_transform(X_train.loc[:, ["duration"]])

# total_stops

def is_direct(X):
    return (
        X
        .assign(
            is_direct_flight = (
                X
                .total_stops
                .eq(0)
                .astype(int)
            )
        )
    )

total_stops_transformer = Pipeline(steps= [
    ("imputer", SimpleImputer(strategy= "most_frequent")),
    ("", FunctionTransformer(func= is_direct))
])

# total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

# additional_info

info_pipe1 = Pipeline(steps= [
    ("group", RareLabelEncoder(tol= 0.1, n_categories = 2, replace_with= "Other")),
    ("encoder", OneHotEncoder(handle_unknown= "ignore", sparse_output= False))
])
# info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

def have_info(X):
    return (
        X
        .assign(
            additional_info = (
                X
                .additional_info
                .ne("No Info")
                .astype(int)
            )
        )
    )

info_union = FeatureUnion(transformer_list= [
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func= have_info))
])

info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy= "constant", fill_value= "unknown")),
    ("union", info_union)
])

# info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

# Column Transformer

column_tranformer = ColumnTransformer(transformers=[
    ("air", air_transformer, ["airline"]),
    ("doj", doj_tranformer, ["date_of_journey"]),
    ("location", location_transformer, ["source", "destination"]),
    ("time", time_transformer, ["dep_time", "arrival_time"]),
    ("dur", duration_transformer, ["duration"]),
    ("stops", total_stops_transformer, ["total_stops"]),
    ("info", info_transformer, ["additional_info"])
], remainder= "passthrough")

# column_tranformer.fit_transform(X_train, y_train)

# feature selection

estimator = RandomForestRegressor(n_estimators= 10, max_depth= 3, random_state= 42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring= "r2",
    threshold= 0.1

)

# preprocessor 

preprocessor = Pipeline(steps= [
    ("ct", column_tranformer),
    ("selector", selector)
])

# 

In [17]:
preprocessor.fit(
    train.drop(columns = "price"),
    train.price.copy()

)

In [18]:
preprocessor.transform(train.drop(columns = "price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.228487,-0.857930,-0.736484,-0.364262,2.0,0,-0.033916,1.0,0
1,0.0,1.0,0.0,0.246291,1.065418,1.061694,-0.364262,2.0,0,0.046422,1.0,0
2,0.0,0.0,1.0,0.721068,-0.857930,-0.736484,2.373008,0.0,0,-0.917631,0.0,1
3,0.0,0.0,0.0,1.000000,-0.203928,-0.224351,-0.364262,2.0,0,-0.174507,1.0,0
4,0.0,1.0,0.0,0.991098,-0.857930,-0.736484,-0.364262,2.0,0,-0.214676,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.228487,1.065418,1.061694,-0.364262,2.0,1,2.597145,2.0,0
6691,0.0,0.0,0.0,0.005935,-0.203928,-0.224351,-0.364262,2.0,0,-0.174507,1.0,0
6692,0.0,1.0,0.0,0.008902,1.065418,1.061694,-0.364262,1.0,0,-0.666576,1.0,0
6693,0.0,0.0,1.0,0.510386,1.065418,1.061694,-0.364262,1.0,0,-0.606322,1.0,0


# 4. Preprocess Data and Upload to Bucket

In [19]:
BUCKET_NAME = "sagemaker-flights-buckets-abhishek"

DATA_PREFIX = "data"



In [21]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [29]:
def export_data(data, name, pre):
    # split data into X and y subset
    X = data.drop(columns = "price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index = False, header = False)
    )
    

In [37]:
def upload_to_bucket(name):
    
    file_name = get_file_name(name)
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [38]:
def export_and_upload_bucket(data, name, pre):
    
    export_data(data, name, pre)
    
    upload_to_bucket(name)

In [39]:
export_and_upload_bucket(train, "train", preprocessor)

In [41]:
export_and_upload_bucket(val, "val", preprocessor)

In [42]:
export_and_upload_bucket(test, "test", preprocessor)

# 5. Model and Hyperparameter Tuning Set-Up

In [45]:
session = sagemaker.Session()

region_name = session.boto_region_name

In [46]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [67]:
# basic model setup 

model = Estimator(
    image_uri = sagemaker.image_uris.retrieve("xgboost", region_name , "1.2-1"),
    role = sagemaker.get_execution_role(),
    instance_count= 1,
    instance_type= "ml.m5.xlarge", # type of machine
    volume_size=5,# free bucket size
    output_path=output_path,
    use_spot_instances= True,
    max_run= 100,
    max_wait=600, # prevents from training indefinitely
    sagemaker_session=session
    
)

In [68]:
model.set_hyperparameters(
    objective = "reg:linear", # mean squared error
    num_round = 10, # number of base estimators 
    eta = 0.1, # learning rate 
    max_depth = 5, # depth of tree
    subsample = 0.8, # for each tree randomly sample 80% of the rows
    colsample_bytree = 0.8, # for each tree randomly sample 80% of the cols
    alpha = 0.1 # l2 regularization
)

In [69]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0,1),
    "max_depth": IntegerParameter(3, 5)
}

In [70]:
tuner = HyperparameterTuner(
    estimator = model, 
    objective_metric_name= "validation:rmse",
    hyperparameter_ranges= hyperparameter_ranges,
    strategy= "Bayesian",
    objective_type= "Minimize"
)

# 6. Data Channels

In [71]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type= "csv")

In [72]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f2ee17f5f30>

In [76]:
val_data_channel = get_data_channel("val")
val_data_channel

<sagemaker.inputs.TrainingInput at 0x7f2ee1891fc0>

In [77]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

# 7. Train and Tune the Model 

In [78]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...........................!


In [79]:
# deployment of model on sagemaker

# tuner.best_estimator().deploy()

# 8. Model Evaluation

In [82]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f2edbfd2c20>

In [84]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [85]:
evaluate_model("train")

0.5803811550140381

In [86]:
evaluate_model("val")

0.5153300762176514

In [87]:
evaluate_model("test")

0.5571074485778809