## 1.Import Libraries

In [4]:
!pip install xgboost==1.7.6

Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m31.3 MB/s[0m  [33m0:00:06[0mm0:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-1.7.6


In [2]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.9.3-py3-none-any.whl.metadata (10 kB)
Downloading feature_engine-1.9.3-py3-none-any.whl (229 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.9.3


In [1]:
import os
import boto3
import pickle
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    StandardScaler,
    OrdinalEncoder
)
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
import warnings
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)
import xgboost as xgb
print(xgb.__version__)  # Should show ~1.7.x


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
1.7.6


## 2.Display Settings

In [2]:
pd.set_option("display.max_columns",None)

In [3]:
sklearn.set_config(transform_output="pandas")

In [4]:
warnings.filterwarnings("ignore")

## 3.Read Datasets

In [5]:
train=pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-09,Delhi,Cochin,19:45:00,18:50:00,1385,1.0,No Info,16289
1,Air India,2019-03-06,Mumbai,Hyderabad,17:20:00,14:50:00,150,2.0,No Info,11972
2,Indigo,2019-06-09,Delhi,Cochin,11:25:00,21:00:00,575,1.0,No Info,6416
3,Indigo,2019-05-01,Banglore,Delhi,23:30:00,02:20:00,170,0.0,No Info,4591
4,Jet Airways,2019-06-03,Delhi,Cochin,17:30:00,04:25:00,655,1.0,No Info,14714
...,...,...,...,...,...,...,...,...,...,...
635,Indigo,2019-06-01,Delhi,Cochin,14:25:00,17:40:00,195,0.0,No Info,6015
636,Vistara,2019-05-01,Banglore,Delhi,11:30:00,14:20:00,170,0.0,No Info,4668
637,Spicejet,2019-05-18,Chennai,Kolkata,08:20:00,10:35:00,135,0.0,No check-in baggage included,3543
638,Indigo,2019-06-15,Banglore,Delhi,13:00:00,15:50:00,170,0.0,No Info,3943


In [6]:
val=pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Banglore,Delhi,15:15:00,18:10:00,175,0.0,No Info,7229
1,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,01:30:00,750,1.0,No Info,7198
2,Air India,2019-06-27,Delhi,Cochin,07:00:00,19:15:00,735,1.0,No Info,8669
3,Multiple Carriers,2019-06-12,Delhi,Cochin,10:00:00,01:30:00,930,1.0,No Info,7198
4,Vistara,2019-05-15,Kolkata,Banglore,07:10:00,22:40:00,930,1.0,No Info,8452
...,...,...,...,...,...,...,...,...,...,...
155,Indigo,2019-05-09,Delhi,Cochin,21:50:00,03:35:00,345,1.0,No Info,6058
156,Jet Airways,2019-05-21,Kolkata,Banglore,14:05:00,23:35:00,570,1.0,No Info,14781
157,Multiple Carriers,2019-06-06,Delhi,Cochin,17:00:00,01:30:00,510,1.0,No Info,7198
158,Spicejet,2019-05-18,Delhi,Cochin,17:55:00,22:50:00,295,1.0,No Info,5975


In [7]:
test=pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-06,Delhi,Cochin,19:30:00,04:25:00,535,2.0,No Info,13014
1,Multiple Carriers,2019-06-01,Delhi,Cochin,09:15:00,19:15:00,600,1.0,No Info,9779
2,Multiple Carriers,2019-06-15,Delhi,Cochin,07:10:00,16:10:00,540,1.0,In-flight meal not included,6093
3,Jet Airways,2019-05-18,Delhi,Cochin,07:05:00,12:35:00,330,1.0,In-flight meal not included,12373
4,Spicejet,2019-06-27,Banglore,Delhi,21:10:00,00:05:00,175,0.0,No check-in baggage included,4319
...,...,...,...,...,...,...,...,...,...,...
195,Multiple Carriers,2019-03-06,Delhi,Cochin,07:10:00,16:10:00,540,1.0,In-flight meal not included,7563
196,Jet Airways,2019-03-18,Banglore,New Delhi,05:45:00,13:15:00,450,1.0,No Info,12284
197,Indigo,2019-06-06,Delhi,Cochin,02:00:00,07:45:00,345,1.0,No Info,6386
198,Jet Airways,2019-06-06,Kolkata,Banglore,14:05:00,23:35:00,570,1.0,No Info,14571


## 4.Preprocessing Operations

In [8]:
# airline
air_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("grouper",RareLabelEncoder(tol=0.1,replace_with="Other",n_categories=2)),
    ("encoder",OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])

# doj
feature_to_extract=["month","week","day_of_week","day_of_year"]
doj_transformer=Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=feature_to_extract,yearfirst=True,format="mixed")),
    ("scaler",MinMaxScaler())
])

# source & destination
location_pipe1=Pipeline(steps=[
    ("grouper",RareLabelEncoder(tol=0.1,replace_with="Other",n_categories=2)),
    ("encoder",MeanEncoder()),
    ("scaler",PowerTransformer())
])
def is_north(X):
    columns=X.columns.to_list()
    north_cities=["Delhi","Kolkata","Mumbai","New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north":X.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )
location_transformer=FeatureUnion(transformer_list=[
    ("part1",location_pipe1),
    ("part2",FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1=Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=["hour","minute"])),
    ("scaler",MinMaxScaler())
])
def part_of_day(X,morning=4,noon=12,eve=16,night=20):
    columns=X.columns.to_list()
    X_temp=X.assign(**{
        col:pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
    })
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day":np.select(
                [X_temp.loc[:,col].between(morning,noon,inclusive="left"),
                 X_temp.loc[:,col].between(noon,eve,inclusive="left"),
                 X_temp.loc[:,col].between(eve,night,inclusive="left")
                ],
                ["morning","afternoon","evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )
time_pipe2=Pipeline(steps=[
    ("part",FunctionTransformer(func=part_of_day)),
    ("encoder",CountFrequencyEncoder()),
    ("scaler",MinMaxScaler())
])
time_transformer=FeatureUnion(transformer_list=[
    ("part1",time_pipe1),
    ("part2",time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None,percentiles=[0.25,0.5,0.75],gamma=0.1):
        self.variables=variables
        self.percentiles=percentiles
        self.gamma=gamma
    def fit(self,X,y=None):
        if not self.variables:
            self.variables=X.select_dtypes(include="number").columns.to_list()
        self.reference_values_={
            col:(
                X
                .loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
            )
            for col in self.variables
        }
        return self
    def transform(self,X):
        objects=[]
        for col in self.variables:
            columns=[f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj=pd.DataFrame(
                data=rbf_kernel(X.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)
def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)
def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)
duration_pipe1=Pipeline(steps=[
    ("rbf",RBFPercentileSimilarity()),
    ("scaler",PowerTransformer())
])
duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union=FeatureUnion(transformer_list=[
    ("part1",duration_pipe1),
    ("part2",duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])
duration_transformer=Pipeline(steps=[
    ("outliers",Winsorizer(capping_method="iqr",fold=1.5)),
    ("imputer",SimpleImputer(strategy="median")),
    ("union",duration_union)
])

#total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))
total_stops_transformer=Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("",FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

# column transformer
column_transformer=ColumnTransformer(transformers=[
    ("air",air_transformer,["airline"]),
    ("doj",doj_transformer,["date_of_journey"]),
    ("location",location_transformer,["source","destination"]),
    ("time",time_transformer,["dep_time","arrival_time"]),
    ("dur",duration_transformer,["duration"]),
    ("stops",total_stops_transformer,["total_stops"]),
    ("info",info_transformer,["additional_info"])
],remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])


In [9]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

0,1,2
,steps,"[('ct', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('air', ...), ('doj', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function is_...x7f3010a5b250>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,False
,utc,
,format,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,func,<function par...x7f3010a5a290>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,1.5
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,variables,
,percentiles,"[0.25, 0.5, ...]"
,gamma,0.1

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,func,<function dur...x7f3010a5b2e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function is_...x7f3010a5b6d0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function is_...x7f3010a5b760>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function hav...x7f3010a5b7f0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,estimator,RandomForestR...ndom_state=42)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Jet Airways,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_year,dur__duration_cat,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.000000,0.058824,0.067797,2.0,1.633814,1.0,0
1,0.0,0.000000,0.058824,0.042373,0.0,-0.926116,2.0,0
2,0.0,1.000000,0.823529,0.847458,2.0,-0.045169,1.0,0
3,0.0,0.666667,0.529412,0.516949,0.0,-0.884660,0.0,1
4,1.0,1.000000,0.823529,0.796610,2.0,0.120657,1.0,0
...,...,...,...,...,...,...,...,...
635,0.0,1.000000,0.764706,0.779661,1.0,-0.832840,0.0,1
636,0.0,0.666667,0.529412,0.516949,0.0,-0.884660,0.0,1
637,0.0,0.666667,0.647059,0.661017,0.0,-0.957209,0.0,1
638,0.0,1.000000,0.882353,0.898305,0.0,-0.884660,0.0,1


## 4.Preprocess Data and Upload to Bucket

In [11]:
BUCKET_NAME="sagemaker-flights-data-5400"

DATA_PREFIX="data"

In [13]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [14]:
def export_data(data,name,pre):
    #split data into X and y subsets
    X=data.drop(columns="price")
    y=data.price.copy()

    # transformation
    X_pre=pre.transform(X)

    #exporting
    file_name=get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name,index=False,header=False)
    )
    

In [15]:
def upload_to_bucket(name):
    file_name=get_file_name(name)
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX,f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [16]:
def export_and_upload_bucket(data,name,pre):
    export_data(data,name,pre)
    upload_to_bucket(name)

In [17]:
export_and_upload_bucket(train,"train",preprocessor)

In [18]:
export_and_upload_bucket(val,"val",preprocessor)

In [19]:
export_and_upload_bucket(test,"test",preprocessor)

## 5.Model and Hyperparameter Tuning Set-up

In [20]:
session=sagemaker.Session()
region_name=session.boto_region_name

In [21]:
output_path=f"s3://{BUCKET_NAME}/model/output"

In [22]:
model=Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost",region_name,"1.7-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [23]:
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [24]:
hyperparameter_ranges={
    "eta":ContinuousParameter(0.05,0.2),
    "alpha":ContinuousParameter(0,1),
    "max_depth":IntegerParameter(3,5)
}

In [25]:
tuner=HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6.Data Channels

In [26]:
def get_data_channel(name):
    bucket_path=f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path,content_type="csv")

In [27]:
train_data_channel=get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f2ff62014e0>

In [28]:
val_data_channel=get_data_channel("val")

In [29]:
data_channels={
    "train":train_data_channel,
    "validation":val_data_channel
}

## 7.Train and Tune the Model

In [30]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...................................!


In [40]:
# tuner.best_estimator().deploy() ---->use this to deploy your model on sagemaker

## 8.Model Evaluation

In [31]:
best_model = xgb.Booster()
best_model.load_model("xgboost-model")

best_model




<xgboost.core.Booster at 0x7f2ff499f520>

In [63]:
def evaluate_model(name):
    file_name=get_file_name(name)
    data=pd.read_csv(file_name)

    X=xgb.DMatrix(data.iloc[:,1:])
    y=data.iloc[:,0].copy()
    pred=best_model.predict(X,validate_features=False)
    return r2_score(y,pred)

In [60]:
best_model.feature_names=None

In [64]:
evaluate_model("train")

0.0408138632774353

In [65]:
evaluate_model("val")

-0.04754304885864258

In [66]:
evaluate_model("test")

-0.05664968490600586