## 1. Import Libraries

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [2]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.1/357.1 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.0


In [4]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

#Import required libraries from Sagemaker

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)


## 2. Display Settings

In [5]:
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output="pandas")
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [7]:
train = pd.read_csv("train_final.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-03-09,Banglore,Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-06-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-03-12,Banglore,Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6689,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6690,Air India,2019-05-01,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6691,Jet Airways,2019-06-01,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6692,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [8]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Indigo,2019-06-27,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
4,Indigo,2019-05-06,Kolkata,Banglore,15:15:00,17:45:00,150,0.0,No Info,4804
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Multiple Carriers,2019-06-27,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7155
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


In [9]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Delhi,Cochin,20:25:00,01:30:00,305,1.0,No Info,5054
1,Multiple Carriers,2019-06-12,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,9646
2,Jet Airways,2019-03-12,Banglore,Delhi,22:55:00,15:15:00,980,1.0,In-flight meal not included,11087
3,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587
4,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,04:25:00,1760,2.0,No Info,16704
...,...,...,...,...,...,...,...,...,...,...
1669,Spicejet,2019-05-01,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
1670,Indigo,2019-05-01,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5069
1671,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35:00,425,2.0,In-flight meal not included,15544
1672,Jet Airways,2019-06-12,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,3210


## 4. Preprocessing Operations

In [10]:
# airline
airline_transformer = Pipeline(steps=[
                    
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('grouper', RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown="ignore"))   
    
])


#doj

features_to_extract = ["month", "day_of_week", "week", "day_of_year"]
doj_transformer = Pipeline(steps=[
        ("dt", DatetimeFeatures(features_to_extract=features_to_extract, yearfirst=True)),
        ("scaler", MinMaxScaler())      
])

# source & destination

location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
    
])

def is_north(X):
	columns = X.columns.to_list()
	north_cities = ["Delhi", "Kolkota"]
	return (
		X
		.assign(**{
			f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
			for col in columns
		})
		.drop(columns=columns)
	)

location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
     ])

# dep_time & arrival_time

time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
    
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
    })              
    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day":np.select(
                [X_temp.loc[:,col].between(morning, noon, inclusive ="left"),
                 X_temp.loc[:,col].between(noon, eve, inclusive ="left"),
                 X_temp.loc[:,col].between(eve, night, inclusive ="left")],
                ["morning", "noon", "evening"],
                default = "night"
            )
            for col in columns
        })
        .drop(columns=columns)
        
    )
    
    

time_pipe2 = Pipeline(steps=[
	("part", FunctionTransformer(func=part_of_day)),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler())
])


time_transformer = FeatureUnion([
    ("time_part1", time_pipe1),
    ("time_part2", time_pipe2)
])


# duration

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin): # 
	def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
		self.variables = variables
		self.percentiles = percentiles
		self.gamma = gamma


	def fit(self, X, y=None):
		if not self.variables:
			self.variables = X.select_dtypes(include="number").columns.to_list()

		self.reference_values_ = {
			col: (
				X
				.loc[:, col]
				.quantile(self.percentiles)
				.values
				.reshape(-1, 1)
			)
			for col in self.variables
		}

		return self


	def transform(self, X):
		objects = []
		for col in self.variables:
			columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
			obj = pd.DataFrame(
				data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
				columns=columns
			)
			objects.append(obj)
		return pd.concat(objects, axis=1)

def duration_category(X, short=180, med=400):
	return (
		X
		.assign(duration_cat=np.select([X.duration.lt(short),
									    X.duration.between(short, med, inclusive="left")],
									   ["short", "medium"],
									   default="long"))
		.drop(columns="duration")
	)

def is_over(X, value=1000):
	return (
		X
		.assign(**{
			f"duration_over_{value}": X.duration.ge(value).astype(int)
		})
		.drop(columns="duration")
	)

duration_pipe1 = Pipeline(steps=[
	("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_category)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])


# total_stops

def is_direct(X):
	return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("", FunctionTransformer(func=is_direct))
])

# additional_info

info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])


# column transformer

column_transformer = ColumnTransformer(transformers=[
	("air", airline_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", 'destination']),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector

estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

# preprocessor

preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])



In [11]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [12]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.176471,0.169492,-0.857629,-0.857629,-0.364291,2.0,0,-0.033600,1.0,0
1,0.0,1.0,0.0,0.235294,0.220339,1.065619,1.065619,-0.364291,2.0,0,0.046768,1.0,0
2,0.0,0.0,1.0,0.058824,0.067797,-0.857629,-0.857629,2.372693,0.0,0,-0.917646,0.0,1
3,0.0,0.0,0.0,0.882353,0.872881,-0.203923,-0.203923,-0.364291,2.0,0,-0.174244,1.0,0
4,0.0,1.0,0.0,0.117647,0.093220,-0.857629,-0.857629,-0.364291,2.0,0,-0.214428,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6689,0.0,1.0,0.0,0.176471,0.169492,1.065619,1.065619,-0.364291,2.0,1,2.598445,2.0,0
6690,0.0,0.0,0.0,0.529412,0.516949,-0.203923,-0.203923,-0.364291,2.0,0,-0.174244,1.0,0
6691,0.0,1.0,0.0,0.764706,0.779661,1.065619,1.065619,-0.364291,1.0,0,-0.666496,1.0,0
6692,0.0,0.0,1.0,1.000000,0.974576,1.065619,1.065619,-0.364291,1.0,0,-0.606221,1.0,0


## 4. Preprocess Data and Upload to Bucket

In [13]:
BUCKET_NAME = "sagemaker-flight-price-1"

DATA_PREFIX = "data"

In [14]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [16]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()    # For sagemaker target column should be first in data file
        .join(X_pre)
        .to_csv(file_name, index=False, header=False) # For Sagemaker hearder should be false
    )

In [17]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [18]:
def export_and_upload_bucket(data, name, pre): # data - dataframe, name - file name, pre- preprocessor
    export_data(data, name, pre)
    upload_to_bucket(name)


In [19]:
export_and_upload_bucket(train,"train",preprocessor) # Save train data in sagemaker format

In [20]:
export_and_upload_bucket(test, "test", preprocessor) # Save test data in sagemaker format

In [21]:
export_and_upload_bucket(val, "val", preprocessor)  # Save val data in sagemaker format

In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093 entries, 0 to 2092
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          2093 non-null   object 
 1   date_of_journey  2093 non-null   object 
 2   source           2093 non-null   object 
 3   destination      2093 non-null   object 
 4   dep_time         2093 non-null   object 
 5   arrival_time     2093 non-null   object 
 6   duration         2093 non-null   int64  
 7   total_stops      2093 non-null   float64
 8   additional_info  2093 non-null   object 
 9   price            2093 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 163.6+ KB


## 5. Model and Hyperparameter Tuning Set-up

In [26]:
# To connect to sagemaker, set session and region
session = sagemaker.Session()



In [27]:
region_name = session.boto_region_name
region_name

'ap-southeast-2'

In [28]:
# Set path for model output storage
output_path = f"s3://{BUCKET_NAME}/model/output"

In [30]:
# Define Model

model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [None]:
sagemaker.image_uris.retrieve()

In [29]:
sagemaker.get_execution_role()

'arn:aws:iam::211125557087:role/flight-sagemaker-role'

In [31]:
model

<sagemaker.estimator.Estimator at 0x7efcd42d6e60>

In [32]:
model.set_hyperparameters(
    objective="reg:linear",  # MSE
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,  # random sample 80% of rows to avoid over-fitting
    colsample_bytree=0.8,   # random sample 80% of columns to avoid over-fitting
    alpha=0.1  # L2 regularization
)

In [33]:
# Set hyperparameter ranges 
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [34]:
# Define tuner object
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",   # Hyperparameter tuning happens on validation dataset. 
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 6. Data Channels

In [35]:
# Data channels are basically path to train/test/val data sets. 
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [36]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7efcd402f940>

In [37]:
val_data_channel = get_data_channel("val")

In [38]:
# Combine both the channels. Train on train dataset and validation on val dataset
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the 

Check training Job in Amazon SageMaker  --> Training jobs

Check Hyperparameter tuning Job in Amazon SageMaker --> Hyperparameter tuning jobs

Model is stored in bucket/model/output folder


In [39]:
tuner.fit(data_channels)  # In sagemaker we provide data channels instead of X_train/y_train

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


In [None]:
# To Deploy on stagemaker
# tuner.best_estimator().deploy()

## 8. Model Evaluation

In [41]:
# Read the best model ( Download model from s3 and upload to Notebook instance)
with open("xgboost-model", "rb") as f:  # rb - read binary
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7efcc9873400>

In [43]:
pd.read_csv("train-pre.csv") # This is how sagemaker requires data. Target is first column, rest are features

Unnamed: 0,7832,0.0,1.0,0.0.1,0.17647058823529405,0.1694915254237288,-0.8576292687747612,-0.8576292687747612.1,-0.36429129774092917,2.0,0,-0.03360013086605084,1.0.1,0.1
0,6540,0.0,1.0,0.0,0.235294,0.220339,1.065619,1.065619,-0.364291,2.0,0,0.046768,1.0,0
1,7305,0.0,0.0,1.0,0.058824,0.067797,-0.857629,-0.857629,2.372693,0.0,0,-0.917646,0.0,1
2,8366,0.0,0.0,0.0,0.882353,0.872881,-0.203923,-0.203923,-0.364291,2.0,0,-0.174244,1.0,0
3,11087,0.0,1.0,0.0,0.117647,0.093220,-0.857629,-0.857629,-0.364291,2.0,0,-0.214428,1.0,0
4,4544,0.0,1.0,0.0,0.411765,0.432203,-0.857629,-0.857629,-0.364291,1.0,0,-0.897554,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6688,11093,0.0,1.0,0.0,0.176471,0.169492,1.065619,1.065619,-0.364291,2.0,1,2.598445,2.0,0
6689,8891,0.0,0.0,0.0,0.529412,0.516949,-0.203923,-0.203923,-0.364291,2.0,0,-0.174244,1.0,0
6690,10262,0.0,1.0,0.0,0.764706,0.779661,1.065619,1.065619,-0.364291,1.0,0,-0.666496,1.0,0
6691,6152,0.0,0.0,1.0,1.000000,0.974576,1.065619,1.065619,-0.364291,1.0,0,-0.606221,1.0,0


In [45]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:]) # All rows and all columns starting from 1. Specific format for xgb
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [46]:
evaluate_model("train")

0.17020941513774768

In [47]:
evaluate_model("val")

0.1097596087009417

In [48]:
evaluate_model("test")

0.16687379243318

In [49]:
pwd

'/home/ec2-user/SageMaker'

In [50]:
ls

feature-engineering.ipynb  test.csv         train-pre.csv  xgboost-model
[0m[01;34mlost+found[0m/                test-pre.csv     val.csv
model_training.ipynb       train_final.csv  val-pre.csv


In [62]:
def upload_notebook_to_bucket(name):
    #file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join("notebooks", f"{name}"))
        .upload_file(name)
    )

In [63]:
upload_notebook_to_bucket("model_training.ipynb")