In [None]:
!pip install -U feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.2-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.2-py2.py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.0/375.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature-engine
Successfully installed feature-engine-1.8.2


# Feature Transformations

In [None]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
sklearn.set_config(transform_output="pandas")

In [None]:
warnings.filterwarnings("ignore")

In [None]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-15,Chennai,Kolkata,19:35:00,21:55:00,140,0,No Info,3597
1,Air India,2019-06-12,Delhi,Cochin,17:15:00,09:25:00,970,2,No Info,9417
2,Air India,2019-03-06,Delhi,Cochin,09:00:00,07:40:00,1360,1,No Info,14882
3,Jet Airways,2019-05-06,Kolkata,Banglore,20:25:00,18:00:00,1295,1,In-flight meal not included,9314
4,Jet Airways,2019-06-15,Delhi,Cochin,09:00:00,12:35:00,1655,1,In-flight meal not included,10262


In [None]:
# Separate the target from the predictors; the target is copied so it is
# independent of the `train` frame.
y_train = train["price"].copy()
X_train = train.drop(columns=["price"])

In [None]:
X_train.columns

Index(['airline', 'date_of_journey', 'source', 'destination', 'dep_time',
       'arrival_time', 'duration', 'total_stops', 'additional_info'],
      dtype='object')

# 1. airline

In [None]:
X_train.airline.value_counts()

Unnamed: 0_level_0,count
airline,Unnamed: 1_level_1
Jet Airways,233
Indigo,141
Air India,106
Multiple Carriers,63
Spicejet,40
Vistara,23
Air Asia,20
Goair,14


- EDA showed that the top three airlines each account for more than 15% of the flights, while every other airline accounts for less than 15%, so we will group the remaining airlines into a single category.

In [None]:
# Airline: fill gaps with the most frequent value, fold every airline below the
# 10% tolerance into "Other", then one-hot encode what is left.
airline_steps = [
	("imputer", SimpleImputer(strategy="most_frequent")),
	("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
air_transformer = Pipeline(airline_steps)

air_transformer.fit_transform(X_train[["airline"]])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Other
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
635,0.0,0.0,1.0,0.0
636,1.0,0.0,0.0,0.0
637,0.0,0.0,0.0,1.0
638,0.0,0.0,0.0,1.0


# 2. Date of Journey

In [None]:
# Calendar parts derived from the journey date; each is then rescaled by
# MinMaxScaler so the four columns share a common range.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

journey_date_parts = DatetimeFeatures(
	features_to_extract=feature_to_extract,
	yearfirst=True,
	format="mixed",
)
doj_transformer = Pipeline([
	("dt", journey_date_parts),
	("scaler", MinMaxScaler()),
])

doj_transformer.fit_transform(X_train[["date_of_journey"]])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,0.882353,0.833333,0.898305
1,1.000000,0.882353,0.333333,0.872881
2,0.000000,0.058824,0.333333,0.042373
3,0.666667,0.588235,0.000000,0.559322
4,1.000000,0.882353,0.833333,0.898305
...,...,...,...,...
635,0.000000,0.058824,0.333333,0.042373
636,0.000000,0.058824,0.333333,0.042373
637,0.000000,0.117647,0.166667,0.093220
638,1.000000,0.882353,0.333333,0.872881


# 3. source & destination

In [None]:
location_subset = X_train.loc[:, ["source", "destination"]]
location_subset

Unnamed: 0,source,destination
0,Chennai,Kolkata
1,Delhi,Cochin
2,Delhi,Cochin
3,Kolkata,Banglore
4,Delhi,Cochin
...,...,...
635,Delhi,Cochin
636,Delhi,Cochin
637,Kolkata,Banglore
638,Kolkata,Banglore


In [None]:
# Source / destination: group cities below the 10% tolerance into "Other",
# replace each label using the target (hence y_train is passed to fit), then
# apply a power transform to the encoded values.
location_steps = [
	("grouper", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", MeanEncoder()),
	("scaler", PowerTransformer()),
]
location_transformer = Pipeline(location_steps)

location_transformer.fit_transform(location_subset, y_train)

Unnamed: 0,source,destination
0,-1.983209,-0.078479
1,1.098537,1.046856
2,1.098537,1.046856
3,-0.459904,-0.521967
4,1.098537,1.046856
...,...,...
635,1.098537,1.046856
636,1.098537,1.046856
637,-0.459904,-0.521967
638,-0.459904,-0.521967


# 4. dep_time & arrival_time


In [None]:
time_subset = X_train.loc[:, ["dep_time", "arrival_time"]]
time_subset

Unnamed: 0,dep_time,arrival_time
0,19:35:00,21:55:00
1,17:15:00,09:25:00
2,09:00:00,07:40:00
3,20:25:00,18:00:00
4,09:00:00,12:35:00
...,...,...
635,13:00:00,18:50:00
636,05:55:00,07:40:00
637,19:35:00,22:05:00
638,07:35:00,19:25:00


In [None]:
# First view of the clock times: raw hour and minute, min-max scaled.
clock_parts = DatetimeFeatures(features_to_extract=["hour", "minute"])
time_pipe1 = Pipeline([
	("dt", clock_parts),
	("scaler", MinMaxScaler()),
])

time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.826087,0.636364,0.913043,1.000000
1,0.739130,0.272727,0.391304,0.454545
2,0.391304,0.000000,0.304348,0.727273
3,0.869565,0.454545,0.782609,0.000000
4,0.391304,0.000000,0.521739,0.636364
...,...,...,...,...
635,0.565217,0.000000,0.782609,0.909091
636,0.217391,1.000000,0.304348,0.727273
637,0.826087,0.636364,0.956522,0.090909
638,0.304348,0.636364,0.826087,0.454545


In [None]:
def part_of_day(X, morning=4, noon=12, eve=16, night=20):
	"""Label every time column of ``X`` with a coarse part of the day.

	Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
	The hour is then bucketed into half-open ranges:

	* ``[morning, noon)``  -> "morning"
	* ``[noon, eve)``      -> "afternoon"
	* ``[eve, night)``     -> "evening"
	* anything else        -> "night"

	Hours that match none of the ranges, including values that could not be
	turned into an hour (all comparisons are then false), fall into "night".

	Parameters
	----------
	X : DataFrame whose columns all hold time-like values.
	morning, noon, eve, night : hour boundaries of the buckets above.

	Returns
	-------
	DataFrame with the same index as ``X`` and one ``<col>_part_of_day``
	column per input column; the input columns themselves are dropped.
	"""
	source_cols = list(X.columns)
	bucket_names = ["morning", "afternoon", "evening"]

	labelled = {}
	for name in source_cols:
		hours = pd.to_datetime(X[name]).dt.hour
		# Left-closed, right-open ranges, checked in order.
		in_bucket = [
			(hours >= morning) & (hours < noon),
			(hours >= noon) & (hours < eve),
			(hours >= eve) & (hours < night),
		]
		labelled[f"{name}_part_of_day"] = np.select(in_bucket, bucket_names, default="night")

	return X.assign(**labelled).drop(columns=source_cols)

FunctionTransformer(func=part_of_day).fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,evening,night
1,evening,morning
2,morning,morning
3,night,evening
4,morning,afternoon
...,...,...
635,afternoon,evening
636,morning,morning
637,evening,night
638,morning,evening


In [None]:
# Second view of the clock times: bucket into part of day, encode each bucket
# with CountFrequencyEncoder, then min-max scale the encoded values.
day_part_labeller = FunctionTransformer(func=part_of_day)
time_pipe2 = Pipeline([
	("part", day_part_labeller),
	("encoder", CountFrequencyEncoder()),
	("scaler", MinMaxScaler()),
])

time_pipe2.fit_transform(time_subset)

Unnamed: 0,dep_time_part_of_day,arrival_time_part_of_day
0,0.114428,0.925
1,0.114428,1.000
2,1.000000,1.000
3,0.059701,0.825
4,1.000000,0.000
...,...,...
635,0.000000,0.825
636,1.000000,1.000
637,0.114428,0.925
638,1.000000,0.825


In [None]:

# Run both time pipelines on the same input and place their outputs side by
# side (hour/minute columns first, part-of-day columns second).
time_views = [
	("part1", time_pipe1),
	("part2", time_pipe2),
]
time_transformer = FeatureUnion(time_views)

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.826087,0.636364,0.913043,1.000000,0.114428,0.925
1,0.739130,0.272727,0.391304,0.454545,0.114428,1.000
2,0.391304,0.000000,0.304348,0.727273,1.000000,1.000
3,0.869565,0.454545,0.782609,0.000000,0.059701,0.825
4,0.391304,0.000000,0.521739,0.636364,1.000000,0.000
...,...,...,...,...,...,...
635,0.565217,0.000000,0.782609,0.909091,0.000000,0.825
636,0.217391,1.000000,0.304348,0.727273,1.000000,1.000
637,0.826087,0.636364,0.956522,0.090909,0.114428,0.925
638,0.304348,0.636364,0.826087,0.454545,1.000000,0.825


# 5. Duration

In [None]:
X_train.duration.describe()

Unnamed: 0,duration
count,640.0
mean,609.0
std,489.151096
min,75.0
25%,175.0
50%,467.5
75%,910.0
max,2295.0


In [None]:
# Duration: impute -> cap outliers -> standardise.
# The imputer has to run first: feature-engine's Winsorizer rejects missing
# values by default (missing_values="raise"), so with the capping step in front
# a NaN duration would raise before the median imputer ever saw it.
# Step names are unchanged; only their order differs.
duration_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="median")),
	# Cap values beyond 1.5 * IQR from the quartiles instead of dropping rows.
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("scaler", StandardScaler())
])

In [None]:
duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration
0,-0.963461
1,0.745028
2,1.547812
3,1.414014
4,2.155046
...,...
635,-0.531193
636,1.928620
637,-0.942877
638,0.209838


# 6. total_stops

In [None]:
X_train.total_stops.value_counts()

Unnamed: 0_level_0,count
total_stops,Unnamed: 1_level_1
1,344
0,223
2,72
3,1


In [None]:
def is_direct(X):
	"""Return a copy of ``X`` with an ``is_direct_flight`` 0/1 column added.

	The flag is 1 where ``total_stops`` equals 0 and 0 otherwise; the
	``total_stops`` column itself is kept.
	"""
	direct_flag = (X["total_stops"] == 0).astype(int)
	return X.assign(is_direct_flight=direct_flag)


# Total stops: fill gaps with the most frequent value, then append the
# is_direct_flight flag next to the original count.
# The second step used to be named "" (empty string); it now has a real name so
# it can be addressed through named_steps / set_params. Output column names do
# not depend on step names, so downstream columns are unchanged.
# is_direct relies on receiving a DataFrame, which the imputer returns because
# sklearn's transform_output is set to "pandas" earlier in the notebook.
total_stops_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("direct_flag", FunctionTransformer(func=is_direct))
])

total_stops_transformer.fit_transform(X_train.loc[:, ["total_stops"]])

Unnamed: 0,total_stops,is_direct_flight
0,0,1
1,2,0
2,1,0
3,1,0
4,1,0
...,...,...
635,1,0
636,2,0
637,0,1
638,1,0


# 7. Additional_info

In [None]:
X_train.additional_info.value_counts()

Unnamed: 0_level_0,count
additional_info,Unnamed: 1_level_1
No Info,493
In-flight meal not included,127
No check-in baggage included,18
Red-eye flight,1
Change airports,1


In [None]:
# Additional info, view 1: group labels below the 10% tolerance into "Other"
# and one-hot encode the remaining categories.
info_pipe1 = Pipeline([
	("group", RareLabelEncoder(replace_with="Other", tol=0.1, n_categories=2)),
	("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

info_pipe1.fit_transform(X_train[["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
635,0.0,1.0,0.0
636,0.0,1.0,0.0
637,0.0,1.0,0.0
638,0.0,1.0,0.0


In [None]:

def have_info(X):
	"""Return a copy of ``X`` with ``additional_info`` turned into a 0/1 flag.

	The column is overwritten in the copy: 0 where the value is exactly
	"No Info", 1 for every other value.
	"""
	carries_info = (X["additional_info"] != "No Info").astype(int)
	return X.assign(additional_info=carries_info)

In [None]:
# Combine both views of additional_info: the one-hot columns from info_pipe1
# followed by the single has-info flag from have_info.
info_flagger = FunctionTransformer(func=have_info)
info_union = FeatureUnion([
	("part1", info_pipe1),
	("part2", info_flagger),
])

In [None]:
# Missing additional_info becomes the literal "unknown" before the union runs,
# so both branches of the union see a fully populated column.
missing_info_filler = SimpleImputer(strategy="constant", fill_value="unknown")
info_transformer = Pipeline([
	("imputer", missing_info_filler),
	("union", info_union),
])

info_transformer.fit_transform(X_train[["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,0.0,1.0,0.0,0
1,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0
3,1.0,0.0,0.0,1
4,1.0,0.0,0.0,1
...,...,...,...,...
635,0.0,1.0,0.0,0
636,0.0,1.0,0.0,0
637,0.0,1.0,0.0,0
638,0.0,1.0,0.0,0


# Column Transformation

In [None]:
# One branch per raw column (or column pair): (prefix, transformer, columns).
# The prefix is what appears before "__" in the output column names.
feature_branches = [
	("air", air_transformer, ["airline"]),
	("doj", doj_transformer, ["date_of_journey"]),
	("location", location_transformer, ["source", "destination"]),
	("time", time_transformer, ["dep_time", "arrival_time"]),
	("dur", duration_transformer, ["duration"]),
	("stops", total_stops_transformer, ["total_stops"]),
	("info", info_transformer, ["additional_info"]),
]
column_transformer = ColumnTransformer(
	transformers=feature_branches,
	remainder="passthrough",  # any column not listed above is kept as-is
)

# y_train is passed because the location branch encodes with the target.
column_transformer.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_week,doj__date_of_journey_day_of_year,location__source,location__destination,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,1.0,0.0,0.0,1.000000,0.882353,0.833333,0.898305,-1.983209,-0.078479,0.826087,0.636364,0.913043,1.000000,0.114428,0.925,-0.963461,0,1,0.0,1.0,0.0,0
1,1.0,0.0,0.0,0.0,1.000000,0.882353,0.333333,0.872881,1.098537,1.046856,0.739130,0.272727,0.391304,0.454545,0.114428,1.000,0.745028,2,0,0.0,1.0,0.0,0
2,1.0,0.0,0.0,0.0,0.000000,0.058824,0.333333,0.042373,1.098537,1.046856,0.391304,0.000000,0.304348,0.727273,1.000000,1.000,1.547812,1,0,0.0,1.0,0.0,0
3,0.0,0.0,1.0,0.0,0.666667,0.588235,0.000000,0.559322,-0.459904,-0.521967,0.869565,0.454545,0.782609,0.000000,0.059701,0.825,1.414014,1,0,1.0,0.0,0.0,1
4,0.0,0.0,1.0,0.0,1.000000,0.882353,0.833333,0.898305,1.098537,1.046856,0.391304,0.000000,0.521739,0.636364,1.000000,0.000,2.155046,1,0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,1.0,0.0,0.000000,0.058824,0.333333,0.042373,1.098537,1.046856,0.565217,0.000000,0.782609,0.909091,0.000000,0.825,-0.531193,1,0,0.0,1.0,0.0,0
636,1.0,0.0,0.0,0.0,0.000000,0.058824,0.333333,0.042373,1.098537,1.046856,0.217391,1.000000,0.304348,0.727273,1.000000,1.000,1.928620,2,0,0.0,1.0,0.0,0
637,0.0,0.0,0.0,1.0,0.000000,0.117647,0.166667,0.093220,-0.459904,-0.521967,0.826087,0.636364,0.956522,0.090909,0.114428,0.925,-0.942877,0,1,0.0,1.0,0.0,0
638,0.0,0.0,0.0,1.0,1.000000,0.882353,0.333333,0.872881,-0.459904,-0.521967,0.304348,0.636364,0.826087,0.454545,1.000000,0.825,0.209838,1,0,0.0,1.0,0.0,0


# Feature Selection

In [None]:
# Small, seeded random forest used to score each feature on its own.
forest_settings = {"n_estimators": 10, "max_depth": 3, "random_state": 42}
estimator = RandomForestRegressor(**forest_settings)

# Score every feature individually with R^2, using 0.1 as the selection
# threshold.
selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	threshold=0.1,
	scoring="r2",
)

In [None]:
# Full preprocessing: build all engineered columns, then apply the
# single-feature performance selector to them.
preprocessing_steps = [
	("ct", column_transformer),
	("selector", selector),
]
preprocessor = Pipeline(preprocessing_steps)

preprocessor.fit_transform(X_train, y_train)

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,doj__date_of_journey_month,doj__date_of_journey_week,location__destination,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,1.000000,0.882353,-0.078479,-0.963461,0,1
1,0.0,0.0,1.000000,0.882353,1.046856,0.745028,2,0
2,0.0,0.0,0.000000,0.058824,1.046856,1.547812,1,0
3,0.0,1.0,0.666667,0.588235,-0.521967,1.414014,1,0
4,0.0,1.0,1.000000,0.882353,1.046856,2.155046,1,0
...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.000000,0.058824,1.046856,-0.531193,1,0
636,0.0,0.0,0.000000,0.058824,1.046856,1.928620,2,0
637,0.0,0.0,0.000000,0.117647,-0.521967,-0.942877,0,1
638,0.0,0.0,1.000000,0.882353,-0.521967,0.209838,1,0


In [None]:
feature_performances = preprocessor.named_steps["selector"].feature_performance_
feature_performances

{'air__airline_Air India': -0.0010262784069215252,
 'air__airline_Indigo': 0.13260172869516285,
 'air__airline_Jet Airways': 0.17652245626535593,
 'air__airline_Other': 0.01881605277909452,
 'doj__date_of_journey_month': 0.12074908079303281,
 'doj__date_of_journey_week': 0.17450045138624345,
 'doj__date_of_journey_day_of_week': 0.0025764147486105524,
 'doj__date_of_journey_day_of_year': -0.38098312909749854,
 'location__source': 0.09468784744998789,
 'location__destination': 0.11213114704148575,
 'time__dep_time_hour': -0.028150314145945377,
 'time__dep_time_minute': -0.025511461952497833,
 'time__arrival_time_hour': 0.02096038969497775,
 'time__arrival_time_minute': 0.025842126948130817,
 'time__dep_time_part_of_day': -0.011484676654520984,
 'time__arrival_time_part_of_day': 0.005474803717362124,
 'dur__duration': 0.3660714597930073,
 'stops__total_stops': 0.3728892015161273,
 'stops__is_direct_flight': 0.35769118498572544,
 'info__additional_info_In-flight meal not included': -0.0033

In [None]:
# Rank features from lowest to highest score; a dict preserves insertion
# order, so the rebuilt mapping displays in ranked order. The sort is stable,
# so features with equal scores keep their original relative order.
ranked_names = sorted(feature_performances, key=feature_performances.get)
sorted_feat_imp = {name: feature_performances[name] for name in ranked_names}
sorted_feat_imp

{'doj__date_of_journey_day_of_year': -0.38098312909749854,
 'time__dep_time_hour': -0.028150314145945377,
 'time__dep_time_minute': -0.025511461952497833,
 'time__dep_time_part_of_day': -0.011484676654520984,
 'info__additional_info_No Info': -0.006480481871296869,
 'info__additional_info': -0.006480481871296869,
 'info__additional_info_In-flight meal not included': -0.003315856180634199,
 'air__airline_Air India': -0.0010262784069215252,
 'doj__date_of_journey_day_of_week': 0.0025764147486105524,
 'time__arrival_time_part_of_day': 0.005474803717362124,
 'air__airline_Other': 0.01881605277909452,
 'time__arrival_time_hour': 0.02096038969497775,
 'info__additional_info_Other': 0.023303321342738936,
 'time__arrival_time_minute': 0.025842126948130817,
 'location__source': 0.09468784744998789,
 'location__destination': 0.11213114704148575,
 'doj__date_of_journey_month': 0.12074908079303281,
 'air__airline_Indigo': 0.13260172869516285,
 'doj__date_of_journey_week': 0.17450045138624345,
 'ai