Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings(action="ignore")
plt.style.use(['seaborn-bright','dark_background'])

In [2]:
train_df = pd.read_excel('Data_Train.xlsx')
train_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [3]:
train_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [4]:
train_df.rename(columns={'Airline':'airline',
                         'Date_of_Journey':'date',
                         'Source':'source',
                         'Destination':'destination',
                         'Route':'route',
                         'Dep_Time':'dept_time',
                         'Arrival_Time':'arrival_time',
                         'Duration':'duration',
                         'Total_Stops':'total_stops',
                         'Additional_Info':'additional_info',
                         'Price':'price'}, inplace=True)

In [5]:
train_df.isnull().sum()

airline            0
date               0
source             0
destination        0
route              1
dept_time          0
arrival_time       0
duration           0
total_stops        1
additional_info    0
price              0
dtype: int64

In [6]:
train_df.shape

(10683, 11)

Drop null and duplicated values

In [7]:
train_df.dropna(axis=0, inplace=True)

In [8]:
train_df.drop_duplicates(keep='first',inplace=True)

In [9]:
train_df.shape

(10462, 11)

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10462 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   airline          10462 non-null  object
 1   date             10462 non-null  object
 2   source           10462 non-null  object
 3   destination      10462 non-null  object
 4   route            10462 non-null  object
 5   dept_time        10462 non-null  object
 6   arrival_time     10462 non-null  object
 7   duration         10462 non-null  object
 8   total_stops      10462 non-null  object
 9   additional_info  10462 non-null  object
 10  price            10462 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 980.8+ KB


Feature engineering

In [11]:
train_df['total_stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [12]:
stops_map = {'non-stop':0,
             '2 stops':2,
             '1 stop':1, 
             '3 stops':3,
             '4 stops':4}

In [13]:
train_df['total_stops'] = train_df['total_stops'].map(stops_map)

In [14]:
train_df.head()

Unnamed: 0,airline,date,source,destination,route,dept_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302


In [15]:
train_df.shape

(10462, 11)

In [16]:
def date_convert(value):
    try:
        date = datetime.strptime(value, "%d/%m/%Y")
    except:
        date = 0
    return date

In [17]:
train_df['date'] = train_df['date'].apply(date_convert)
train_df = train_df[train_df['date']!=0]

In [18]:
train_df.shape

(10462, 11)

In [19]:
train_df

Unnamed: 0,airline,date,source,destination,route,dept_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,4107
10679,Air India,2019-04-27,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,4145
10680,Jet Airways,2019-04-27,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,0,No info,7229
10681,Vistara,2019-03-01,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,12648


In [20]:
train_df['date'] = train_df['date'].astype('datetime64')

In [21]:
def duration_in_min(value):
    if len(value)<=3 and value[-1]=='m':
        minutes = int(value.split('m')[0])
    elif len(value)<=3 and value[-1]=='h':
         minutes = int(value.split('h')[0])*60
    else:
        val1 = value.split('h')[0]
        val2 = value.split('h ')[1]
        val3 = val2.split('m')[0]
        minutes = int(val1)*60+int(val3)
    return minutes

In [22]:
train_df['duration_in_min'] = train_df['duration'].apply(duration_in_min)

In [23]:
train_df.head()

Unnamed: 0,airline,date,source,destination,route,dept_time,arrival_time,duration,total_stops,additional_info,price,duration_in_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,170
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,445
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,13882,1140
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,325
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,285


In [24]:
for i in ['dept_time', 'arrival_time']:
    train_df[i]=pd.to_datetime(train_df[i])

In [25]:
train_df['journey_day'] = train_df['date'].dt.day
train_df['journey_month'] = train_df['date'].dt.month

In [26]:
train_df['dept_hour'] = train_df['dept_time'].dt.hour
train_df['dept_min'] = train_df['dept_time'].dt.minute

In [27]:
train_df['arrv_hour'] = train_df['arrival_time'].dt.hour
train_df['arrv_min'] = train_df['arrival_time'].dt.minute

In [28]:
train_df.head()

Unnamed: 0,airline,date,source,destination,route,dept_time,arrival_time,duration,total_stops,additional_info,price,duration_in_min,journey_day,journey_month,dept_hour,dept_min,arrv_hour,arrv_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2021-10-23 22:20:00,2021-03-22 01:10:00,2h 50m,0,No info,3897,170,24,3,22,20,1,10
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,2021-10-23 05:50:00,2021-10-23 13:15:00,7h 25m,2,No info,7662,445,1,5,5,50,13,15
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,2021-10-23 09:25:00,2021-06-10 04:25:00,19h,2,No info,13882,1140,9,6,9,25,4,25
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,2021-10-23 18:05:00,2021-10-23 23:30:00,5h 25m,1,No info,6218,325,12,5,18,5,23,30
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,2021-10-23 16:50:00,2021-10-23 21:35:00,4h 45m,1,No info,13302,285,1,3,16,50,21,35


In [29]:
train_df.drop(columns=['date','route','dept_time','arrival_time','duration'],inplace=True)

In [30]:
train_df.head()

Unnamed: 0,airline,source,destination,total_stops,additional_info,price,duration_in_min,journey_day,journey_month,dept_hour,dept_min,arrv_hour,arrv_min
0,IndiGo,Banglore,New Delhi,0,No info,3897,170,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,2,No info,7662,445,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,2,No info,13882,1140,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,1,No info,6218,325,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,1,No info,13302,285,1,3,16,50,21,35


In [31]:
train_df['destination'] = train_df['destination'].replace("New Delhi","Delhi")
train_df['additional_info'] = train_df['additional_info'].replace('No info','No Info')

In [32]:
train_df.to_csv('final_df.csv',index=False)

In [33]:
train_df = pd.read_csv('final_df.csv')
train_df.head()

Unnamed: 0,airline,source,destination,total_stops,additional_info,price,duration_in_min,journey_day,journey_month,dept_hour,dept_min,arrv_hour,arrv_min
0,IndiGo,Banglore,Delhi,0,No Info,3897,170,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,2,No Info,7662,445,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,2,No Info,13882,1140,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,1,No Info,6218,325,12,5,18,5,23,30
4,IndiGo,Banglore,Delhi,1,No Info,13302,285,1,3,16,50,21,35


In [34]:
train_df['destination'].unique()

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

Model trianing and evaluation

In [35]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [36]:
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse=False,drop='first'),['airline','source','destination','additional_info'])
]
,remainder='passthrough')

In [37]:
X = train_df.drop(columns=['price'])
y = train_df['price']

In [42]:
from sklearn.model_selection import train_test_split
x_train , x_valid, y_train, y_valid = train_test_split(X,y,test_size=0.25)

In [39]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import r2_score, mean_absolute_error

In [40]:
models = [LinearRegression(),
          Ridge(),
         Lasso(),
         DecisionTreeRegressor(),
         ExtraTreesRegressor(),
         RandomForestRegressor(),
         GradientBoostingRegressor(),
         SVR(),
         KNeighborsRegressor(),
         XGBRegressor(),
         GaussianNB(),
         BernoulliNB()]

In [44]:
for i in models:
    pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',i)
    ])
    print(i)
    pipe.fit(x_train,y_train)
    y_pred = pipe.predict(x_valid)
    print("R2 = ",r2_score(y_valid,y_pred))
    print("MAE = ",mean_absolute_error(y_valid,y_pred))
    print()

LinearRegression()
R2 =  0.6660716893670242
MAE =  1778.616462064187

Ridge()
R2 =  0.6660826597554637
MAE =  1777.3547329514865

Lasso()
R2 =  0.6660986042255411
MAE =  1777.5147459005057

DecisionTreeRegressor()
R2 =  0.8213316991012578
MAE =  770.3625764525993

ExtraTreesRegressor()
R2 =  0.8940854070626029
MAE =  636.8248254332314

RandomForestRegressor()
R2 =  0.870844693448803
MAE =  702.0486258828455

GradientBoostingRegressor()
R2 =  0.8196494798108406
MAE =  1254.2231489349933

SVR()
R2 =  0.05940699820111561
MAE =  3308.210502404541

KNeighborsRegressor()
R2 =  0.8177382251741699
MAE =  1036.572629969419

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constrai

In [45]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor())
    ])
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_valid)
print(r2_score(y_valid,y_pred))
print(mean_absolute_error(y_valid,y_pred))

0.8840707531141611
794.2338830323758


In [46]:
import pickle

In [47]:
pickle.dump(pipe,open('pipe.pkl','wb'))

In [48]:
model = pickle.load(open('pipe.pkl','rb'))

In [49]:
x_train.columns

Index(['airline', 'source', 'destination', 'total_stops', 'additional_info',
       'duration_in_min', 'journey_day', 'journey_month', 'dept_hour',
       'dept_min', 'arrv_hour', 'arrv_min'],
      dtype='object')

In [50]:
train_df['airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [51]:
train_df['source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [52]:
train_df['destination'].unique()

array(['Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Hyderabad'],
      dtype=object)

In [53]:
train_df['additional_info'].unique()

array(['No Info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [54]:
airline = 'IndiGo'
source =  'Delhi'
destination = 'Banglore'
total_stops = 1
additional_info = 'No Info'
duration_in_min = 300
journey_day = 15
journey_month = 5
dept_hour = 11
dept_min = 50
arrv_hour = 15
arrv_min = 34

In [55]:
data = pd.DataFrame({'airline':[airline], 'source':[source], 'destination':[destination],
                     'total_stops':[total_stops], 'additional_info':[additional_info],
       'duration_in_min':[duration_in_min], 'journey_day':[journey_day], 'journey_month':[journey_month],
                     'dept_hour':[dept_hour],'dept_min':[dept_min],
                     'arrv_hour':[arrv_hour], 'arrv_min':[arrv_min]})

In [56]:
data

Unnamed: 0,airline,source,destination,total_stops,additional_info,duration_in_min,journey_day,journey_month,dept_hour,dept_min,arrv_hour,arrv_min
0,IndiGo,Delhi,Banglore,1,No Info,300,15,5,11,50,15,34


In [57]:
pred = float(model.predict(data))
pred1 = pred-(pred*0.15)
pred2 = pred+(pred*0.15)

In [58]:
pred

6089.314453125

In [59]:
pred1,pred2

(5175.91728515625, 7002.71162109375)