## **Flight Price Prediction (Feature Engineering)**

### **1. Importing Libraries**

In [337]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

#### **2. Importing Datasets**

In [338]:
df_train = pd.read_excel('Data_Train.xlsx')
df_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [339]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [340]:
df_test = pd.read_excel('Test_set.xlsx')
df_test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [341]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          2671 non-null   object
 1   Date_of_Journey  2671 non-null   object
 2   Source           2671 non-null   object
 3   Destination      2671 non-null   object
 4   Route            2671 non-null   object
 5   Dep_Time         2671 non-null   object
 6   Arrival_Time     2671 non-null   object
 7   Duration         2671 non-null   object
 8   Total_Stops      2671 non-null   object
 9   Additional_Info  2671 non-null   object
dtypes: object(10)
memory usage: 208.8+ KB


In [342]:
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0


In [343]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13354 entries, 0 to 2670
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.2+ MB


### **3. Feature Engineering Process**

##### **3.1 Handling Date of Journey**

In [344]:
df['Date'] = df['Date_of_Journey'].apply(lambda x:x.split('/')[0])
df['Month'] = df['Date_of_Journey'].apply(lambda x:x.split('/')[1])
df['Year'] = df['Date_of_Journey'].apply(lambda x:x.split('/')[2])

df['Date'] = df['Date'].astype(int)
df['Month'] = df['Month'].astype(int)
df['Year'] = df['Year'].astype(int)

df.drop('Date_of_Journey', axis=1, inplace=True)

##### **3.2 Handling Arrival Time**

In [345]:
df['Arrival_Time'] = df['Arrival_Time'].apply(lambda x:x.split(' ')[0])
df['Arrival_hour'] = df['Arrival_Time'].apply(lambda x:x.split(':')[0])
df['Arrival_min'] = df['Arrival_Time'].apply(lambda x:x.split(':')[1])\

df['Arrival_hour'] = df['Arrival_hour'].astype(int)
df['Arrival_min'] = df['Arrival_min'].astype(int)

df.drop('Arrival_Time', axis=1, inplace=True)

##### **3.3 Handling Departure time**

In [346]:
df['Dept_hour'] = df['Dep_Time'].apply(lambda x:x.split(':')[0]) 
df['Dept_min'] = df['Dep_Time'].apply(lambda x:x.split(':')[1])

df['Dept_hour'] = df['Dept_hour'].astype(int)
df['Dept_min'] = df['Dept_min'].astype(int)

df.drop('Dep_Time', inplace=True, axis=1)

##### **3.4 Handling Total Stops**

In [347]:
df['Total_Stops'] = df['Total_Stops'].map({'non-stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4})

df['Total_Stops'].fillna(4, inplace=True)

df.drop('Route', axis=1, inplace=True)

df['Total_Stops'] = df['Total_Stops'].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Total_Stops'].fillna(4, inplace=True)


##### **3.5 Handling Duration**

In [348]:
df['Duration_hour'] = df['Duration'].apply(lambda x:x.split(' ')[0].split('h')[0])
df['Duration_min'] = df['Duration'].str.split(' ').str[1].str.split('m').str[0]

df['Duration_min'].fillna(0, inplace=True)

df.drop(6474, axis=0, inplace=True)
df.drop(2660, axis=0, inplace=True)

df['Duration_hour'] = df['Duration_hour'].astype(int)
df['Duration_min'] = df['Duration_min'].astype(int)

df.drop('Duration', axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration_min'].fillna(0, inplace=True)


##### **3.6 Handling Categorical Features**

In [349]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
encoder = OrdinalEncoder()

object_features = df.select_dtypes(include=['object']).columns.tolist()

for col in object_features:
    df[col] = encoder.fit_transform(df[[col]])

In [350]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min,Duration_hour,Duration_min
0,3.0,0.0,5.0,0,8.0,3897.0,24,3,2019,1,10,22,20,2,50
1,1.0,3.0,0.0,2,8.0,7662.0,1,5,2019,13,15,5,50,7,25
2,4.0,2.0,1.0,2,8.0,13882.0,9,6,2019,4,25,9,25,19,0
3,3.0,3.0,0.0,1,8.0,6218.0,12,5,2019,23,30,18,5,5,25
4,3.0,0.0,5.0,1,8.0,13302.0,1,3,2019,21,35,16,50,4,45


In [351]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13351 entries, 0 to 2670
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13351 non-null  float64
 1   Source           13351 non-null  float64
 2   Destination      13351 non-null  float64
 3   Total_Stops      13351 non-null  int32  
 4   Additional_Info  13351 non-null  float64
 5   Price            10681 non-null  float64
 6   Date             13351 non-null  int32  
 7   Month            13351 non-null  int32  
 8   Year             13351 non-null  int32  
 9   Arrival_hour     13351 non-null  int32  
 10  Arrival_min      13351 non-null  int32  
 11  Dept_hour        13351 non-null  int32  
 12  Dept_min         13351 non-null  int32  
 13  Duration_hour    13351 non-null  int32  
 14  Duration_min     13351 non-null  int32  
dtypes: float64(5), int32(10)
memory usage: 1.1 MB
