## Predict fare of Airline tickets using machine learning

In this project we will predict tickets price of airline using Machine Learning. We're using Kaggle's Flight Price Prediction data. 

In [2]:
# Import all the needed modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
%matplotlib inline

### Data


In [8]:
# Load data
train_data=pd.read_excel(r"C:\Users\DELL\Downloads\Data_Train.xlsx")

In [9]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [10]:
train_data.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [12]:
train_data.shape

(10683, 11)

**So our training data has 10683 rows and have 10 feature variables and 1 label or target column**

### Data Preprocessing

In [14]:
# Check for missing values 
train_data.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [15]:
# drop that missing value row
train_data.dropna(inplace=True)
train_data.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [16]:
train_data.shape

(10682, 11)

In [17]:
train_data.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2h 25m,non-stop,No info,3873
6,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15h 30m,1 stop,In-flight meal not included,11087
7,Jet Airways,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21h 5m,1 stop,No info,22270
8,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25h 30m,1 stop,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7h 50m,1 stop,No info,8625


In [18]:
# Let's change datatypes of Date_of_Journey to datetime datatype
train_data["Date_of_Journey"]=pd.to_datetime(train_data["Date_of_Journey"])

In [19]:
train_data.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [20]:
def change_to_datetime(col):
    train_data[col]=pd.to_datetime(train_data[col])

In [22]:
for i in ["Dep_Time","Arrival_Time"]:
    change_to_datetime(i)
    

In [23]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10682 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Airline          10682 non-null  object        
 1   Date_of_Journey  10341 non-null  datetime64[ns]
 2   Source           10682 non-null  object        
 3   Destination      10682 non-null  object        
 4   Route            10682 non-null  object        
 5   Dep_Time         10682 non-null  datetime64[ns]
 6   Arrival_Time     10682 non-null  datetime64[ns]
 7   Duration         10682 non-null  object        
 8   Total_Stops      10682 non-null  object        
 9   Additional_Info  10682 non-null  object        
 10  Price            10682 non-null  int64         
dtypes: datetime64[ns](3), int64(1), object(7)
memory usage: 1001.4+ KB


In [24]:
train_data["Journey_day"]=train_data["Date_of_Journey"].dt.day
train_data["Journey_month"]=train_data["Date_of_Journey"].dt.month

In [27]:
train_data.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2022-01-27 22:20:00,2022-03-22 01:10:00,2h 50m,non-stop,No info,3897,24.0,3.0
1,Air India,2019-01-05,Kolkata,Banglore,CCU → IXR → BBI → BLR,2022-01-27 05:50:00,2022-01-27 13:15:00,7h 25m,2 stops,No info,7662,5.0,1.0
2,Jet Airways,2019-09-06,Delhi,Cochin,DEL → LKO → BOM → COK,2022-01-27 09:25:00,2022-06-10 04:25:00,19h,2 stops,No info,13882,6.0,9.0
3,IndiGo,2019-12-05,Kolkata,Banglore,CCU → NAG → BLR,2022-01-27 18:05:00,2022-01-27 23:30:00,5h 25m,1 stop,No info,6218,5.0,12.0
4,IndiGo,2019-01-03,Banglore,New Delhi,BLR → NAG → DEL,2022-01-27 16:50:00,2022-01-27 21:35:00,4h 45m,1 stop,No info,13302,3.0,1.0
5,SpiceJet,2019-06-24,Kolkata,Banglore,CCU → BLR,2022-01-27 09:00:00,2022-01-27 11:25:00,2h 25m,non-stop,No info,3873,24.0,6.0
6,Jet Airways,2019-12-03,Banglore,New Delhi,BLR → BOM → DEL,2022-01-27 18:55:00,2022-03-13 10:25:00,15h 30m,1 stop,In-flight meal not included,11087,3.0,12.0
7,Jet Airways,2019-01-03,Banglore,New Delhi,BLR → BOM → DEL,2022-01-27 08:00:00,2022-03-02 05:05:00,21h 5m,1 stop,No info,22270,3.0,1.0
8,Jet Airways,2019-12-03,Banglore,New Delhi,BLR → BOM → DEL,2022-01-27 08:55:00,2022-03-13 10:25:00,25h 30m,1 stop,In-flight meal not included,11087,3.0,12.0
9,Multiple carriers,2019-05-27,Delhi,Cochin,DEL → BOM → COK,2022-01-27 11:25:00,2022-01-27 19:15:00,7h 50m,1 stop,No info,8625,27.0,5.0


In [28]:
# Drop the Date of Journey Column
train_data.drop("Date_of_Journey",axis=1,inplace=True)
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month
0,IndiGo,Banglore,New Delhi,BLR → DEL,2022-01-27 22:20:00,2022-03-22 01:10:00,2h 50m,non-stop,No info,3897,24.0,3.0
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2022-01-27 05:50:00,2022-01-27 13:15:00,7h 25m,2 stops,No info,7662,5.0,1.0
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2022-01-27 09:25:00,2022-06-10 04:25:00,19h,2 stops,No info,13882,6.0,9.0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,2022-01-27 18:05:00,2022-01-27 23:30:00,5h 25m,1 stop,No info,6218,5.0,12.0
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,2022-01-27 16:50:00,2022-01-27 21:35:00,4h 45m,1 stop,No info,13302,3.0,1.0


In [30]:
# Drop the Additional_Info Column as it's not adding any value to the data
train_data.drop("Additional_Info",axis=1,inplace=True)