## **Importing Data and Libraries**

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
import datetime

In [3]:
tf.__version__

'2.3.0'

In [4]:
train = pd.read_excel('/content/drive/My Drive/flight fare prediction_/Data_Train.xlsx')

In [5]:
train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## **Basic EDA and Data Preprocessing on Train Data**

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [7]:
train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [8]:
train.dropna(inplace=True)

In [9]:
train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [10]:
Airline = pd.get_dummies(train['Airline'], drop_first=True)
Source = pd.get_dummies(train['Source'], drop_first=True)
Destination = pd.get_dummies(train['Destination'], drop_first=True)

Grabbing Date and Month of Journey

In [11]:
train['Date_of_Journey'] = pd.to_datetime(train['Date_of_Journey'],format= "%d/%m/%Y")

In [12]:

train['Day_of_journey'] = train['Date_of_Journey'].dt.day_name()
train['Month_of_journey'] = train['Date_of_Journey'].dt.month_name()

train.drop(['Date_of_Journey'], axis = 1 , inplace = True)

In [13]:
train.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day_of_journey,Month_of_journey
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,Sunday,March
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,Wednesday,May
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,Sunday,June
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,Sunday,May
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,Friday,March


In [14]:

train.drop(['Additional_Info', 'Route'], axis=1, inplace=True)

In [15]:
train.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price,Day_of_journey,Month_of_journey
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897,Sunday,March
1,Air India,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662,Wednesday,May
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882,Sunday,June
3,IndiGo,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218,Sunday,May
4,IndiGo,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302,Friday,March


In [16]:

journey_day = pd.get_dummies(train['Day_of_journey'], drop_first = True)
journey_month = pd.get_dummies(train['Month_of_journey'], drop_first = True)







Grabbing Arrival and Departure Hours and Minutes

In [17]:
train['Departure_hour'] = pd.to_datetime(train['Dep_Time']).dt.hour
train['Departure_min'] = pd.to_datetime(train['Dep_Time']).dt.minute

train['Arrival_hour'] = pd.to_datetime(train['Arrival_Time']).dt.hour
train['Arrival_min'] = pd.to_datetime(train['Arrival_Time']).dt.minute

train.drop(['Dep_Time', 'Arrival_Time'], axis=1, inplace=True)

Changing Total Stops to Numbers as they are Ordinal Categorical Data

In [18]:
train['Total_Stops'].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [19]:
train.replace({'non-stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4}, inplace=True)
train.drop(['Airline','Source','Destination'], axis=1, inplace=True)

In [20]:
int(train['Duration'][2].split()[0].split('h')[0])

19

In [21]:
duration = list(train['Duration'])
for i in range(len(duration)):
  if len(duration[i].split()) != 2:
    if'h' in duration[i]:
      duration[i]=duration[i].strip() + ' 0m'
    else:
      duration[i]='0h ' + duration[i]

In [22]:
int(duration[0].split()[1].split('m')[0])

50

In [23]:
duration_hour=[]
duration_min=[]

for i in range(len(duration)):
  duration_hour.append(int(duration[i].split()[0].split('h')[0]))
  duration_min.append(int(duration[i].split()[1].split('m')[0]))

In [24]:
duration_hour=np.array(duration_hour)
duration_min=np.array(duration_min)

duration_hour=pd.DataFrame(duration_hour)
duration_min=pd.DataFrame(duration_min)

duration_hour.rename(columns={0:'duration_hour'}, inplace=True)
duration_min.rename(columns={0:'duration_min'}, inplace=True)

In [25]:
train_final = pd.concat([train, Airline, Source, Destination, duration_hour, duration_min, journey_day, journey_month], axis=1)
train_final.drop('Duration', axis=1, inplace=True)
train_final.head()

Unnamed: 0,Total_Stops,Price,Day_of_journey,Month_of_journey,Departure_hour,Departure_min,Arrival_hour,Arrival_min,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi,duration_hour,duration_min,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,June,March,May
0,0.0,3897.0,Sunday,March,22.0,20.0,1.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,7662.0,Wednesday,May,5.0,50.0,13.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,25.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2.0,13882.0,Sunday,June,9.0,25.0,4.0,25.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,6218.0,Sunday,May,18.0,5.0,23.0,30.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,25.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,13302.0,Friday,March,16.0,50.0,21.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
train_final.drop(['Day_of_journey', 'Month_of_journey', 'Trujet'], axis=1, inplace=True)

In [27]:
train_final

Unnamed: 0,Total_Stops,Price,Departure_hour,Departure_min,Arrival_hour,Arrival_min,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi,duration_hour,duration_min,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,June,March,May
0,0.0,3897.0,22.0,20.0,1.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,7662.0,5.0,50.0,13.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,25.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2.0,13882.0,9.0,25.0,4.0,25.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,6218.0,18.0,5.0,23.0,30.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,25.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,13302.0,16.0,50.0,21.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,0.0,4107.0,19.0,55.0,22.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,35.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10679,0.0,4145.0,20.0,45.0,23.0,20.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10680,0.0,7229.0,8.0,20.0,11.0,20.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10681,0.0,12648.0,11.0,30.0,14.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [28]:
train_final.isnull().sum()

Total_Stops                          1
Price                                1
Departure_hour                       1
Departure_min                        1
Arrival_hour                         1
Arrival_min                          1
Air India                            1
GoAir                                1
IndiGo                               1
Jet Airways                          1
Jet Airways Business                 1
Multiple carriers                    1
Multiple carriers Premium economy    1
SpiceJet                             1
Vistara                              1
Vistara Premium economy              1
Chennai                              1
Delhi                                1
Kolkata                              1
Mumbai                               1
Cochin                               1
Delhi                                1
Hyderabad                            1
Kolkata                              1
New Delhi                            1
duration_hour            

In [29]:
train_final.dropna(inplace=True)

In [30]:
train_final.isnull().sum()

Total_Stops                          0
Price                                0
Departure_hour                       0
Departure_min                        0
Arrival_hour                         0
Arrival_min                          0
Air India                            0
GoAir                                0
IndiGo                               0
Jet Airways                          0
Jet Airways Business                 0
Multiple carriers                    0
Multiple carriers Premium economy    0
SpiceJet                             0
Vistara                              0
Vistara Premium economy              0
Chennai                              0
Delhi                                0
Kolkata                              0
Mumbai                               0
Cochin                               0
Delhi                                0
Hyderabad                            0
Kolkata                              0
New Delhi                            0
duration_hour            

In [31]:
train_final.isna().sum()

Total_Stops                          0
Price                                0
Departure_hour                       0
Departure_min                        0
Arrival_hour                         0
Arrival_min                          0
Air India                            0
GoAir                                0
IndiGo                               0
Jet Airways                          0
Jet Airways Business                 0
Multiple carriers                    0
Multiple carriers Premium economy    0
SpiceJet                             0
Vistara                              0
Vistara Premium economy              0
Chennai                              0
Delhi                                0
Kolkata                              0
Mumbai                               0
Cochin                               0
Delhi                                0
Hyderabad                            0
Kolkata                              0
New Delhi                            0
duration_hour            

In [32]:
train_final.to_csv('/content/drive/My Drive/flight fare prediction_/Train_final.csv')

In [32]:
x_train = train_final.drop('Price', axis = 1)

In [33]:
x_train

Unnamed: 0,Total_Stops,Departure_hour,Departure_min,Arrival_hour,Arrival_min,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi,duration_hour,duration_min,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,June,March,May
0,0.0,22.0,20.0,1.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,5.0,50.0,13.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,25.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,2.0,9.0,25.0,4.0,25.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,18.0,5.0,23.0,30.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,25.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,16.0,50.0,21.0,35.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10677,0.0,5.0,55.0,8.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,30.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
10678,0.0,19.0,55.0,22.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,35.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10679,0.0,20.0,45.0,23.0,20.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10680,0.0,8.0,20.0,11.0,20.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,40.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
y_train = train_final['Price']

In [35]:
y_train

0         3897.0
1         7662.0
2        13882.0
3         6218.0
4        13302.0
          ...   
10677     3257.0
10678     4107.0
10679     4145.0
10680     7229.0
10681    12648.0
Name: Price, Length: 10681, dtype: float64

In [36]:
y_train.shape

(10681,)

In [38]:
y_train = y_train.to_numpy()

In [39]:
y_train = pd.DataFrame(y_train, columns=['Price'])

In [40]:
y_train

Unnamed: 0,Price
0,3897.0
1,7662.0
2,13882.0
3,6218.0
4,13302.0
...,...
10676,3257.0
10677,4107.0
10678,4145.0
10679,7229.0


In [41]:
x_train.to_csv('/content/drive/My Drive/flight fare prediction_/x_train.csv')

In [42]:
y_train.to_csv('/content/drive/My Drive/flight fare prediction_/y_train.csv')