In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [97]:
train_data = pd.read_excel('Data_Train.xlsx')

In [98]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [99]:
#missing values
train_data.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [100]:
# rows & cols
train_data.shape

(10683, 11)

In [101]:
# remove missing values
train_data.dropna(inplace=True)
train_data.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [102]:
# data type
train_data.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [103]:
# Dep_Time, Arrival_Time & Date_of_Journey need to convert into datetime format
def change_into_datetime(col):
    train_data[col]=pd.to_datetime(train_data[col])

In [104]:
train_data.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

In [105]:
for i in['Date_of_Journey', 'Dep_Time', 'Arrival_Time']:
    change_into_datetime(i)

In [106]:
# data type
train_data.dtypes

Airline                    object
Date_of_Journey    datetime64[ns]
Source                     object
Destination                object
Route                      object
Dep_Time           datetime64[ns]
Arrival_Time       datetime64[ns]
Duration                   object
Total_Stops                object
Additional_Info            object
Price                       int64
dtype: object

In [107]:
## creating new col for day & month to understand by ml model
train_data['journey_day'] = train_data['Date_of_Journey'].dt.day
train_data['journey_mobth'] = train_data['Dep_Time'].dt.month

In [108]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_mobth
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2021-03-10 22:20:00,2021-03-22 01:10:00,2h 50m,non-stop,No info,3897,24,3
1,Air India,2019-01-05,Kolkata,Banglore,CCU → IXR → BBI → BLR,2021-03-10 05:50:00,2021-03-10 13:15:00,7h 25m,2 stops,No info,7662,5,3
2,Jet Airways,2019-09-06,Delhi,Cochin,DEL → LKO → BOM → COK,2021-03-10 09:25:00,2021-06-10 04:25:00,19h,2 stops,No info,13882,6,3
3,IndiGo,2019-12-05,Kolkata,Banglore,CCU → NAG → BLR,2021-03-10 18:05:00,2021-03-10 23:30:00,5h 25m,1 stop,No info,6218,5,3
4,IndiGo,2019-01-03,Banglore,New Delhi,BLR → NAG → DEL,2021-03-10 16:50:00,2021-03-10 21:35:00,4h 45m,1 stop,No info,13302,3,3


In [109]:
# removing Date_of_Journey 
train_data.drop('Date_of_Journey', axis=1, inplace=True)

In [110]:
# Extrack Hour
def extrack_hour(df, col):
    df[col+'_hour'] = df[col].dt.hour
# Extrack Minute
def extrack_min(df,col):
    df[col+'_minute']= df[col].dt.minute
#Drop/remove Column
def drop_column(df, col):
    df.drop(col, axis=1, inplace=True)
    

In [111]:
extrack_hour(train_data, 'Dep_Time')
extrack_min(train_data, 'Dep_Time')
drop_column(train_data, 'Dep_Time')

In [112]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_mobth,Dep_Time_hour,Dep_Time_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2021-03-22 01:10:00,2h 50m,non-stop,No info,3897,24,3,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2021-03-10 13:15:00,7h 25m,2 stops,No info,7662,5,3,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2021-06-10 04:25:00,19h,2 stops,No info,13882,6,3,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,2021-03-10 23:30:00,5h 25m,1 stop,No info,6218,5,3,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,2021-03-10 21:35:00,4h 45m,1 stop,No info,13302,3,3,16,50


In [113]:
extrack_hour(train_data, 'Arrival_Time')
extrack_min(train_data, 'Arrival_Time')
drop_column(train_data, 'Arrival_Time')

In [114]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_mobth,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,5,3,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,6,3,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,5,3,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,3,3,16,50,21,35


In [115]:
# logic
'3h 7m'.split(' ')

['3h', '7m']

In [116]:
duration = list(train_data['Duration'])
for i in range(len(duration)):
    if len(duration[i].split(' '))==2:
        pass
    else:
        if 'h' in duration[i]:
            duration[i] = duration[i] + ' 0m'
        else:
            duration[i] = '0h' + duration[i]


    

In [117]:
train_data['Duration'] = duration

In [118]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_mobth,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,5,3,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h 0m,2 stops,No info,13882,6,3,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,5,3,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,3,3,16,50,21,35


In [119]:
## Extrack Hour
def hour(x):
    return x.split(' ')[0][0:-1]

## Extrack Minute
def minute(x):
    return x.split(' ')[1][0:-1]

In [120]:
train_data['Duration_hours'] = train_data['Duration'].apply(hour)
train_data['Duration_mins'] = train_data['Duration'].apply(minute)

In [124]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_mobth,Dep_Time_hour,Dep_Time_minute,Arrival_Time_hour,Arrival_Time_minute,Duration_hours,Duration_mins
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,5,3,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h 0m,2 stops,No info,13882,6,3,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,5,3,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,3,3,16,50,21,35,4,45


In [125]:
# Remove Duration
drop_column(train_data, 'Duration')

In [126]:
train_data.dtypes

Airline                object
Source                 object
Destination            object
Route                  object
Total_Stops            object
Additional_Info        object
Price                   int64
journey_day             int64
journey_mobth           int64
Dep_Time_hour           int64
Dep_Time_minute         int64
Arrival_Time_hour       int64
Arrival_Time_minute     int64
Duration_hours         object
Duration_mins          object
dtype: object

In [127]:
train_data['Duration_hours'] = train_data['Duration_hours'].astype(int)
train_data['Duration_mins'] = train_data['Duration_mins'].astype(int)

In [128]:
# Converted data type to num(int)
train_data.dtypes

Airline                object
Source                 object
Destination            object
Route                  object
Total_Stops            object
Additional_Info        object
Price                   int64
journey_day             int64
journey_mobth           int64
Dep_Time_hour           int64
Dep_Time_minute         int64
Arrival_Time_hour       int64
Arrival_Time_minute     int64
Duration_hours          int64
Duration_mins           int64
dtype: object

In [133]:
#categorical data
cat_col = [col for col in train_data.columns if train_data[col].dtype=='O']
cat_col 

['Airline', 'Source', 'Destination', 'Route', 'Total_Stops', 'Additional_Info']

In [None]:
cat_col = [col for col in train_data.columns if train_data[col].dtype=='O']
cat_col 