In [None]:
%pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib
# To show diagrams inline.
%matplotlib inline

In [None]:
# Read the train and test workbooks (paths are relative to the working directory).
df_train, df_test = (
    pd.read_excel(workbook) for workbook in ('train_data.xlsx', 'test_data.xlsx')
)

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Stack the train rows on top of the test rows and renumber the index from 0.
df = pd.concat((df_train, df_test), ignore_index=True)
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Shapes of the train, test and combined frames, in that order.
print(*(frame.shape for frame in (df_train, df_test, df)))

In [None]:
# Create derived features for Date_Of_Journey
# The text is split on '/' into three parts, stored as integer Day / Month / Year columns.
date = df['Date_of_Journey'].str.split('/', expand=True)
date.columns = ['Day', 'Month', 'Year']
for part in date.columns:
    df[part] = date[part].astype(int)
# The raw text column is no longer needed once the parts are extracted.
df.drop(columns='Date_of_Journey', inplace=True)


In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Observation
1. Day, Month and Year are now integers.
2. Route and Total_Stops each have one null value.
3. The null values in Price come from the test data.

In [None]:
# Focus on Arrival_Time and Departure_Time
# Keep only the part of Arrival_Time that precedes the first space.
arrival_tokens = df['Arrival_Time'].str.split(' ')
df['Arrival_Time'] = arrival_tokens.str.get(0)
# The same result can be obtained with a lambda function:
# df['Arrival_Time'] = df['Arrival_Time'].apply(lambda x: x.split(' ')[0])


In [None]:
# Convert Arrival_Time to hours and minutes
# Split on ':' once and reuse the tokens for both derived columns.
arrival_parts = df['Arrival_Time'].str.split(':')
df['Arrival_Hour'] = arrival_parts.str[0].astype(int)
df['Arrival_Minute'] = arrival_parts.str[1].astype(int)
# Drop Arrival_Time column
df.drop(columns=['Arrival_Time'], inplace=True)
df.head()

In [None]:
# Convert Dep_Time to hours and minutes
# Split on ':' once and reuse the tokens for both derived columns.
departure_parts = df['Dep_Time'].str.split(':')
df['Departure_Hour'] = departure_parts.str[0].astype(int)
df['Departure_Minute'] = departure_parts.str[1].astype(int)
# Drop Dep_Time column
df.drop(columns=['Dep_Time'], inplace=True)
df.head()

### Observation
1. We can drop the Route column: we already have Source, Destination and Total_Stops, and these are the features required to predict the price.
2. We need to convert Total_Stops into integers.

In [None]:
# Drop Route column
df.drop(columns=['Route'], inplace=True)

In [None]:
# Working on Total_Stops
# Count the missing values and show the affected rows. A notebook cell only renders
# its last expression, so the count is printed explicitly instead of being discarded.
missing_total_stops = df['Total_Stops'].isnull()
print('Missing Total_Stops values:', missing_total_stops.sum())
# Finding out the rows with null values in Total_Stops column
df[missing_total_stops]

In [None]:
# Filling null values with 'non-stop'. Since it is just one row, so this won't make much difference.
# The filled Series is assigned back to the column: calling fillna(..., inplace=True) on
# df['Total_Stops'] is chained assignment, which warns on pandas 2.x and leaves df
# unchanged under Copy-on-Write.
TOTAL_STOPS_TO_COUNT = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
# Labels that are missing from the mapping become NaN.
df['Total_Stops'] = df['Total_Stops'].fillna('non-stop').map(TOTAL_STOPS_TO_COUNT)
df.head()

In [None]:
# List the distinct raw Duration values to see which formats occur.
df['Duration'].unique()

### Observation
1. There are three types of values for Duration:
  - `**h **m` (hours and minutes)
  - `**h` (hours only)
  - `**m` (minutes only)
2. We need to handle this carefully.

In [None]:
# Tokenise Duration on spaces and separate two-token values from single-token values.
duration = df['Duration'].str.split(' ')
two_token_rows, one_token_rows = [], []
for tokens in duration:
    if len(tokens) == 2:
        two_token_rows.append(tokens)
    elif len(tokens) == 1:
        one_token_rows.append(tokens)

# Both frames are built from plain lists, so each gets a fresh 0..k-1 index:
# their row labels are positions inside the helper frame, not row labels of df.
duration_hours_and_minutes = pd.DataFrame(two_token_rows)
duration_hours_or_minutes = pd.DataFrame(one_token_rows)

# Verify the filtered data
duration_hours_and_minutes.head(), duration_hours_or_minutes.head()

In [None]:
# Distinct values among the single-token durations.
duration_hours_or_minutes[0].unique()

### Observation
1. There are instances where the flight duration is only 5 minutes.
2. A 5-minute flight is not realistic, so these rows should be dropped.

In [None]:
# Show the single-token durations that are exactly '5m'.
is_five_minutes = duration_hours_or_minutes[0] == '5m'
duration_hours_or_minutes[is_five_minutes]

In [None]:
# Drop the rows with only 5 minutes of duration
# duration_hours_or_minutes was built from a plain list, so its labels are positions
# inside that helper frame and do not identify rows of df. Reusing them on df would
# delete unrelated flights and keep the 5-minute ones, so each frame is filtered by
# value instead of by hard-coded label.
IMPLAUSIBLE_DURATION = '5m'
helper_rows_to_drop = duration_hours_or_minutes.index[duration_hours_or_minutes[0] == IMPLAUSIBLE_DURATION]
df_rows_to_drop = df.index[df['Duration'] == IMPLAUSIBLE_DURATION]
duration_hours_or_minutes.drop(index=helper_rows_to_drop, inplace=True)
df.drop(index=df_rows_to_drop, inplace=True)
duration_hours_or_minutes[0].unique()

In [None]:
# Appending 0m to rows with only hours in duration_hours_or_minutes
# Adds a column labelled 1 holding the constant ' 0m' (note the leading space) for every row.
duration_hours_or_minutes[1] = ' 0m'
duration_hours_or_minutes.head()

In [None]:
# Appending duration_hours_or_minutes to duration_hours_and_minutes
# The rows of duration_hours_or_minutes go underneath the existing rows and the
# result is renumbered from 0.
duration_hours_and_minutes = pd.concat(
    (duration_hours_and_minutes, duration_hours_or_minutes),
    ignore_index=True,
)
# Compare the row counts of df and the rebuilt duration frame.
print(df['Duration'].shape, duration_hours_and_minutes.shape)

In [None]:
# Convert duration_hours_and_minutes to hours and minutes
# Column 0 and column 1 are pulled out as separate Series.
duration_hour, duration_minute = duration_hours_and_minutes[0], duration_hours_and_minutes[1]
duration_hour.head(), duration_minute.head()

In [None]:
# Merging duration hours and minutes into df
# Both components are parsed straight from df['Duration'] so every value stays on its
# own row. The helper frame duration_hours_and_minutes was concatenated with
# ignore_index=True, which reorders and renumbers its rows; assigning its columns to
# df aligns on index labels and would attach durations to the wrong flights.
duration_text = df['Duration']
# A missing component (no '<n>h' or no '<n>m' in the text) counts as 0.
df['Duration_Hour'] = duration_text.str.extract(r'(\d+)h', expand=False).fillna('0').astype(int)
df['Duration_Minute'] = duration_text.str.extract(r'(\d+)m', expand=False).fillna('0').astype(int)
# Drop Duration column
df.drop(['Duration'], axis=1, inplace=True)
df.head()

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Convert categorical variables to numerical variables using label encoding
# First inspect the distinct values of each categorical column.
print(*(df[column].unique() for column in ('Airline', 'Source', 'Destination', 'Additional_Info')))

In [None]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct label of a column to an integer code.
le = LabelEncoder()

In [None]:
# Label-encode every categorical column with its own encoder. Refitting a single
# encoder overwrites its classes_ on each fit, so only the last column could be
# decoded afterwards; the fitted encoders are kept in label_encoders instead, e.g.
# label_encoders['Airline'].inverse_transform(codes).
CATEGORICAL_COLUMNS = ['Airline', 'Source', 'Destination', 'Additional_Info']
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le
# As before, `le` ends up fitted on the last column (Additional_Info).
df.head()

In [None]:
# We can also use OneHotEncoder for categorical variables
# Note: `sparse=False` was renamed to `sparse_output=False` in scikit-learn 1.2
# (keep `sparse=False` on older versions).
# from sklearn.preprocessing import OneHotEncoder

# ohe = OneHotEncoder(sparse_output=False, drop='first')
# `ohe` is refit for every column, so each frame must be built right after its own
# fit: get_feature_names_out() only describes the most recent fit.
# df_airline = ohe.fit_transform(df[['Airline']])
# df_airline = pd.DataFrame(df_airline, columns=ohe.get_feature_names_out(['Airline']))
# df_source = ohe.fit_transform(df[['Source']])
# df_source = pd.DataFrame(df_source, columns=ohe.get_feature_names_out(['Source']))
# df_destination = ohe.fit_transform(df[['Destination']])
# df_destination = pd.DataFrame(df_destination, columns=ohe.get_feature_names_out(['Destination']))
# df_additional_info = ohe.fit_transform(df[['Additional_Info']])
# df_additional_info = pd.DataFrame(df_additional_info, columns=ohe.get_feature_names_out(['Additional_Info']))