In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
# Plotting configuration for the notebook.
# Request the 'retina' figure format from the inline backend and render
# matplotlib figures inline, below the cell that produced them.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# Use the 'ggplot' style sheet and an 8x5 default figure size for all plots.
plt.style.use('ggplot')
rcParams['figure.figsize'] = 8, 5

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Location of the Excel workbook, relative to the notebook's directory.
DATA_TRAIN_PATH = '../data/Data_Train.xlsx'

# Read the workbook into the DataFrame that the cells below work on.
df = pd.read_excel(DATA_TRAIN_PATH)

# Flights Price Machine Learning
## Description
Flight ticket price prediction using several machine learning models, such as k-nearest neighbors, decision trees, and linear models.
## Data
The dataset contains flight ticket prices from different airline companies, along with other features such as the flight destination and duration.
| Feature        |  Description                                       |
|   :---:        |     :---:                                          |
| Price(target)  | The price of the ticket.                           |
| Airline        | The airline name.                                  |
| Source         | The source from which the service begins.          |
| Destination    | The destination where the service ends.            |
| Dep_Time       | The time when the journey starts from the source.  |
| Arrival_Time   | The arrival time at the destination.               |
| Duration       | The total flight duration.                         |
| Total_Stops    | The total stops between the source and destination.|
| Additional_Info| Any additional information about the flight.       |
|Date_of_Journey | The date of the journey.                           |
|Route           | The route taken by the flight to reach the destination.|

<br>

| Number of Entries |  Number of Features  |
|   :---:           |     :---:            |
|   10683           |           11         |


### Data Cleaning

- Extracted Year, Month, Day from "Date_of_Journey"
- Dropped "year" since its value is the same for all entries
- Dropped "date_of_journey"
- Converted "Duration" to seconds
- Modified "additional_info" value "No Info" to "No info"
- Dropped missing values
- Dropped duplicated entries (kept one)
- Converted "total_stops" from string to numeric
- Categorized "dep_time" and "arrival_time" (i.e. "mid_night", "morning", "afternoon", "evening")
- Dropped "route" since we will be using "total_stops"

In [9]:
# Normalise the feature names to lower case.
df.columns = df.columns.str.lower()

# Drop rows with missing values *before* any string processing: the parsers
# below call str methods on each value and would raise on NaN.
df = df.dropna().reset_index(drop=True)

# Parse "date_of_journey" as a datetime.
# `infer_datetime_format` is deprecated since pandas 2.0, so the day/month
# order is stated explicitly instead of being left to inference.
# NOTE(review): assumes day-first dates (e.g. "24/03/2019") - confirm against
# the raw workbook.
df['date_of_journey'] = pd.to_datetime(df['date_of_journey'], dayfirst=True)

# Create the "month" and "day" features.
# "year" is deliberately not materialised: per the cleaning notes its value is
# the same for all entries, so it would only be dropped again.
df['month'] = df['date_of_journey'].dt.month
df['day'] = df['date_of_journey'].dt.day


def _duration_to_seconds(text):
    """Convert a duration such as "2h 50m", "19h" or "5m" to whole seconds.

    Each whitespace-separated component must be an integer followed by the
    unit letter 'h' (hours) or 'm' (minutes); a missing component counts as 0.

    Raises
    ------
    ValueError
        If a component has an unknown unit or a non-integer amount.
    """
    hours = minutes = 0
    for component in text.split():
        amount, unit = int(component[:-1]), component[-1]
        if unit == 'h':
            hours = amount
        elif unit == 'm':
            minutes = amount
        else:
            raise ValueError(f'Unrecognised duration component: {component!r}')
    return hours * 3600 + minutes * 60


# Convert "duration" to seconds.
df['duration'] = df['duration'].apply(_duration_to_seconds)

# Modify the "additional_info" value "No Info" to "No info" so both spellings
# collapse into one category (plain substring replacement, no regex).
df['additional_info'] = df['additional_info'].str.replace('No Info', 'No info', regex=False)

# Drop duplicated entries (the first occurrence is kept) and renumber the rows.
df = df.drop_duplicates().reset_index(drop=True)

# Modify "total_stops" values to contain only the number; "non-stop" becomes 0.
df['total_stops'] = df['total_stops'].apply(
    lambda stops: 0 if 'non-stop' in stops.split()[0] else int(stops.split()[0])
)
# Categorize "dep_time" and "arrival_time".
def categorizee_time(input):
    """Bucket a clock time into a coarse part-of-day label.

    Parameters
    ----------
    input : str
        A time string whose first whitespace-separated token begins with the
        hour, e.g. "22:20" or "01:10 22 Mar". Single-digit hours such as
        "5:30" are accepted as well.

    Returns
    -------
    str or None
        'mid_night' for hours 0-5, 'morning' for 6-11, 'afternoon' for 12-17,
        'evening' for 18-23, and None when the hour is outside 0-23.

    Raises
    ------
    ValueError
        If the hour part is not an integer.
    """
    clock = input.split()[0]
    # Parse the hour once. Taking the text before the first ':' handles
    # single-digit hours ("5:30"); the [:2] cap keeps colon-less inputs such
    # as "0530" working exactly as before.
    hour = int(clock.split(':', 1)[0][:2])

    if 0 <= hour < 6:
        return 'mid_night'
    if 6 <= hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'afternoon'
    if 18 <= hour < 24:
        return 'evening'
    # Out-of-range hours fall through, as in the original implementation.
    return None

# Replace the raw clock strings in both time columns with their category.
for time_col in ('dep_time', 'arrival_time'):
    df[time_col] = df[time_col].apply(categorizee_time)

# Remove "date_of_journey" and "route", then renumber the rows.
df = df.drop(columns=['date_of_journey', 'route']).reset_index(drop=True)

In [10]:
# Show a sample of the cleaned dataset.
# A fixed random_state keeps the displayed rows the same on every
# Restart & Run All, so the saved output stays reproducible.
df.sample(2, random_state=0)

Unnamed: 0,airline,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price,month,day
6961,Jet Airways,Delhi,Cochin,morning,evening,29100,2,No info,11360,3,21
6325,Jet Airways,Delhi,Cochin,evening,evening,77700,2,In-flight meal not included,10588,6,6
