In [15]:
#A critical part of the successful Machine Learning project is coming up
#with a good set of features to train on. This process is called feature engineering,
#and it involves three steps: feature transformation (transforming the original features), 
#feature selection (selecting the most useful features to train on), and feature extraction
#(combining existing features to produce more useful ones). In this notebook we will explore
#different tools in Feature Engineering.

In [16]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
#import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [18]:
data = pd.read_excel("Data/airlines_data.xlsx")
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [20]:
data.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [21]:
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [22]:
#we have found some null points, we need to either remove them from our dataset or fill 
#them with something else

#we will use fillna() and method='ffill', which fills the last observed non-null 
#value forward until another non-null value is encountered.

In [23]:
data = data.fillna(method = 'ffill')

In [24]:
#Feature Transformation

#Feature Transformation means transforming our features to the functions of the original 
#features. For example, feature encoding, scaling, and discretization (the process
#of transforming continuous variables into discrete form, by creating bins or intervals)
#are the most common forms of data transformation.

In [25]:
#Dealing with Categorical Variables

In [26]:
#Categorical variables represent qualitative data with no apparent inherent
#mathematical meaning. Therefore, for any machine learning analysis, all the
#categorical data must be transformed into the numerical data types.

In [27]:
#First, we'll start with 'Airlines' column, as it contains categorical values. 
#We will use unique() method to obtain all the categories in this column.

In [28]:
data['Airline'].unique().tolist()

['IndiGo',
 'Air India',
 'Jet Airways',
 'SpiceJet',
 'Multiple carriers',
 'GoAir',
 'Vistara',
 'Air Asia',
 'Vistara Premium economy',
 'Jet Airways Business',
 'Multiple carriers Premium economy',
 'Trujet']

In [29]:
#From the above list, we notice that some of the airline names are being repeated.
#For example, 'Jet Airways' and 'Jet Airways Business'. This means that some of the airlines are subdivided into separate parts. We will combine these 'two-parts' airlines to make our categorical features more consistent with the rest of the variables.

#Here, we will use the numpy where() function to locate and combine the two categories.

In [30]:
data['Airline'] = np.where(data['Airline']=='Vistara Premium economy', 'Vistara', data['Airline'])
data['Airline'] = np.where(data['Airline']=='Jet Airways Business', 'Jet Airways', data['Airline'])

In [31]:
#One Hot Encoding

#Now, to be recognized by a machine learning algorithms, our categorical variables
#should be converted into numerical ones. One way to do this is through one hot encoding. 

In [32]:
#We will use, get_dummies() method to do this transformation. In the next cell, we will 
#transform 'Airline', 'Source', and 'Destination' into their respective numeric variables. 
#We will put all the transformed data into a 'data1' data frame.

In [33]:
data1 = pd.get_dummies(data = data, columns = ['Airline', 'Source', 'Destination'])
data1.head()

Unnamed: 0,Date_of_Journey,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Airline_Air Asia,Airline_Air India,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,24/03/2019,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,False,False,...,False,False,False,False,False,False,False,False,False,True
1,1/05/2019,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,False,True,...,False,False,True,False,True,False,False,False,False,False
2,9/06/2019,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,False,False,...,False,True,False,False,False,True,False,False,False,False
3,12/05/2019,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,False,False,...,False,False,True,False,True,False,False,False,False,False
4,01/03/2019,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,False,False,...,False,False,False,False,False,False,False,False,False,True


In [34]:
#Below, we will compare our original data frame with the transformed one.
data.shape


(10683, 11)

In [35]:
data1.shape

(10683, 29)

In [36]:
#Label Encoding
#Since 'Total_Stops' is originally a categorical data type, 
#we also need to convert it into numerical one. For this, we can perform
#a label encoding, where values are manually assigned to the corresponding keys,
#like "0" to a "non-stop", using the replace() function.


In [37]:
data1.replace({
    "non-stop": 0, "1 stop": 1, "2 stops":2, "3 stops": 3, "4 stops": 4}, inplace =True)
data1.head()

Unnamed: 0,Date_of_Journey,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Airline_Air Asia,Airline_Air India,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Banglore,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,24/03/2019,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,False,False,...,False,False,False,False,False,False,False,False,False,True
1,1/05/2019,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,False,True,...,False,False,True,False,True,False,False,False,False,False
2,9/06/2019,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,13882,False,False,...,False,True,False,False,False,True,False,False,False,False
3,12/05/2019,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,False,False,...,False,False,True,False,True,False,False,False,False,False
4,01/03/2019,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,False,False,...,False,False,False,False,False,False,False,False,False,True


In [38]:
#Feature Selection
#Here, we will select only those attributes which best explain the relationship of the 
#independent variables with respect to the target variable, 'price'. There are many 
#methods for feature selection, building the heatmap and calculating the correlation 
#coefficients scores are the most commonly used ones.

#First, we will select only the relevant and newly transformed variables 
#(and exclude variables such as 'Route', 'Additional_Info', and all the original 
#categorical variables), and place them into a 'new_data' data frame.

In [39]:
#will print all of our data1 columns.
data1.columns

Index(['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration',
       'Total_Stops', 'Additional_Info', 'Price', 'Airline_Air Asia',
       'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Source_Banglore',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Banglore', 'Destination_Cochin', 'Destination_Delhi',
       'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi'],
      dtype='object')

In [40]:
#will construct a heatmap(), using the seaborn library with a newly formed data frame, 'new_data'.
new_data = data1.loc[:,['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration',
       'Total_Stops', 'Additional_Info', 'Price', 'Airline_Air Asia',
       'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Source_Banglore',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Banglore', 'Destination_Cochin', 'Destination_Delhi',
       'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi']]

In [41]:
plt.figure(figsize=(18,18))
sns.heatmap(new_data.corr(), annot=True, cmap='RdYlGn')
plt.show()

NameError: name 'sns' is not defined

<Figure size 1800x1800 with 0 Axes>

In [42]:
#From the heatmap above, extreme green means highly positively correlated features 
#(relationship between two variables in which both variables move in the same direction),
#extreme red means negatively correlated features (relationship between two variables in 
#which an increase in one variable is associated with a decrease in the other).

In [43]:
#Now, we can use the `corr()` function to calculate and list the correlation between 
#all independent variables and the 'price'.

In [44]:
features = new_data.corr()['Price'].sort_values()
features

ValueError: could not convert string to float: '24/03/2019'

In [None]:
#We can also plot these correlation coefficients for easier visualization.


In [None]:
features.plot(kind='bar',figsize=(10,8))

In [45]:
#From the graph above, we can deduct some of the highly correlated features and select
#only those ones for any future analysis.


In [46]:
## **Feature Extraction using Principal Component Analysis (Optional)**


In [47]:
### **PCA with Scikit-Learn**


In [48]:
#Dimentionality reduction is part of the feature extraction process that combines the 
#existing features to produce more useful ones. The goal of dimensionality reduction is 
#to simplify the data without loosing too much information. Principal Component Analysis
#(PCA) is one of the most popular dimensionality reduction algorithms. First, it identifies
#the hyperplane that lies closest to the data, and then it projects the data onto it. In
#this way, a few multidimensional features are merged into one.

#In the following portion of the lab, we will use `scikit-learn` library to perform some PCA 
#on our data.

In [49]:
#First, we must scale our data using the `StandardScaler()` function.
#We will assign all the independent variables to x, and the dependent variable, 
#'price', to y.


In [51]:
x = data1.loc[:,['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration',
       'Total_Stops', 'Additional_Info', 'Price', 'Airline_Air Asia',
       'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Source_Banglore',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Banglore', 'Destination_Cochin', 'Destination_Delhi',
       'Destination_Hyderabad', 'Destination_Kolkata',
       'Destination_New Delhi']]

In [52]:
y= data1.Price

In [53]:
scaler = StandardScaler()
x=scaler.fit_transform(x.astype(np.float64))
x

ValueError: could not convert string to float: '24/03/2019'

In [54]:
#Once the data is scaled, we can apply the `fit_transform()` function
#to reduce the dimensionality of the dataset down to two dimensions.


In [55]:
pca = PCA(n_components = 2)
pca.fit_transform(x)

ValueError: could not convert string to float: '24/03/2019'