# **ANALYSIS OF MACHINE LEARNING MODELS FOR AIRLINE PASSENGER SATISFACTION USING PYSPARK**

---
---




We wanted to understand what factors make airline passengers more or less satisfied with their flight experience. By studying this, airlines could improve services and prioritize what passengers care most about.

To do this, we trained four machine learning models (Logistic Regression, Support Vector Machine, Random Forest, and Gradient-Boosted Trees) to predict whether a passenger is satisfied, based on related information such as seat comfort, in-flight Wi-Fi service, and distance traveled. We also used the fitted models to determine which factors matter most for passenger satisfaction.



In [1]:
#Importing Data Exploratory libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the passenger-satisfaction survey from CSV into a pandas DataFrame
# and display the first rows as a sanity check.

data = pd.read_csv('airline_passenger_satisfaction.csv')
data.head()

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied


In [3]:
# Report the DataFrame dimensions as (rows, columns).

data.shape

(129880, 24)

In [4]:
# Print each column's name, non-null count and dtype to spot missing values
# and non-numeric columns.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   ID                                      129880 non-null  int64  
 1   Gender                                  129880 non-null  object 
 2   Age                                     129880 non-null  int64  
 3   Customer Type                           129880 non-null  object 
 4   Type of Travel                          129880 non-null  object 
 5   Class                                   129880 non-null  object 
 6   Flight Distance                         129880 non-null  int64  
 7   Departure Delay                         129880 non-null  int64  
 8   Arrival Delay                           129487 non-null  float64
 9   Departure and Arrival Time Convenience  129880 non-null  int64  
 10  Ease of Online Booking                  1298

In [5]:
# Count the missing values in each column.

data.isnull().sum()

ID                                          0
Gender                                      0
Age                                         0
Customer Type                               0
Type of Travel                              0
Class                                       0
Flight Distance                             0
Departure Delay                             0
Arrival Delay                             393
Departure and Arrival Time Convenience      0
Ease of Online Booking                      0
Check-in Service                            0
Online Boarding                             0
Gate Location                               0
On-board Service                            0
Seat Comfort                                0
Leg Room Service                            0
Cleanliness                                 0
Food and Drink                              0
In-flight Service                           0
In-flight Wifi Service                      0
In-flight Entertainment           

In [6]:
# Drop every row that contains at least one missing value (modifying `data`
# in place), then re-check the shape. The affected rows are far fewer than
# 10% of the data, so dropping them is preferred over imputing.

data.dropna(inplace=True)
data.shape

(129487, 24)

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns.

data.describe()

Unnamed: 0,ID,Age,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling
count,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0,129487.0
mean,64958.335169,39.428761,1190.210662,14.643385,15.091129,3.057349,2.756786,3.306239,3.25272,2.976909,3.383204,3.441589,3.351078,3.286222,3.204685,3.642373,2.728544,3.358067,3.631886
std,37489.781165,15.117597,997.560954,37.932867,38.46565,1.526787,1.401662,1.266146,1.350651,1.278506,1.287032,1.319168,1.316132,1.313624,1.329905,1.176614,1.329235,1.334149,1.180082
min,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,32494.5,27.0,414.0,0.0,0.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0
50%,64972.0,40.0,844.0,0.0,0.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0
75%,97415.5,51.0,1744.0,12.0,13.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,4.0,5.0
max,129880.0,85.0,4983.0,1592.0,1584.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [8]:
# Count the distinct values in each column; low counts indicate
# categorical or rating-style columns.

data.nunique()

ID                                        129487
Gender                                         2
Age                                           75
Customer Type                                  2
Type of Travel                                 2
Class                                          3
Flight Distance                             3821
Departure Delay                              464
Arrival Delay                                472
Departure and Arrival Time Convenience         6
Ease of Online Booking                         6
Check-in Service                               6
Online Boarding                                6
Gate Location                                  6
On-board Service                               6
Seat Comfort                                   6
Leg Room Service                               6
Cleanliness                                    6
Food and Drink                                 6
In-flight Service                              6
In-flight Wifi Servi

In [9]:
# Remove extreme values. Each mask below is True for the rows to KEEP
# (values under the cut-off), despite the "_outlier" suffix in its name.

arr_delay_outlier = data['Arrival Delay'].lt(650)
dep_delay_outlier = data['Departure Delay'].lt(650)
flight_dist_outlier = data['Flight Distance'].lt(4000)

# A row survives only if it passes all three filters.
Data = data.loc[arr_delay_outlier & dep_delay_outlier & flight_dist_outlier]
Data.shape

(129376, 24)

**PYSPARK ENVIRONMENT**

In [10]:
# PySpark environment setup: findspark is run before `import pyspark`.
# NOTE(review): presumably findspark.init() is what makes the local Spark
# installation importable here — confirm against the findspark docs.
import findspark
findspark.init()
findspark.find()  # return value is not assigned or used
import pyspark

In [11]:
#Importing the required spark libraries
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, LinearSVC, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [12]:
# Stop the currently active SparkContext (getOrCreate() starts one first if
# none exists). Presumably done so the session built next starts clean.
from pyspark import SparkContext 
SparkContext.getOrCreate().stop() 

In [13]:
# Build (or reuse) a local Spark session that runs on all available cores,
# with explicit memory, network-timeout and heartbeat settings.
# NOTE(review): spark.driver.memory may be ignored if the driver JVM is
# already running when this executes — verify the effective value.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("AirlineSatisfaction_RF")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "100s")
    .getOrCreate()
)

In [14]:

#spark = SparkSession.builder.appName('AirlineSatisfaction_RF').getOrCreate()

In [15]:
# Convert the cleaned pandas DataFrame (`Data`) into a Spark DataFrame and
# print its schema. This frame feeds every model in the notebook, not only
# the Random Forest.

New_Data = spark.createDataFrame(Data, verifySchema=True)
New_Data.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Customer Type: string (nullable = true)
 |-- Type of Travel: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Flight Distance: long (nullable = true)
 |-- Departure Delay: long (nullable = true)
 |-- Arrival Delay: double (nullable = true)
 |-- Departure and Arrival Time Convenience: long (nullable = true)
 |-- Ease of Online Booking: long (nullable = true)
 |-- Check-in Service: long (nullable = true)
 |-- Online Boarding: long (nullable = true)
 |-- Gate Location: long (nullable = true)
 |-- On-board Service: long (nullable = true)
 |-- Seat Comfort: long (nullable = true)
 |-- Leg Room Service: long (nullable = true)
 |-- Cleanliness: long (nullable = true)
 |-- Food and Drink: long (nullable = true)
 |-- In-flight Service: long (nullable = true)
 |-- In-flight Wifi Service: long (nullable = true)
 |-- In-flight Entertainment: long (nullable = true)
 |--

In [16]:
# Index the string-valued categorical columns. Each StringIndexer is fitted
# on New_Data and applied immediately, appending one numeric column per step.

# Satisfaction column -> 'label', the target used by every classifier.
# NOTE(review): which class becomes 1.0 depends on StringIndexer's ordering;
# inspect the fitted model's labels before interpreting predictions.
satisfaction_indexer = StringIndexer(inputCol='Satisfaction', outputCol='label')
New_Data = satisfaction_indexer.fit(New_Data).transform(New_Data)

# Gender column
gender_indexer = StringIndexer(inputCol='Gender', outputCol='Gender_Index')
New_Data = gender_indexer.fit(New_Data).transform(New_Data)

# Customer Type column
customer_type_indexer = StringIndexer(inputCol='Customer Type', outputCol='Customer_Type_Index')
New_Data = customer_type_indexer.fit(New_Data).transform(New_Data)

# Type of Travel column
type_of_travel_indexer = StringIndexer(inputCol='Type of Travel', outputCol='Type_of_Travel_Index')
New_Data = type_of_travel_indexer.fit(New_Data).transform(New_Data)
# Class column
class_indexer = StringIndexer(inputCol='Class', outputCol='Class_Index')
New_Data = class_indexer.fit(New_Data).transform(New_Data)

# Index the service-rating columns, one StringIndexer per column.
# NOTE(review): these columns are already integer ratings. StringIndexer
# assigns indices by label frequency by default, so the resulting *_Index
# values presumably do not preserve the rating order — confirm this is
# intended, especially for the linear models.

# Departure and Arrival Time Convenience column
Arrival_Depart_Conv_indexer = StringIndexer(inputCol='Departure and Arrival Time Convenience', outputCol='Arr_Dept_Time_Cov_Index')
New_Data = Arrival_Depart_Conv_indexer.fit(New_Data).transform(New_Data)

# Ease of Online Booking column
Ease_of_Online_Booking_indexer = StringIndexer(inputCol='Ease of Online Booking', outputCol='Ease_of_Online_Booking_Index')
New_Data = Ease_of_Online_Booking_indexer.fit(New_Data).transform(New_Data)

# Check-in Service column
Check_in_Service_indexer = StringIndexer(inputCol='Check-in Service', outputCol='Check_in_Service_Index')
New_Data = Check_in_Service_indexer.fit(New_Data).transform(New_Data)

# Online Boarding column
Online_Boarding_indexer = StringIndexer(inputCol='Online Boarding', outputCol='Online_Boarding_Index')
New_Data = Online_Boarding_indexer.fit(New_Data).transform(New_Data)

# Gate Location column
Gate_Location_indexer = StringIndexer(inputCol='Gate Location', outputCol='Gate_Location_Index')
New_Data = Gate_Location_indexer.fit(New_Data).transform(New_Data)

# On-board Service column
Onboard_Service_indexer = StringIndexer(inputCol='On-board Service', outputCol='Onboard_Service_Index')
New_Data = Onboard_Service_indexer.fit(New_Data).transform(New_Data)

# Seat Comfort column
Seat_Comfort_indexer = StringIndexer(inputCol='Seat Comfort', outputCol='Seat_Comfort_Index')
New_Data = Seat_Comfort_indexer.fit(New_Data).transform(New_Data)

# Leg Room Service column
Leg_Room_Service_indexer = StringIndexer(inputCol='Leg Room Service', outputCol='Leg_Room_Service_Index')
New_Data = Leg_Room_Service_indexer.fit(New_Data).transform(New_Data)

# Cleanliness column
Cleanliness_indexer = StringIndexer(inputCol='Cleanliness', outputCol='Cleanliness_Index')
New_Data = Cleanliness_indexer.fit(New_Data).transform(New_Data)

# Food and Drink column
Food_and_Drink_indexer = StringIndexer(inputCol='Food and Drink', outputCol='Food_and_Drink_Index')
New_Data = Food_and_Drink_indexer.fit(New_Data).transform(New_Data)

# In-flight Service column
Inflight_Service_indexer = StringIndexer(inputCol='In-flight Service', outputCol='Inflight_Service_Index')
New_Data = Inflight_Service_indexer.fit(New_Data).transform(New_Data)

# In-flight Entertainment column
Inflight_Entertaiment_indexer = StringIndexer(inputCol='In-flight Entertainment', outputCol='Inflight_Entertaiment_Index')
New_Data = Inflight_Entertaiment_indexer.fit(New_Data).transform(New_Data)

# In-flight Wifi Service column
On_board_Wifi_Service_indexer = StringIndexer(inputCol='In-flight Wifi Service', outputCol='On_board_Wifi_Service_Index')
New_Data = On_board_Wifi_Service_indexer.fit(New_Data).transform(New_Data)

# Baggage Handling column
Baggage_Handling_indexer = StringIndexer(inputCol='Baggage Handling', outputCol='Baggage_Handling_Index')
New_Data = Baggage_Handling_indexer.fit(New_Data).transform(New_Data)

In [17]:
# List all columns: the original ones plus the index columns just added.

New_Data.columns

['ID',
 'Gender',
 'Age',
 'Customer Type',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Departure Delay',
 'Arrival Delay',
 'Departure and Arrival Time Convenience',
 'Ease of Online Booking',
 'Check-in Service',
 'Online Boarding',
 'Gate Location',
 'On-board Service',
 'Seat Comfort',
 'Leg Room Service',
 'Cleanliness',
 'Food and Drink',
 'In-flight Service',
 'In-flight Wifi Service',
 'In-flight Entertainment',
 'Baggage Handling',
 'Satisfaction',
 'label',
 'Gender_Index',
 'Customer_Type_Index',
 'Type_of_Travel_Index',
 'Class_Index',
 'Arr_Dept_Time_Cov_Index',
 'Ease_of_Online_Booking_Index',
 'Check_in_Service_Index',
 'Online_Boarding_Index',
 'Gate_Location_Index',
 'Onboard_Service_Index',
 'Seat_Comfort_Index',
 'Leg_Room_Service_Index',
 'Cleanliness_Index',
 'Food_and_Drink_Index',
 'Inflight_Service_Index',
 'Inflight_Entertaiment_Index',
 'On_board_Wifi_Service_Index',
 'Baggage_Handling_Index']

In [18]:
# Model inputs: the raw numeric columns followed by the indexed columns.
# The order here fixes the position of each feature in the assembled vector.

feature_columns = [
    # numeric columns used as-is
    'Age',
    'Flight Distance',
    'Departure Delay',
    'Arrival Delay',
    # indexed passenger / trip attributes
    'Gender_Index',
    'Customer_Type_Index',
    'Type_of_Travel_Index',
    'Class_Index',
    # indexed service ratings
    'Arr_Dept_Time_Cov_Index',
    'Ease_of_Online_Booking_Index',
    'Check_in_Service_Index',
    'Online_Boarding_Index',
    'Gate_Location_Index',
    'Onboard_Service_Index',
    'Seat_Comfort_Index',
    'Leg_Room_Service_Index',
    'Cleanliness_Index',
    'Food_and_Drink_Index',
    'Inflight_Service_Index',
    'Inflight_Entertaiment_Index',
    'On_board_Wifi_Service_Index',
    'Baggage_Handling_Index',
]

In [19]:
# Show the first 5 rows of the selected feature columns.

New_Data.select(feature_columns).show(5)

+---+---------------+---------------+-------------+------------+-------------------+--------------------+-----------+-----------------------+----------------------------+----------------------+---------------------+-------------------+---------------------+------------------+----------------------+-----------------+--------------------+----------------------+---------------------------+---------------------------+----------------------+
|Age|Flight Distance|Departure Delay|Arrival Delay|Gender_Index|Customer_Type_Index|Type_of_Travel_Index|Class_Index|Arr_Dept_Time_Cov_Index|Ease_of_Online_Booking_Index|Check_in_Service_Index|Online_Boarding_Index|Gate_Location_Index|Onboard_Service_Index|Seat_Comfort_Index|Leg_Room_Service_Index|Cleanliness_Index|Food_and_Drink_Index|Inflight_Service_Index|Inflight_Entertaiment_Index|On_board_Wifi_Service_Index|Baggage_Handling_Index|
+---+---------------+---------------+-------------+------------+-------------------+--------------------+-----------+-

In [20]:
# Combine the feature columns into a single vector column named "features"
# (the input format the Spark ML classifiers below are configured to read),
# then preview the first 5 assembled vectors.

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(New_Data)
output.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[48.0,821.0,2.0,5...|
|[35.0,821.0,26.0,...|
|[41.0,853.0,0.0,0...|
|(22,[0,1,4,8,9,10...|
|(22,[0,1,3,8,10,1...|
+--------------------+
only showing top 5 rows



In [21]:
# Keep only the assembled feature vector and the target label; this is the
# dataset every classifier section starts from.

final_data = output.select("features", "label")
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [22]:
# Evaluation metrics shared by all models.

# Ranking-based metrics, computed from the rawPrediction column
auroc_evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
aupr_evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR",
)

# Metrics computed from the hard class predictions
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedPrecision",
)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="weightedRecall",
)
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)


**LOGISTIC REGRESSION MODEL**

In [23]:
# Alias the transformed dataset for the Logistic Regression section.
# This binds a second name to the same DataFrame; no data is copied.

lr_data = final_data
lr_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [24]:
# Split lr_data 70/30 into training and test sets, using a fixed seed.

train_data_lr, test_data_lr = lr_data.randomSplit([0.7, 0.3], seed=20)

In [25]:
# Initialise the Logistic Regression classifier; only the input columns are
# set, so all hyperparameters keep their library defaults.

lr = LogisticRegression(featuresCol="features", labelCol="label")

In [26]:
# Train the Logistic Regression model on the training split.

lr_model = lr.fit(train_data_lr)

In [27]:
# Score the held-out test split with the trained Logistic Regression model.

lr_predictions = lr_model.transform(test_data_lr)

In [28]:
# Evaluate the Logistic Regression model on the test predictions using the
# shared accuracy / weighted precision / weighted recall / F1 evaluators.

lr_accuracy = accuracy_evaluator.evaluate(lr_predictions)
lr_weightedPrecision = precision_evaluator.evaluate(lr_predictions)
lr_weightedRecall = recall_evaluator.evaluate(lr_predictions)
lr_f1 = f1_evaluator.evaluate(lr_predictions)

# The ": .2f" format spec (note the space) reserves a sign position, which
# is why non-negative values print with an extra leading space.
print(f"Logistic Regression Model Accuracy: {lr_accuracy: .2f}")
print(f"Logistic Regression Model Weighted Precision: {lr_weightedPrecision: .2f}")
print(f"Logistic Regression Model Weighted Recall: {lr_weightedRecall: .2f}")
print(f"Logistic Regression Model f1: {lr_f1: .2f}")

Logistic Regression Model Accuracy:  0.87
Logistic Regression Model Weighted Precision:  0.87
Logistic Regression Model Weighted Recall:  0.87
Logistic Regression Model f1:  0.87


In [29]:
# Preview the first three test rows: true label, predicted label, features.

lr_predicted= lr_predictions.select("label", "prediction", "features")
lr_predicted.show(3)

+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  0.0|       1.0|(22,[0,1,2,3,4,5,...|
|  1.0|       0.0|(22,[0,1,2,3,4,5,...|
+-----+----------+--------------------+
only showing top 3 rows



In [30]:
# Rank features by the magnitude of their Logistic Regression coefficient.
# NOTE(review): magnitudes are only comparable between features that share
# a scale — confirm before reading this as an importance ranking.

# Coefficient vector as a NumPy array, in feature_columns order
lr_coefficients = lr_model.coefficients.toArray()

# (feature name, |coefficient|) pairs
lr_feature_importance = [
    (name, weight) for name, weight in zip(feature_columns, abs(lr_coefficients))
]

# Largest magnitude first
sorted_lr_importances = sorted(
    lr_feature_importance, key=lambda pair: pair[1], reverse=True
)


In [31]:
# Tabulate the ranked (feature, |coefficient|) pairs for display.
lr_features_ranking = pd.DataFrame(sorted_lr_importances, columns=['Feature', 'LR_coefficient'])
lr_features_ranking

Unnamed: 0,Feature,LR_coefficient
0,Type_of_Travel_Index,2.878071
1,Customer_Type_Index,2.268346
2,On_board_Wifi_Service_Index,0.950984
3,Class_Index,0.806763
4,Online_Boarding_Index,0.339085
5,Leg_Room_Service_Index,0.334449
6,Onboard_Service_Index,0.277889
7,Inflight_Entertaiment_Index,0.268852
8,Check_in_Service_Index,0.195263
9,Gate_Location_Index,0.174557


***Applying Hyperparameter Tuning to the Logistic Regression Classifier***

In [32]:
# Parameter grid for Logistic Regression hyperparameter tuning:
# 2 regularisation strengths x 2 iteration limits = 4 combinations.

lr_paramGrid = (ParamGridBuilder()
            .addGrid(lr.regParam, [0.1, 0.01])
            .addGrid(lr.maxIter, [50, 100])
            .build())

In [33]:
# Define the cross-validator for the Logistic Regression: 5-fold CV over
# lr_paramGrid, selecting the combination with the best accuracy.
# A fixed seed (the same value used for the train/test split) pins the fold
# assignment so the selected model is reproducible between runs.

lr_crossval = CrossValidator(estimator=lr,
                             estimatorParamMaps=lr_paramGrid,
                             evaluator=accuracy_evaluator, numFolds=5,
                             seed=20)

In [34]:
# Run the cross-validated grid search for Logistic Regression on the
# training split.

lr_cv_model = lr_crossval.fit(train_data_lr)

In [35]:
# Extract the best Logistic Regression model found by cross-validation.

lr_best_model = lr_cv_model.bestModel

In [36]:
# Score the test split with the cross-validated Logistic Regression model.

lr_cv_predictions = lr_best_model.transform(test_data_lr)

In [37]:
# Evaluate the cross-validated Logistic Regression model on the test
# predictions with the same four metrics used for the untuned model.

lr_cv_accuracy = accuracy_evaluator.evaluate(lr_cv_predictions)
lr_cv_weightedPrecision = precision_evaluator.evaluate(lr_cv_predictions)
lr_cv_weightedRecall = recall_evaluator.evaluate(lr_cv_predictions)
lr_cv_f1 = f1_evaluator.evaluate(lr_cv_predictions)

print(f"CV Logistic Regression Model Accuracy: {lr_cv_accuracy: .2f}")
print(f"CV Logistic Regression Model Weighted Precision: {lr_cv_weightedPrecision: .2f}")
print(f"CV Logistic Regression Model Weighted Recall: {lr_cv_weightedRecall: .2f}")
print(f"CV Logistic Regression Model f1: {lr_cv_f1: .2f}")

CV Logistic Regression Model Accuracy:  0.87
CV Logistic Regression Model Weighted Precision:  0.87
CV Logistic Regression Model Weighted Recall:  0.87
CV Logistic Regression Model f1:  0.87


In [38]:
# Preview the first three rows of the cross-validated LR predictions.
# Reads lr_cv_predictions (the tuned model's output); the untuned model's
# predictions live in lr_predictions and are previewed separately.

cv_lr_predicted = lr_cv_predictions.select("label", "prediction", "features")
cv_lr_predicted.show(3)

+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  0.0|       1.0|(22,[0,1,2,3,4,5,...|
|  1.0|       0.0|(22,[0,1,2,3,4,5,...|
+-----+----------+--------------------+
only showing top 3 rows



In [39]:
# Rank features by coefficient magnitude for the cross-validated
# Logistic Regression model.

# Coefficient vector as a NumPy array, in feature_columns order
lr_cv_coefficients = lr_best_model.coefficients.toArray()

# (feature name, |coefficient|) pairs
lr_cv_feature_importance = [
    (name, weight) for name, weight in zip(feature_columns, abs(lr_cv_coefficients))
]

# Largest magnitude first
sorted_lr_cv_importances = sorted(
    lr_cv_feature_importance, key=lambda pair: pair[1], reverse=True
)

In [40]:
# Tabulate the ranked (feature, |coefficient|) pairs of the tuned LR model.
cv_lr_feature_ranking = pd.DataFrame(sorted_lr_cv_importances, columns=['Feature', 'CV_LR_coefficient'])
cv_lr_feature_ranking

Unnamed: 0,Feature,CV_LR_coefficient
0,Type_of_Travel_Index,2.324921
1,Customer_Type_Index,1.767525
2,On_board_Wifi_Service_Index,0.768404
3,Class_Index,0.748889
4,Leg_Room_Service_Index,0.289657
5,Online_Boarding_Index,0.273814
6,Inflight_Entertaiment_Index,0.248401
7,Onboard_Service_Index,0.235501
8,Check_in_Service_Index,0.157339
9,Ease_of_Online_Booking_Index,0.153196


**LINEAR SUPPORT VECTOR MACHINE (SVM) MODEL**

In [41]:
# Alias the transformed dataset for the SVM section (same DataFrame object;
# no data is copied).

svm_data = final_data
svm_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [42]:
# Split svm_data 70/30 into training and test sets, using the same seed as
# the other model sections.

train_data_svm, test_data_svm = svm_data.randomSplit([0.7, 0.3], seed=20)

In [43]:
# Initialise the linear SVM classifier with an explicit iteration cap and
# regularisation strength.

svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=100, regParam=0.01)

In [44]:
# Train the SVM model on the training split.

svm_model = svm.fit(train_data_svm)

In [45]:
# Score the held-out test split with the trained SVM model.

svm_predictions = svm_model.transform(test_data_svm)

In [46]:
# Evaluate the SVM model on the test predictions using the shared
# accuracy / weighted precision / weighted recall / F1 evaluators.

svm_accuracy = accuracy_evaluator.evaluate(svm_predictions)
svm_weightedPrecision = precision_evaluator.evaluate(svm_predictions)
svm_weightedRecall = recall_evaluator.evaluate(svm_predictions)
svm_f1 = f1_evaluator.evaluate(svm_predictions)

print(f"SVM Model Accuracy: {svm_accuracy: .2f}")
print(f"SVM Model Weighted Precision: {svm_weightedPrecision: .2f}")
print(f"SVM Model Weighted Recall: {svm_weightedRecall: .2f}")
print(f"SVM Model f1: {svm_f1: .2f}")

SVM Model Accuracy:  0.87
SVM Model Weighted Precision:  0.87
SVM Model Weighted Recall:  0.87
SVM Model f1:  0.87


In [47]:
# Preview the first three test rows: true label, SVM prediction, features.

svm_predicted= svm_predictions.select("label", "prediction", "features")
svm_predicted.show(3)

+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  0.0|       1.0|(22,[0,1,2,3,4,5,...|
|  1.0|       0.0|(22,[0,1,2,3,4,5,...|
+-----+----------+--------------------+
only showing top 3 rows



In [48]:
# Rank features by the magnitude of their linear-SVM coefficient.
# NOTE(review): magnitudes are only comparable between features that share
# a scale — confirm before reading this as an importance ranking.

# Coefficient vector as a NumPy array, in feature_columns order
svm_coefficients = svm_model.coefficients.toArray()

# (feature name, |coefficient|) pairs
svm_feature_importance = [
    (name, weight) for name, weight in zip(feature_columns, abs(svm_coefficients))
]

# Largest magnitude first
sorted_svm_importances = sorted(
    svm_feature_importance, key=lambda pair: pair[1], reverse=True
)

In [49]:
# Tabulate the ranked (feature, |coefficient|) pairs of the SVM model.
svm_feature_ranking = pd.DataFrame(sorted_svm_importances, columns=['Feature', 'SVM_coefficient'])
svm_feature_ranking

Unnamed: 0,Feature,SVM_coefficient
0,Type_of_Travel_Index,1.808478
1,Customer_Type_Index,1.438164
2,On_board_Wifi_Service_Index,0.532586
3,Class_Index,0.518739
4,Inflight_Entertaiment_Index,0.243146
5,Leg_Room_Service_Index,0.199133
6,Online_Boarding_Index,0.184856
7,Onboard_Service_Index,0.173198
8,Check_in_Service_Index,0.106209
9,Seat_Comfort_Index,0.105356


***Applying Hyperparameter Tuning to the Linear SVM Classifier***

In [50]:
# Parameter grid for SVM hyperparameter tuning:
# 3 iteration limits x 3 regularisation strengths = 9 combinations.

svm_param_grid = (ParamGridBuilder()
              .addGrid(svm.maxIter, [50, 100, 200])
              .addGrid(svm.regParam, [0.01, 0.1, 1.0])
              .build())

In [51]:
# Define the cross-validator for the SVM: 3-fold CV over svm_param_grid,
# selecting the combination with the best accuracy.
# A fixed seed (the same value used for the train/test split) pins the fold
# assignment so the selected model is reproducible between runs.
svm_crossval = CrossValidator(estimator=svm,
                          estimatorParamMaps=svm_param_grid,
                          evaluator=accuracy_evaluator,
                          numFolds=3,
                          seed=20)

In [52]:
# Run the cross-validated grid search for the SVM on the training split.

svm_cv_model = svm_crossval.fit(train_data_svm)

In [53]:
# Extract the best SVM model found by cross-validation.

svm_best_model = svm_cv_model.bestModel

In [54]:
# Score the test split with the cross-validated SVM model.

svm_cv_predictions = svm_best_model.transform(test_data_svm)

In [55]:
# Evaluate the cross-validated SVM model on the test predictions with the
# same four metrics used for the untuned model.

svm_cv_accuracy = accuracy_evaluator.evaluate(svm_cv_predictions)
svm_cv_weightedPrecision = precision_evaluator.evaluate(svm_cv_predictions)
svm_cv_weightedRecall = recall_evaluator.evaluate(svm_cv_predictions)
svm_cv_f1 = f1_evaluator.evaluate(svm_cv_predictions)

print(f"CV SVM Model Accuracy: {svm_cv_accuracy: .2f}")
print(f"CV SVM Model Weighted Precision: {svm_cv_weightedPrecision: .2f}")
print(f"CV SVM Model Weighted Recall: {svm_cv_weightedRecall: .2f}")
print(f"CV SVM Model f1: {svm_cv_f1: .2f}")

CV SVM Model Accuracy:  0.87
CV SVM Model Weighted Precision:  0.87
CV SVM Model Weighted Recall:  0.87
CV SVM Model f1:  0.87


In [56]:
# Preview the first three rows of the cross-validated SVM predictions.
# Reads svm_cv_predictions (the tuned model's output); the untuned model's
# predictions live in svm_predictions and are previewed separately.

cv_svm_predicted = svm_cv_predictions.select("label", "prediction", "features")
cv_svm_predicted.show(3)

+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  0.0|       1.0|(22,[0,1,2,3,4,5,...|
|  1.0|       0.0|(22,[0,1,2,3,4,5,...|
+-----+----------+--------------------+
only showing top 3 rows



In [57]:
# Rank features by coefficient magnitude for the cross-validated SVM model.

# Coefficient vector as a NumPy array, in feature_columns order
svm_cv_coefficients = svm_best_model.coefficients.toArray()

# (feature name, |coefficient|) pairs
svm_cv_feature_importance = [
    (name, weight) for name, weight in zip(feature_columns, abs(svm_cv_coefficients))
]

# Largest magnitude first
sorted_svm_cv_importances = sorted(
    svm_cv_feature_importance, key=lambda pair: pair[1], reverse=True
)

In [58]:
# Tabulate the ranked (feature, |coefficient|) pairs of the tuned SVM model.
cv_svm_feature_ranking = pd.DataFrame(sorted_svm_cv_importances, columns=['Feature', 'CV_SVM_coefficient'])
cv_svm_feature_ranking

Unnamed: 0,Feature,CV_SVM_coefficient
0,Type_of_Travel_Index,1.808478
1,Customer_Type_Index,1.438164
2,On_board_Wifi_Service_Index,0.532586
3,Class_Index,0.518739
4,Inflight_Entertaiment_Index,0.243146
5,Leg_Room_Service_Index,0.199133
6,Online_Boarding_Index,0.184856
7,Onboard_Service_Index,0.173198
8,Check_in_Service_Index,0.106209
9,Seat_Comfort_Index,0.105356


**RANDOM FOREST MODEL**

In [59]:
# Alias the transformed dataset for the Random Forest section (same
# DataFrame object; no data is copied).

rf_data = final_data
rf_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [60]:
# Split rf_data 70/30 into training and test sets, using the same seed as
# the other model sections.

train_data_rf, test_data_rf = rf_data.randomSplit([0.7, 0.3], seed=20)

In [61]:
# Initialise the Random Forest classifier with 100 trees.
# The forest is built with random sampling, so a fixed seed (the same value
# used for the train/test split) keeps the trained model, its metrics and
# its feature importances reproducible between runs.

rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=100, seed=20)

In [62]:
# Train the Random Forest model on the training split.

rf_model = rf.fit(train_data_rf)

In [63]:
# Score the held-out test split with the trained Random Forest model.

rf_predictions = rf_model.transform(test_data_rf)

In [64]:
# Evaluate the Random Forest model on the test predictions using the shared
# accuracy / weighted precision / weighted recall / F1 evaluators.

rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)
rf_weightedPrecision = precision_evaluator.evaluate(rf_predictions)
rf_weightedRecall = recall_evaluator.evaluate(rf_predictions)
rf_f1 = f1_evaluator.evaluate(rf_predictions)

print(f"Random Forest Model Accuracy: {rf_accuracy: .2f}")
print(f"Random Forest Model Weighted Precision: {rf_weightedPrecision: .2f}")
print(f"Random Forest Model Weighted Recall: {rf_weightedRecall: .2f}")
print(f"Random Forest Model f1: {rf_f1: .2f}")

Random Forest Model Accuracy:  0.93
Random Forest Model Weighted Precision:  0.93
Random Forest Model Weighted Recall:  0.93
Random Forest Model f1:  0.93


In [65]:
# Preview the first three test rows: true label, RF prediction, features.

rf_predicted= rf_predictions.select("label", "prediction", "features")
rf_predicted.show(3)

+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  0.0|       0.0|(22,[0,1,2,3,4,5,...|
|  1.0|       0.0|(22,[0,1,2,3,4,5,...|
+-----+----------+--------------------+
only showing top 3 rows



In [66]:
# Rank features by the importance score reported by the Random Forest.

# Importance vector from the trained model, in feature_columns order
rf_importances = rf_model.featureImportances

# (feature name, importance) pairs
rf_feature_importances = [
    (name, score) for name, score in zip(feature_columns, rf_importances)
]

# Most important first
sorted_rf_importances = sorted(
    rf_feature_importances, key=lambda pair: pair[1], reverse=True
)

In [67]:
# Tabulate the ranked (feature, importance) pairs of the Random Forest.
rf_feature_ranking = pd.DataFrame(sorted_rf_importances, columns=['Feature', 'RF_Importance'])
rf_feature_ranking

Unnamed: 0,Feature,RF_Importance
0,Online_Boarding_Index,0.268478
1,On_board_Wifi_Service_Index,0.146298
2,Class_Index,0.144601
3,Type_of_Travel_Index,0.127579
4,Inflight_Entertaiment_Index,0.076331
5,Leg_Room_Service_Index,0.039422
6,Seat_Comfort_Index,0.036813
7,Ease_of_Online_Booking_Index,0.035299
8,Onboard_Service_Index,0.029312
9,Customer_Type_Index,0.022768


***Applying Hyperparameter Tuning to the Random Forest***

In [68]:
# Parameter grid for Random Forest hyperparameter tuning:
# 3 tree counts x 3 maximum depths x 2 bin counts = 18 combinations.

rf_param_grid = (ParamGridBuilder()
              .addGrid(rf.numTrees, [50, 100, 150])
              .addGrid(rf.maxDepth, [5, 10, 15])
              .addGrid(rf.maxBins, [32, 64])
              .build())

In [69]:
# Define the cross-validator for the Random Forest: 3-fold CV over
# rf_param_grid, selecting the combination with the best accuracy.
# A fixed seed (the same value used for the train/test split) pins the fold
# assignment so the selected model is reproducible between runs.

rf_crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=rf_param_grid,
                          evaluator=accuracy_evaluator,
                          numFolds=3,
                          seed=20)

In [70]:
# Training the model: run the cross-validated grid search on the random
# forest training split.
# NOTE(review): the recorded run of this cell ended with a
# ConnectionRefusedError raised from PySpark's accumulator server.
# Presumably the local Spark JVM went down during the search; confirm the
# session has enough memory before re-running.

rf_cv_model = rf_crossval.fit(dataset=train_data_rf)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 62157)
Traceback (most recent call last):
  File "C:\Users\eniha\anaconda3\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "C:\Users\eniha\anaconda3\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "C:\Users\eniha\anaconda3\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "C:\Users\eniha\anaconda3\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\Users\eniha\anaconda3\Lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "C:\Users\eniha\anaconda3\Lib\site-packages\pyspark\accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "C:\Users\eniha\anaconda3\Lib\site-packages\p

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
# The best model: the random forest the cross-validator selected

best_rf_model = rf_cv_model.bestModel

# Score the held-out test split with the selected model

rf_cv_predictions = best_rf_model.transform(dataset=test_data_rf)

In [None]:
# Evaluating the cross-validated random forest model on its test predictions.
# The four evaluators are applied in a fixed order and their results are
# unpacked straight into the metric variables.

rf_cv_accuracy, rf_cv_weightedPrecision, rf_cv_weightedRecall, rf_cv_f1 = (
    metric_evaluator.evaluate(rf_cv_predictions)
    for metric_evaluator in (
        accuracy_evaluator,
        precision_evaluator,
        recall_evaluator,
        f1_evaluator,
    )
)

# Report each metric rounded to two decimals

print(f"CV RF Model Accuracy: {rf_cv_accuracy: .2f}")
print(f"CV RF Model Weighted Precision: {rf_cv_weightedPrecision: .2f}")
print(f"CV RF Model Weighted Recall: {rf_cv_weightedRecall: .2f}")
print(f"CV RF Model f1: {rf_cv_f1: .2f}")

In [None]:
# Preview the first three rows of the cross-validated random forest
# predictions: true label, predicted label and feature vector.

cv_rf_predicted = rf_cv_predictions.select(["label", "prediction", "features"])
cv_rf_predicted.show(n=3)

In [None]:
# Extract the feature importances reported by the tuned random forest.
# cv_rf_feature_importances keeps the Spark ML vector as the model returns it.
cv_rf_feature_importances = best_rf_model.featureImportances

# Convert the vector to a dense NumPy array before pairing, the same way the
# GBT cells of this notebook do, instead of iterating the vector implicitly.
cv_rf_importance_values = cv_rf_feature_importances.toArray()

# zip() silently stops at the shorter input. A length mismatch would drop or
# mislabel features in the ranking, so fail loudly instead.
if len(feature_columns) != len(cv_rf_importance_values):
    raise ValueError(
        f"feature_columns has {len(feature_columns)} names but the model "
        f"reports {len(cv_rf_importance_values)} importances"
    )

# Matching features to their importances
# (assumes feature_columns is in the order used to assemble "features" - TODO confirm)
cv_rf_importances = list(zip(feature_columns, cv_rf_importance_values))

# Sort the (feature, importance) pairs in descending order of importance
sorted_cv_rf_importances = sorted(cv_rf_importances, key=lambda x: x[1], reverse=True)

In [None]:
# Tabulate the tuned random forest's sorted (feature, importance) pairs.
# Leaving the frame as the last expression makes the notebook render it.
cv_rf_features_ranking = pd.DataFrame(
    data=sorted_cv_rf_importances,
    columns=['Feature', 'CV_RF_Importance'],
)
cv_rf_features_ranking

**GRADIENT-BOOSTED TREE (GBT) CLASSIFIER MODEL**

In [None]:
#Reusing the transformed dataset for the GBT model.
#This binds a second name to final_data; the assignment itself copies no data.

gbt_data = final_data
#Print the column names and types as a check before modelling
gbt_data.printSchema()

In [None]:
# Split the data into training (weight 0.7) and test (weight 0.3) sets.
# The fixed seed keeps the random split repeatable.

train_data_gbt, test_data_gbt = gbt_data.randomSplit(weights=[0.7, 0.3], seed=20)

In [None]:
# Initialize the Gradient-Boosted Trees classifier. It reads the assembled
# "features" column, learns the "label" column and is capped at 50 iterations.

gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    maxIter=50,
)

In [None]:
# Training the GBT model on the training split

gbt_model = gbt.fit(dataset=train_data_gbt)

In [None]:
# Predictions of the fitted GBT model on the held-out test split

gbt_predictions = gbt_model.transform(dataset=test_data_gbt)

In [None]:
# Evaluating the GBT model on its test predictions.
# The four evaluators are applied in a fixed order and their results are
# unpacked straight into the metric variables.

gbt_accuracy, gbt_weightedPrecision, gbt_weightedRecall, gbt_f1 = (
    metric_evaluator.evaluate(gbt_predictions)
    for metric_evaluator in (
        accuracy_evaluator,
        precision_evaluator,
        recall_evaluator,
        f1_evaluator,
    )
)

# Report each metric rounded to two decimals

print(f"Gradient Boost Tree Model Accuracy: {gbt_accuracy: .2f}")
print(f"Gradient Boost Tree Model Weighted Precision: {gbt_weightedPrecision: .2f}")
print(f"Gradient Boost Tree Model Weighted Recall: {gbt_weightedRecall: .2f}")
print(f"Gradient Boost Tree Model f1: {gbt_f1: .2f}")

In [None]:
# Preview the first three rows of the GBT predictions: true label,
# predicted label and feature vector.

gbt_predicted = gbt_predictions.select(["label", "prediction", "features"])
gbt_predicted.show(n=3)

In [None]:
# Extracting feature importances from the GBT model as a dense NumPy array
gbt_importances = gbt_model.featureImportances.toArray()

# zip() silently stops at the shorter input. A length mismatch would drop or
# mislabel features in the ranking, so fail loudly instead.
if len(feature_columns) != len(gbt_importances):
    raise ValueError(
        f"feature_columns has {len(feature_columns)} names but the model "
        f"reports {len(gbt_importances)} importances"
    )

# Combine with feature names
# (assumes feature_columns is in the order used to assemble "features" - TODO confirm)
gbt_feature_importance = list(zip(feature_columns, gbt_importances))

# Sorting the (feature, importance) pairs in descending order of importance
gbt_sorted_importance = sorted(gbt_feature_importance, key=lambda x: x[1], reverse=True)

In [None]:
# Tabulate the GBT model's sorted (feature, importance) pairs. Leaving the
# frame as the last expression makes the notebook render it.
gbt_feature_ranking = pd.DataFrame(
    data=gbt_sorted_importance,
    columns=['Feature', 'GBT_Importance'],
)
gbt_feature_ranking

*Hyperparameter tuning for the Gradient Boosted Tree Classifier*

In [None]:
# Parameter grid for the GBT hyperparameter tuning: every combination of the
# maxIter, maxDepth and stepSize values listed below
# (3 x 3 x 2 = 18 candidate settings).

gbt_grid_builder = ParamGridBuilder()
gbt_grid_builder.addGrid(gbt.maxIter, [10, 50, 100])
gbt_grid_builder.addGrid(gbt.maxDepth, [3, 5, 7])
gbt_grid_builder.addGrid(gbt.stepSize, [0.1, 0.2])
gbt_param_grid = gbt_grid_builder.build()

In [None]:
# Defining the cross-validator for the GBT: 3-fold cross-validation over
# every setting in gbt_param_grid, with candidates scored by accuracy_evaluator.
#
# The seed pins the random fold assignment so the selected best model is
# reproducible between runs. Without it PySpark falls back to a default seed
# that is not guaranteed to be stable across Python sessions. The value 20
# mirrors the seed used for the GBT train/test split.

gbt_crossval = CrossValidator(estimator=gbt,
                          estimatorParamMaps=gbt_param_grid,
                          evaluator=accuracy_evaluator,
                          numFolds=3,
                          seed=20)

In [None]:
# Performing the GBT cross-validation: run the grid search on the GBT
# training split.

gbt_cv_model = gbt_crossval.fit(dataset=train_data_gbt)

In [None]:
# Extracting the best GBT model selected by the cross-validator

gbt_best_model = gbt_cv_model.bestModel

# Score the held-out test split with the selected model

gbt_cv_predictions = gbt_best_model.transform(dataset=test_data_gbt)

In [None]:
# Evaluating the cross-validated GBT model on its test predictions.
# The four evaluators are applied in a fixed order and their results are
# unpacked straight into the metric variables.

gbt_cv_accuracy, gbt_cv_weightedPrecision, gbt_cv_weightedRecall, gbt_cv_f1 = (
    metric_evaluator.evaluate(gbt_cv_predictions)
    for metric_evaluator in (
        accuracy_evaluator,
        precision_evaluator,
        recall_evaluator,
        f1_evaluator,
    )
)

# Report each metric rounded to two decimals

print(f"CV GBT Model Accuracy: {gbt_cv_accuracy: .2f}")
print(f"CV GBT Model Weighted Precision: {gbt_cv_weightedPrecision: .2f}")
print(f"CV GBT Model Weighted Recall: {gbt_cv_weightedRecall: .2f}")
print(f"CV GBT Model f1: {gbt_cv_f1: .2f}")

In [None]:
# Preview the first three rows of the cross-validated GBT predictions:
# true label, predicted label and feature vector.

cv_gbt_predicted = gbt_cv_predictions.select(["label", "prediction", "features"])
cv_gbt_predicted.show(n=3)

In [None]:
# Extracting feature importances from the tuned GBT model as a dense NumPy array
cv_gbt_importances = gbt_best_model.featureImportances.toArray()

# zip() silently stops at the shorter input. A length mismatch would drop or
# mislabel features in the ranking, so fail loudly instead.
if len(feature_columns) != len(cv_gbt_importances):
    raise ValueError(
        f"feature_columns has {len(feature_columns)} names but the model "
        f"reports {len(cv_gbt_importances)} importances"
    )

# Combine with feature names
# (assumes feature_columns is in the order used to assemble "features" - TODO confirm)
cv_gbt_feature_importance = list(zip(feature_columns, cv_gbt_importances))

# Sorting the (feature, importance) pairs in descending order of importance
cv_gbt_sorted_importance = sorted(cv_gbt_feature_importance, key=lambda x: x[1], reverse=True)

In [None]:
# Tabulate the tuned GBT model's sorted (feature, importance) pairs. Leaving
# the frame as the last expression makes the notebook render it.
cv_gbt_feature_ranking = pd.DataFrame(
    data=cv_gbt_sorted_importance,
    columns=['Feature', 'GBT_CV_Importance'],
)
cv_gbt_feature_ranking