In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Dataset Import

In [2]:
df_train = pd.read_csv('/kaggle/input/airline-passenger-satisfaction/train.csv', index_col=0)
df_test = pd.read_csv('/kaggle/input/airline-passenger-satisfaction/test.csv', index_col=0)

**Gender**: Gender of the passengers (Female, Male)

**Customer Type**: The customer type (Loyal customer, disloyal customer)

**Age**: The actual age of the passengers

**Type of Travel**: Purpose of the flight of the passengers (Personal Travel, Business Travel)

**Class**: Travel class in the plane of the passengers (Business, Eco, Eco Plus)

**Flight distance**: The flight distance of this journey

**Inflight wifi service**: Satisfaction level of the inflight wifi service (0:Not Applicable;1-5)

**Departure/Arrival time convenient**: Satisfaction level of Departure/Arrival time convenient

**Ease of Online booking**: Satisfaction level of online booking

**Gate location**: Satisfaction level of Gate location

**Food and drink**: Satisfaction level of Food and drink

**Online boarding**: Satisfaction level of online boarding

**Seat comfort**: Satisfaction level of Seat comfort

**Inflight entertainment**: Satisfaction level of inflight entertainment

**On-board service**: Satisfaction level of On-board service

**Leg room service**: Satisfaction level of Leg room service

**Baggage handling**: Satisfaction level of baggage handling

**Check-in service**: Satisfaction level of Check-in service

**Inflight service**: Satisfaction level of inflight service

**Cleanliness**: Satisfaction level of Cleanliness

**Departure Delay in Minutes**: Minutes delayed when departure

**Arrival Delay in Minutes**: Minutes delayed when Arrival

**Satisfaction**: Airline satisfaction level(Satisfaction, neutral or dissatisfaction)

## Data Understanding

In [3]:
df_train.head(2).T

Unnamed: 0,0,1
id,70172,5047
Gender,Male,Male
Customer Type,Loyal Customer,disloyal Customer
Age,13,25
Type of Travel,Personal Travel,Business travel
Class,Eco Plus,Business
Flight Distance,460,235
Inflight wifi service,3,3
Departure/Arrival time convenient,4,2
Ease of Online booking,3,3


In [4]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,103904.0,64924.210502,37463.812252,1.0,32533.75,64856.5,97368.25,129880.0
Age,103904.0,39.379706,15.114964,7.0,27.0,40.0,51.0,85.0
Flight Distance,103904.0,1189.448375,997.147281,31.0,414.0,843.0,1743.0,4983.0
Inflight wifi service,103904.0,2.729683,1.327829,0.0,2.0,3.0,4.0,5.0
Departure/Arrival time convenient,103904.0,3.060296,1.525075,0.0,2.0,3.0,4.0,5.0
Ease of Online booking,103904.0,2.756901,1.398929,0.0,2.0,3.0,4.0,5.0
Gate location,103904.0,2.976883,1.277621,0.0,2.0,3.0,4.0,5.0
Food and drink,103904.0,3.202129,1.329533,0.0,2.0,3.0,4.0,5.0
Online boarding,103904.0,3.250375,1.349509,0.0,2.0,3.0,4.0,5.0
Seat comfort,103904.0,3.439396,1.319088,0.0,2.0,4.0,5.0,5.0


In [5]:
missing = df_train.isna().sum()
missing[missing > 0]

Arrival Delay in Minutes    310
dtype: int64

Less than **0.3%** of the values in the `Arrival Delay in Minutes` column are missing.

## Data Cleaning

In [6]:
cols = [column.lower().replace(' ', '_') for column in df_train.columns]

df_train.columns = cols
df_test.columns = cols

## Data Insights

In [7]:
df_train[df_train.arrival_delay_in_minutes.isna() == True].head().T

Unnamed: 0,213,1124,1529,2004,2108
id,49608,73442,71178,72940,116374
gender,Female,Male,Male,Female,Female
customer_type,Loyal Customer,Loyal Customer,Loyal Customer,disloyal Customer,Loyal Customer
age,38,53,39,26,24
type_of_travel,Business travel,Personal Travel,Business travel,Business travel,Personal Travel
class,Eco,Eco,Business,Business,Eco
flight_distance,109,1012,733,1035,417
inflight_wifi_service,5,3,2,3,2
departure/arrival_time_convenient,3,2,5,3,1
ease_of_online_booking,3,3,5,3,2


In [8]:
(df_train.departure_delay_in_minutes - df_train.arrival_delay_in_minutes).mean().round(3)

-0.431

In [9]:
# Create the Box Plot
fig = go.Figure()

# Add two box plots for the specified columns
fig.add_trace(go.Box(y=df_train['departure_delay_in_minutes'], name='Departure Delay'))
fig.add_trace(go.Box(y=df_train['arrival_delay_in_minutes'], name='Arrival Delay'))

# Customize the layout (optional)
fig.update_layout(
    title='Box Plot of Departure and Arrival Delays',
    yaxis_title='Minutes',
    boxmode='group',
    width=600
)

# Set the y-axis range to 0 to 400
fig.update_yaxes(range=[0, 50])

# Show the plot
fig.show()

In [10]:
# Create a histogram for 'satisfaction' column
fig = px.histogram(
    df_train, x='satisfaction', 
    title='Histogram of Satisfaction',
    color='satisfaction'
)

fig.update_layout(
    xaxis_title='Satisfaction', 
    yaxis_title='Frequency',
    width=600
)

# Show the plot
fig.show()

In [11]:
# Create a bar graph for 'age' vs 'satisfaction' with different colors
fig = px.histogram(
    df_train, x='age', color='satisfaction', 
    title='Bar Graph of Age vs Satisfaction'
)

fig.update_layout(xaxis_title='Age', yaxis_title='Count')

# Show the plot
fig.show()

## Missing Values

In [12]:
df_train.arrival_delay_in_minutes.median()

0.0

* Fill missing values with `median` as the distribution of feature `arrival_delay_in_minutes` is skewed.

In [13]:
df_train.arrival_delay_in_minutes.fillna(0, inplace=True)

## Encoding Categorical Features

In [14]:
df_train.select_dtypes(include='object').columns

Index(['gender', 'customer_type', 'type_of_travel', 'class', 'satisfaction'], dtype='object')

In [15]:
categorical_cols = [
    'gender', 
    'customer_type', 
    'type_of_travel', 
    'class',
    'inflight_wifi_service',
    'departure/arrival_time_convenient', 
    'ease_of_online_booking',
    'gate_location', 
    'food_and_drink', 
    'online_boarding', 
    'seat_comfort',
    'inflight_entertainment', 
    'on-board_service', 
    'leg_room_service',
    'baggage_handling', 
    'checkin_service', 
    'inflight_service',
    'cleanliness'
]

## Data Processing for Model

In [16]:
encoder = OneHotEncoder(sparse_output=False)

encoder.fit(df_train[categorical_cols])
encoded_columns = encoder.transform(df_train[categorical_cols])

encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

df_train.drop(columns=categorical_cols, inplace=True)

df = pd.concat([df_train, encoded_df], axis=1)

In [17]:
df.columns

Index(['id', 'age', 'flight_distance', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes', 'satisfaction', 'gender_Female',
       'gender_Male', 'customer_type_Loyal Customer',
       'customer_type_disloyal Customer', 'type_of_travel_Business travel',
       'type_of_travel_Personal Travel', 'class_Business', 'class_Eco',
       'class_Eco Plus', 'inflight_wifi_service_0', 'inflight_wifi_service_1',
       'inflight_wifi_service_2', 'inflight_wifi_service_3',
       'inflight_wifi_service_4', 'inflight_wifi_service_5',
       'departure/arrival_time_convenient_0',
       'departure/arrival_time_convenient_1',
       'departure/arrival_time_convenient_2',
       'departure/arrival_time_convenient_3',
       'departure/arrival_time_convenient_4',
       'departure/arrival_time_convenient_5', 'ease_of_online_booking_0',
       'ease_of_online_booking_1', 'ease_of_online_booking_2',
       'ease_of_online_booking_3', 'ease_of_online_booking_4',
       'ease_of_online_booking

In [18]:
X = df.drop(columns=['id', 'satisfaction']).reset_index(drop=True)
y = df.satisfaction.reset_index(drop=True)

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)

In [20]:
X_train.head()

Unnamed: 0,age,flight_distance,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,customer_type_Loyal Customer,customer_type_disloyal Customer,type_of_travel_Business travel,type_of_travel_Personal Travel,...,inflight_service_2,inflight_service_3,inflight_service_4,inflight_service_5,cleanliness_0,cleanliness_1,cleanliness_2,cleanliness_3,cleanliness_4,cleanliness_5
98540,26,3750,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
39317,59,1440,7,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
61539,31,1417,0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
74319,26,1543,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7966,55,581,0,3.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## K-Nearest Neighbour Model

In [21]:
model = KNeighborsClassifier()

model.fit(X_train, y_train)

accuracy = model.score(X_valid, y_valid)

In [22]:
print('Accuracy:', accuracy)

Accuracy: 0.7015543044126846


## Random Forest Model

In [23]:
model = RandomForestClassifier()

model.fit(X_train, y_train)

accuracy = model.score(X_valid, y_valid)

In [24]:
print('Accuracy:', accuracy)

Accuracy: 0.9617920215581541


* Calculating the accuracy on validation split of training data we see that `RandomForestClassifier` gives better results

## Hyperparameter Tuning Random Forest Model

In [25]:
parameter_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(max_features='sqrt'), parameter_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

In [26]:
accuracy = best_model.score(X_valid, y_valid)

In [27]:
print('Accuracy:', accuracy)

Accuracy: 0.9359511091862759
