Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

# Airline Passenger Satisfaction Classification

https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction/data


<p>
    <img src="https://www.travelandleisure.com/thmb/h97kSvljd2QYH2nUy3Y9ZNgO_pw=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/plane-data-BUSYROUTES1217-f4f84b08d47f4951b11c148cee2c3dea.jpg" width=600>
</p>

## 1. Load Libraries

In [1]:
import os
import sys
import warnings
# Suppress all warnings for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

# Relative path to the parent of the current working directory. It is appended
# to the module search path here and reused later as a prefix for the data paths.
# NOTE(review): presumably this is what makes the project-local `tools` and
# `trainer_hub` packages importable in later cells — confirm the repo layout.
path_append = "../"
sys.path.append(path_append)  # Add the parent directory to the import search path.

In [2]:
import torch
import pandas as pd

## 2. Load Dataset

In [3]:
# Load the train/test CSV splits. The data directory sits one level above
# `path_append`, i.e. two levels above the notebook's working directory.
data_dir = path_append + '../data/Airline Customer Satisfaction/'
df_train = pd.read_csv(data_dir + 'train.csv')
df_test = pd.read_csv(data_dir + 'test.csv')

# A cell only renders its final bare expression, so a standalone
# `df_train.head()` followed by `df_test.head()` would silently drop the
# training preview. Display both previews explicitly instead.
display(df_train.head(), df_test.head())

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [4]:
# Summarise the raw training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [5]:
# Count how many training rows fall into each 'satisfaction' label to check
# the class balance before preprocessing.
df_train[['satisfaction']].value_counts()

satisfaction           
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

## 3. Preprocessing

In [6]:
from tools.preprocessing.data_frame import process_dataframe

target_columns = ['satisfaction']
drop_columns = ['Unnamed: 0','id']

# Remember the size of each split so the combined frame can be cut back apart
# by position after processing.
# NOTE: this cell overwrites df_train/df_test with their processed versions,
# so re-running it requires re-running the loading cell first.
df_train_length = len(df_train)
df_test_length = len(df_test)

# Stack train on top of test so both splits go through the same processing.
# NOTE(review): presumably process_dataframe fits its encoders/scalers on the
# frame it receives; if so, test-set statistics influence the training
# features. Confirm whether fitting on the training split only is required.
df = pd.concat([df_train, df_test], axis=0)

# Process the combined dataframe
df, description_train = process_dataframe(df, target_columns, drop_columns=drop_columns)

# The positional split below is only correct if processing kept every row.
# Fail loudly instead of silently mis-assigning rows between the splits.
assert len(df) == df_train_length + df_test_length, (
    f"process_dataframe changed the row count: expected "
    f"{df_train_length + df_test_length}, got {len(df)}"
)

# Split the dataframe back into training and test sets
df_train = df.iloc[:df_train_length]
df_test = df.iloc[df_train_length:]

num_features = description_train['num_features']
num_classes = description_train['num_classes']

print(f"Number of features after scaling: {num_features}")
print(f"Number of classes after scaling: {num_classes}")
description_train

Column 'Gender' has 2 unique values.
Column 'Customer Type' has 2 unique values.
Column 'Type of Travel' has 2 unique values.
Column 'Class' has 3 unique values.
Column 'satisfaction' has 2 unique values.
Number of features after scaling: 24
Number of classes after scaling: 1


{'num_features': 24,
 'num_classes': 1,
 'encoded_columns': Index(['Gender', 'Customer Type', 'Type of Travel', 'Class'], dtype='object'),
 'target_encoded_columns': Index(['satisfaction'], dtype='object'),
 'scalers': {'Age': 'minmax',
  'Arrival Delay in Minutes': 'robust',
  'Baggage handling': 'standard',
  'Checkin service': 'robust',
  'Cleanliness': 'minmax',
  'Departure Delay in Minutes': 'robust',
  'Departure/Arrival time convenient': 'minmax',
  'Ease of Online booking': 'minmax',
  'Flight Distance': 'robust',
  'Food and drink': 'minmax',
  'Gate location': 'minmax',
  'Inflight entertainment': 'minmax',
  'Inflight service': 'standard',
  'Inflight wifi service': 'minmax',
  'Leg room service': 'minmax',
  'On-board service': 'minmax',
  'Online boarding': 'minmax',
  'Seat comfort': 'minmax'},
 'target_scalers': {}}

In [7]:
# Inspect the processed training frame (column dtypes and non-null counts).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  float64
 1   Customer Type                      103904 non-null  float64
 2   Age                                103904 non-null  float64
 3   Type of Travel                     103904 non-null  float64
 4   Flight Distance                    103904 non-null  float64
 5   Inflight wifi service              103904 non-null  float64
 6   Departure/Arrival time convenient  103904 non-null  float64
 7   Ease of Online booking             103904 non-null  float64
 8   Gate location                      103904 non-null  float64
 9   Food and drink                     103904 non-null  float64
 10  Online boarding                    103904 non-null  float64
 11  Seat comfort                       1039

In [8]:
# Inspect the processed test frame (column dtypes and non-null counts).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25583 entries, 103904 to 129486
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             25583 non-null  float64
 1   Customer Type                      25583 non-null  float64
 2   Age                                25583 non-null  float64
 3   Type of Travel                     25583 non-null  float64
 4   Flight Distance                    25583 non-null  float64
 5   Inflight wifi service              25583 non-null  float64
 6   Departure/Arrival time convenient  25583 non-null  float64
 7   Ease of Online booking             25583 non-null  float64
 8   Gate location                      25583 non-null  float64
 9   Food and drink                     25583 non-null  float64
 10  Online boarding                    25583 non-null  float64
 11  Seat comfort                       25583 non-nul

In [9]:
# Custom Dataset Class
class AirlineDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label row.

    The containers are stored as given (no copy, no conversion); tensors are
    created lazily, one sample at a time, when an item is requested.
    """

    def __init__(self, x, y):
        """Keep references to the feature container ``x`` and label container ``y``."""
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, defined by the feature container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``.

        Features are converted to a float32 tensor and the label to an
        int64 (long) tensor.
        """
        features = torch.tensor(self.x[index], dtype=torch.float32)
        target = torch.tensor(self.y[index], dtype=torch.long)
        return features, target

In [10]:

# Turn the processed train and test frames into NumPy arrays: every column
# except the last is treated as a feature, and the last column is treated as
# the label (sliced with -1: so it stays two-dimensional).
X_train = df_train.iloc[:, :-1].values
y_train = df_train.iloc[:, -1:].values
X_test = df_test.iloc[:, :-1].values
y_test = df_test.iloc[:, -1:].values

# Wrap each split in the dataset class consumed by the trainer.
trainset, testset = AirlineDataset(X_train, y_train), AirlineDataset(X_test, y_test)

# Report the sample count and feature width of each split.
print(f"Labeled Trainset Shape: {len(trainset)}, {trainset.x.shape[1]}")
print(f"Labeled Testset Shape: {len(testset)}, {testset.x.shape[1]}")

Labeled Trainset Shape: 103904, 24
Labeled Testset Shape: 25583, 24


In [11]:
# Derive the model input/output widths from the dataset arrays themselves.
# `num_classes` here is the number of label columns (the width of `y`), not
# the number of distinct label values.
num_features, num_classes = trainset.x.shape[1], trainset.y.shape[1]

(num_features, num_classes)

(24, 1)

## 4. Training

In [None]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Describe the dataset to the trainer: task type, observation shape and label width.
data_config = DataConfig(
    dataset_name = 'airline_satisfaction',
    task_type='binary_classification',
    obs_shape=[num_features],
    label_size=num_classes,
)

# Model/algorithm hyper-parameters: TabNet core network without an encoder,
# MAE as the error function and four layers in the core network config.
ml_params = MLParameters(
    ccnet_network = 'tabnet',
    encoder_network = 'none',
)
ml_params.algorithm.error_function = 'mae'
ml_params.model.ccnet_config.num_layers = 4

# Build the trainer with console printing enabled and Weights & Biases logging disabled.
trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=True,
    use_wandb=False,
)

In [13]:
# Run training. The test set is passed alongside the training set, and the
# captured log shows test metrics (accuracy/precision/recall/F1) printed
# periodically during training.
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/1623 [00:00<?, ?it/s]

[0/100][50/1623][Time 5.85]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.2599	Gen: 0.5568	Rec: 0.5585	E: 0.2583	R: 0.2616	P: 0.8554
--------------------Test Metrics------------------------
accuracy: 0.8047
precision: 0.8788
recall: 0.7733
f1_score: 0.8227

[0/100][100/1623][Time 5.27]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.1387	Gen: 0.3156	Rec: 0.3139	E: 0.1404	R: 0.1370	P: 0.4908
--------------------Test Metrics------------------------
accuracy: 0.8164
precision: 0.8806
recall: 0.7919
f1_score: 0.8339

[0/100][150/1623][Time 5.21]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.1103	Gen: 0.2595	Rec: 0.2585	E: 0.1114	R: 0.1093	P: 0.4076
--------------------Test Metrics------------------------
accur

Iterations:   0%|          | 0/1623 [00:00<?, ?it/s]

[1/100][27/1623][Time 5.42]
Unified LR across all optimizers: 0.00018535742845428288
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0395	Gen: 0.1179	Rec: 0.1171	E: 0.0403	R: 0.0387	P: 0.1956
--------------------Test Metrics------------------------
accuracy: 0.9258
precision: 0.9281
recall: 0.9348
f1_score: 0.9314

[1/100][77/1623][Time 5.31]
Unified LR across all optimizers: 0.00018493111819882223
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0384	Gen: 0.1200	Rec: 0.1186	E: 0.0398	R: 0.0370	P: 0.2002
--------------------Test Metrics------------------------
accuracy: 0.9141
precision: 0.9286
recall: 0.9155
f1_score: 0.9220

[1/100][127/1623][Time 5.25]
Unified LR across all optimizers: 0.00018450578842974107
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0384	Gen: 0.1192	Rec: 0.1183	E: 0.0393	R: 0.0375	P: 0.1991
--------------------Test Metrics------------------------
accur

Iterations:   0%|          | 0/1623 [00:00<?, ?it/s]

[2/100][4/1623][Time 5.28]
Unified LR across all optimizers: 0.00017219082057399394
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0320	Gen: 0.1046	Rec: 0.1034	E: 0.0332	R: 0.0309	P: 0.1760
--------------------Test Metrics------------------------
accuracy: 0.8750
precision: 0.8650
recall: 0.9338
f1_score: 0.8981

[2/100][54/1623][Time 5.19]
Unified LR across all optimizers: 0.00017179479267633146
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0293	Gen: 0.1016	Rec: 0.1004	E: 0.0305	R: 0.0282	P: 0.1727
--------------------Test Metrics------------------------
accuracy: 0.9141
precision: 0.8912
recall: 0.9562
f1_score: 0.9225

[2/100][104/1623][Time 5.20]
Unified LR across all optimizers: 0.00017139967561755819
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0313	Gen: 0.0999	Rec: 0.0995	E: 0.0317	R: 0.0309	P: 0.1681
--------------------Test Metrics------------------------
accura

Iterations:   0%|          | 0/1623 [00:00<?, ?it/s]

[3/100][31/1623][Time 5.28]
Unified LR across all optimizers: 0.00015959158782013816
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0267	Gen: 0.0969	Rec: 0.0966	E: 0.0271	R: 0.0264	P: 0.1667
--------------------Test Metrics------------------------
accuracy: 0.9219
precision: 0.9041
recall: 0.9565
f1_score: 0.9296

[3/100][81/1623][Time 5.22]
Unified LR across all optimizers: 0.00015922453735369438
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0272	Gen: 0.0953	Rec: 0.0947	E: 0.0278	R: 0.0267	P: 0.1627
--------------------Test Metrics------------------------
accuracy: 0.9219
precision: 0.9470
recall: 0.9226
f1_score: 0.9346

[3/100][131/1623][Time 5.50]
Unified LR across all optimizers: 0.00015885833107989733
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0246	Gen: 0.0916	Rec: 0.0902	E: 0.0260	R: 0.0233	P: 0.1571
--------------------Test Metrics------------------------
accur

In [None]:
# Evaluate the trainer on the test set.
# NOTE(review): presumably this reports final metrics for the trained model —
# confirm against TrainerHub.test.
trainer_hub.test(testset)