Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

# Airline Passenger Satisfaction Classification

https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction/data


<p>
    <img src="https://www.travelandleisure.com/thmb/h97kSvljd2QYH2nUy3Y9ZNgO_pw=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/plane-data-BUSYROUTES1217-f4f84b08d47f4951b11c148cee2c3dea.jpg" width=600>
</p>

## 1. Load Libraries

In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

path_append = "../../"
sys.path.append(path_append)  # Go up one directory from where you are.

In [2]:
import torch
import pandas as pd

## 2. Load  Dataset

In [3]:
df_train = pd.read_csv(path_append + '../data/Airline Customer Satisfaction/train.csv')
df_test = pd.read_csv(path_append + '../data/Airline Customer Satisfaction/test.csv')
df_train.head()
df_test.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [5]:
# Check labels count
df_train[['satisfaction']].value_counts()

satisfaction           
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

## 3. Preprocessing

In [6]:
from  tools.preprocessing.data_frame import auto_preprocess_dataframe

target_columns = ['satisfaction']
drop_columns = ['Unnamed: 0','id']

# Assuming df_train and df_test are your initial dataframes
df_train_length = len(df_train)
df_test_length = len(df_test)

# Concatenate the dataframes
df = pd.concat([df_train, df_test], axis=0)

# Process the combined dataframe
df, description_train = auto_preprocess_dataframe(df, target_columns, drop_columns=drop_columns)

# Split the dataframe back into training and test sets
df_train = df.iloc[:df_train_length]
df_test = df.iloc[df_train_length:]

num_features = description_train['num_features']
num_classes = description_train['num_classes']

print(f"Number of features after scaling: {num_features}")
print(f"Number of classes after scaling: {num_classes}")
description_train

Column 'Gender' has 2 unique values.
Column 'Customer Type' has 2 unique values.
Column 'Type of Travel' has 2 unique values.
Column 'Class' has 3 unique values.
Column 'satisfaction' has 2 unique values.


Unnamed: 0,Min,Max,Mean,Std,Null Count
Age,-0.644929,3.311573,1.0,0.766831,0
Flight Distance,-0.611278,3.11203,0.2603088,0.750046,0
Inflight wifi service,-1.0,2.664958,1.0,0.974318,0
Departure/Arrival time convenient,-1.0,2.270807,1.0,0.998765,0
Ease of Online booking,-1.0,2.627412,1.0,1.016881,0
Gate location,-1.0,2.359189,1.0,0.858949,0
Food and drink,-1.0,2.120432,1.0,0.829976,0
Online boarding,-1.0,2.07435,1.0,0.830475,0
Seat comfort,-1.0,1.905635,1.0,0.766604,0
Inflight entertainment,-1.0,1.977904,1.0,0.794594,0


Number of features after scaling: 24
Number of classes after scaling: 2


{'num_features': 24,
 'num_classes': 2,
 'encoded_columns': Index(['cnets_processed_Class', 'cnets_processed_Customer Type',
        'cnets_processed_Gender', 'cnets_processed_Type of Travel',
        'cnets_processed_satisfaction'],
       dtype='object'),
 'scalers': {'Age': 'minmax',
  'Arrival Delay in Minutes': 'robust',
  'Baggage handling': 'standard',
  'Checkin service': 'minmax',
  'Cleanliness': 'minmax',
  'Departure Delay in Minutes': 'robust',
  'Departure/Arrival time convenient': 'minmax',
  'Ease of Online booking': 'minmax',
  'Flight Distance': 'robust',
  'Food and drink': 'minmax',
  'Gate location': 'minmax',
  'Inflight entertainment': 'minmax',
  'Inflight service': 'standard',
  'Inflight wifi service': 'minmax',
  'Leg room service': 'minmax',
  'On-board service': 'minmax',
  'Online boarding': 'minmax',
  'Seat comfort': 'minmax'}}

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Age                                103904 non-null  float64
 1   Flight Distance                    103904 non-null  float64
 2   Inflight wifi service              103904 non-null  float64
 3   Departure/Arrival time convenient  103904 non-null  float64
 4   Ease of Online booking             103904 non-null  float64
 5   Gate location                      103904 non-null  float64
 6   Food and drink                     103904 non-null  float64
 7   Online boarding                    103904 non-null  float64
 8   Seat comfort                       103904 non-null  float64
 9   Inflight entertainment             103904 non-null  float64
 10  On-board service                   103904 non-null  float64
 11  Leg room service                   1039

In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25583 entries, 103904 to 129486
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                25583 non-null  float64
 1   Flight Distance                    25583 non-null  float64
 2   Inflight wifi service              25583 non-null  float64
 3   Departure/Arrival time convenient  25583 non-null  float64
 4   Ease of Online booking             25583 non-null  float64
 5   Gate location                      25583 non-null  float64
 6   Food and drink                     25583 non-null  float64
 7   Online boarding                    25583 non-null  float64
 8   Seat comfort                       25583 non-null  float64
 9   Inflight entertainment             25583 non-null  float64
 10  On-board service                   25583 non-null  float64
 11  Leg room service                   25583 non-nul

In [9]:
# Custom Dataset Class
class AirlineDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype= torch.long)
        return vals, label

In [10]:

# Split the data into train, validation and test sets
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1:].values
X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:,-1:].values

trainset = AirlineDataset(X_train, y_train)
testset = AirlineDataset(X_test, y_test)

print(f"Labeled Trainset Shape: {len(trainset)}, {trainset.x.shape[1]}")
print(f"Labeled Testset Shape: {len(testset)}, {testset.x.shape[1]}")

Labeled Trainset Shape: 103904, 24
Labeled Testset Shape: 25583, 24


In [11]:
num_features =  trainset.x.shape[1]
num_classes =  trainset.y.shape[1]

num_features, num_classes 

(24, 1)

# Training

In [12]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_config import MLConfig
from causal_learning import CausalLearning

# Set the data configuration
data_config = DataConfig(dataset_name = 'airline_satisfaction', task_type='binary_classification', obs_shape=[num_features], label_size=num_classes)

#  Set the ML parameters
ml_config = MLConfig(model_name = 'tabnet')
ml_config.training.error_function = 'mae'
ml_config.model.num_layers = 3

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the CausalLearning class with the training configuration, data configuration, device, and use_print and use_wandb flags
causal_learning = CausalLearning(ml_config, data_config, device, use_print=True, use_wandb=False)

In [13]:
causal_learning.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/1623 [00:00<?, ?it/s]

[0/100][100/1623][Time 10.35]
Unified LR across all optimizers: 0.0001993957766378747
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.2229	Gen: 0.5466	Rec: 0.5480	E: 0.2215	R: 0.2242	P: 0.8718
--------------------Test Metrics------------------------
accuracy: 0.8906
precision: 0.9007
recall: 0.9128
f1_score: 0.9067

[0/100][200/1623][Time 9.67]
Unified LR across all optimizers: 0.00019879933411171295
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.1360	Gen: 0.4248	Rec: 0.4246	E: 0.1362	R: 0.1359	P: 0.7134
--------------------Test Metrics------------------------
accuracy: 0.8711
precision: 0.8828
recall: 0.8889
f1_score: 0.8858

[0/100][300/1623][Time 9.49]
Unified LR across all optimizers: 0.00019820467569398644
--------------------Training Metrics--------------------
CCNet:  Three Tabnet
Inf: 0.0977	Gen: 0.4026	Rec: 0.4019	E: 0.0983	R: 0.0971	P: 0.7068
--------------------Test Metrics------------------------
acc

In [None]:
causal_learning.test(testset)