Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

# Airline Passenger Satisfaction Classification

https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction/data


<p>
    <img src="https://www.travelandleisure.com/thmb/h97kSvljd2QYH2nUy3Y9ZNgO_pw=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/plane-data-BUSYROUTES1217-f4f84b08d47f4951b11c148cee2c3dea.jpg" width=600>
</p>

## 1. Load Libraries

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

In [2]:
import torch
import pandas as pd

## 2. Load  Dataset

In [3]:
df = pd.read_csv(path_append + '../data/Airline Customer Satisfaction/Airline_customer_satisfaction.csv')
df.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Customer Type                      129880 non-null  object 
 2   Age                                129880 non-null  int64  
 3   Type of Travel                     129880 non-null  object 
 4   Class                              129880 non-null  object 
 5   Flight Distance                    129880 non-null  int64  
 6   Seat comfort                       129880 non-null  int64  
 7   Departure/Arrival time convenient  129880 non-null  int64  
 8   Food and drink                     129880 non-null  int64  
 9   Gate location                      129880 non-null  int64  
 10  Inflight wifi service              129880 non-null  int64  
 11  Inflight entertainment             1298

In [5]:
# Check labels count
df[['satisfaction']].value_counts()

satisfaction
satisfied       71087
dissatisfied    58793
Name: count, dtype: int64

## 3. Preprocessing

In [6]:
# Check for missing values amd drop 
df_subset = df.dropna(axis=0).reset_index(drop = True)

In [7]:
# Label encoding
df_subset['Class'] = df_subset['Class'].map({"Business": 3, "Eco Plus": 2, "Eco": 1}) 
df_subset['satisfaction'] = df_subset['satisfaction'].map({"satisfied": 1,  "dissatisfied": 0})

# One hot encoding
df_subset = pd.get_dummies(df_subset, drop_first = True)

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize the features
sc = StandardScaler()
df_subset.iloc[:, 1:] = sc.fit_transform(df_subset.iloc[:, :-1])

n_features = len(df_subset.iloc[:, :-1].columns)
n_classes = len(df_subset.iloc[:, -1:].columns)

print(n_features, n_classes)

21 1


In [11]:
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129487 entries, 0 to 129486
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129487 non-null  int64  
 1   Age                                129487 non-null  float64
 2   Class                              129487 non-null  float64
 3   Flight Distance                    129487 non-null  float64
 4   Seat comfort                       129487 non-null  float64
 5   Departure/Arrival time convenient  129487 non-null  float64
 6   Food and drink                     129487 non-null  float64
 7   Gate location                      129487 non-null  float64
 8   Inflight wifi service              129487 non-null  float64
 9   Inflight entertainment             129487 non-null  float64
 10  Online support                     129487 non-null  float64
 11  Ease of Online booking             1294

In [12]:
# Custom Dataset Class
class AirlineDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype= torch.float32)
        return vals, label

In [13]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation and test sets
df_train, df_test = train_test_split(df_subset, test_size=0.2, shuffle=True, random_state=33)

df_train, df_val = train_test_split(df_train, test_size=0.25, shuffle=True, random_state=33)  # 0.25 * 0.8 = 0.2

X_train, y_train = df_train.iloc[:, 1:].values, df_train.iloc[:, :1].values
X_val, y_val = df_val.iloc[:, 1:].values, df_val.iloc[:, :1].values
X_test, y_test = df_test.iloc[:, 1:].values, df_test.iloc[:, :1].values

trainset = AirlineDataset(X_train, y_train)
valset = AirlineDataset(X_val, y_val)
testset = AirlineDataset(X_test, y_test)

print(f"Labeled Trainset Shape: {len(trainset)}, {trainset.x.shape[1]}")
print(f"Labeled Valset Shape: {len(valset)}, {valset.x.shape[1]}")
print(f"Labeled Testset Shape: {len(testset)}, {testset.x.shape[1]}")

Labeled Trainset Shape: 77691, 21
Labeled Valset Shape: 25898, 21
Labeled Testset Shape: 25898, 21


In [14]:
# Check the shape of the first element in the trainset
trainset[0][0].shape, trainset[0][1].shape # ( = obs_shape, label_shape)

(torch.Size([21]), torch.Size([1]))

# Training

In [15]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Set the data configuration
data_config = DataConfig(dataset_name = 'airline_satisfaction', task_type='binary_classification', obs_shape=[21], label_size=1)

#  Set the ML parameters
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

ml_params.num_epoch = 1
ml_params.optimization.decay_rate_100k = 0.01

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False, print_interval=30)

In [16]:
trainer_hub.train(trainset, valset)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Iterations:   0%|          | 0/1213 [00:00<?, ?it/s]

[0/1][5/1213][Time 0.90]
Unified LR across all optimizers: 0.00019994474559179828
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.3877	Gen: 0.9916	Rec: 1.0403	E: 0.3390	R: 0.4364	P: 1.6442
--------------------Test Metrics------------------------
accuracy: 0.8945
precision: 0.8640
recall: 0.9153
f1_score: 0.8889

[0/1][10/1213][Time 0.59]
Unified LR across all optimizers: 0.0001998987119127626
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.1997	Gen: 0.5263	Rec: 0.5377	E: 0.1883	R: 0.2111	P: 0.8643
--------------------Test Metrics------------------------
accuracy: 0.8359
precision: 0.7550
recall: 0.9580
f1_score: 0.8444

[0/1][15/1213][Time 0.58]
Unified LR across all optimizers: 0.00019985268883215303
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0900	Gen: 0.4383	Rec: 0.4440	E: 0.0843	R: 0.0957	P: 0.7922
--------------------Test Metrics------------------------
accuracy: 0.8125
precision: 0.7152


In [17]:
trainer_hub.test(testset)

{'accuracy': 0.9969109296798706,
 'precision': 0.9948239287229529,
 'recall': 0.998382014817338,
 'f1_score': 0.9965997959877593}