Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

# Airline Passenger Satisfaction Classification

https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction/data


<p>
    <img src="https://www.travelandleisure.com/thmb/h97kSvljd2QYH2nUy3Y9ZNgO_pw=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/plane-data-BUSYROUTES1217-f4f84b08d47f4951b11c148cee2c3dea.jpg" width=600>
</p>

## 1. Load Libraries

In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

In [2]:
import torch
import pandas as pd

## 2. Load  Dataset

In [3]:
df_train = pd.read_csv(path_append + '../data/Airline Customer Satisfaction/train.csv')
df_test = pd.read_csv(path_append + '../data/Airline Customer Satisfaction/test.csv')
df_train.head()
df_test.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [5]:
# Check labels count
df_train[['satisfaction']].value_counts()

satisfaction           
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

## 3. Preprocessing

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def process_dataframe(df):
    # Drop columns that contain "Unnamed" in their names and the "id" column
    df_cleaned = df.drop(columns=[col for col in df.columns if 'Unnamed' in col or col.lower() == 'id'])
    
    # Check for missing values and drop them
    df_cleaned = df_cleaned.dropna(axis=0).reset_index(drop=True)
    
    # Identify string-type columns
    str_columns = df_cleaned.select_dtypes(include=['object']).columns

    # Lists to hold names of columns that will be converted
    binary_columns = []
    one_hot_columns = []

    # Process string-type columns based on the number of unique values
    for col in str_columns:
        unique_values = df_cleaned[col].nunique()
        print(f"Column '{col}' has {unique_values} unique values.")
        if unique_values == 1:
            # Drop columns with only 1 unique value
            df_cleaned = df_cleaned.drop(columns=[col])
        elif unique_values == 2:
            # Mark columns with exactly 2 unique values for binary conversion
            binary_columns.append(col)
        else:
            # Mark columns with more than 2 unique values for one-hot encoding
            one_hot_columns.append(col)
    
    # Binary encoding for columns with exactly 2 unique values
    for col in binary_columns:
        df_cleaned[col] = pd.get_dummies(df_cleaned[col], drop_first=True).astype(float)
    
    # One-hot encoding for columns with more than 2 unique values
    df_encoded = pd.get_dummies(df_cleaned, columns=one_hot_columns, drop_first=False).astype(float)
    
    # Move satisfaction column to the end
    satisfaction = df_encoded.pop('satisfaction')
    df_encoded['satisfaction'] = satisfaction

    # Identify the new column names created by one-hot encoding
    new_one_hot_columns = [col for original_col in one_hot_columns for col in df_encoded.columns if original_col + '_' in col]

    # Standardize the features except the converted columns and the 'satisfaction' column
    non_converted_columns = [col for col in df_encoded.columns if col not in binary_columns + new_one_hot_columns + ['satisfaction']]
    sc = StandardScaler()
    df_encoded[non_converted_columns] = sc.fit_transform(df_encoded[non_converted_columns])
    
    return df_encoded

def get_feature_and_class_counts(df):
    n_features = df.iloc[:, :-1].shape[1]
    n_classes = df.iloc[:, -1].nunique()
    return n_features, n_classes


In [7]:
# Load and process the data
df_train_processed = process_dataframe(df_train)
df_test_processed = process_dataframe(df_test)

# Get feature and class counts
n_features, n_classes = get_feature_and_class_counts(df_train_processed)
print(n_features, n_classes)

Column 'Gender' has 2 unique values.
Column 'Customer Type' has 2 unique values.
Column 'Type of Travel' has 2 unique values.
Column 'Class' has 3 unique values.
Column 'satisfaction' has 2 unique values.
Column 'Gender' has 2 unique values.
Column 'Customer Type' has 2 unique values.
Column 'Type of Travel' has 2 unique values.
Column 'Class' has 3 unique values.
Column 'satisfaction' has 2 unique values.
24 2


In [8]:
df_train_processed.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,...,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Class_Business,Class_Eco,Class_Eco Plus,satisfaction
0,1.0,0.0,-1.745542,1.0,-0.731305,0.203521,0.616249,0.173716,-1.547312,1.352401,...,0.311853,0.549773,1.156211,1.305913,0.268966,0.072905,0.0,0.0,1.0,0.0
1,1.0,1.0,-0.951526,0.0,-0.956916,0.203521,-0.695032,0.173716,0.017981,-1.656487,...,-0.534854,-1.821038,0.30558,-1.742432,-0.360682,-0.237184,1.0,0.0,0.0,0.0
2,0.0,0.0,-0.885358,0.0,-0.047454,-0.549571,-0.695032,-0.541118,-0.764666,1.352401,...,0.311853,0.549773,0.30558,1.305913,-0.386917,-0.392229,1.0,0.0,0.0,1.0
3,0.0,0.0,-0.951526,0.0,-0.629028,-0.549571,1.27189,1.603383,1.583273,-0.904265,...,-0.534854,-1.821038,0.30558,-0.980345,-0.098328,-0.159662,1.0,0.0,0.0,0.0
4,1.0,0.0,1.430521,0.0,-0.977973,0.203521,-0.039391,0.173716,0.017981,0.600179,...,0.311853,-0.240497,-0.545051,-0.218259,-0.386917,-0.392229,1.0,0.0,0.0,1.0


In [9]:
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103594 entries, 0 to 103593
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103594 non-null  float64
 1   Customer Type                      103594 non-null  float64
 2   Age                                103594 non-null  float64
 3   Type of Travel                     103594 non-null  float64
 4   Flight Distance                    103594 non-null  float64
 5   Inflight wifi service              103594 non-null  float64
 6   Departure/Arrival time convenient  103594 non-null  float64
 7   Ease of Online booking             103594 non-null  float64
 8   Gate location                      103594 non-null  float64
 9   Food and drink                     103594 non-null  float64
 10  Online boarding                    103594 non-null  float64
 11  Seat comfort                       1035

In [10]:
# Custom Dataset Class
class AirlineDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        vals = torch.tensor(self.x[index], dtype = torch.float32)
        label = torch.tensor(self.y[index], dtype= torch.long)
        return vals, label

In [11]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation and test sets
df_train, df_eval = train_test_split(df_train_processed, test_size=0.5, shuffle=True, random_state=33)  # 0.25 * 0.8 = 0.2
df_test = df_test_processed
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1:].values
X_val, y_val = df_eval.iloc[:, :-1].values, df_eval.iloc[:, -1:].values
X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:,-1:].values

trainset = AirlineDataset(X_train, y_train)
valset = AirlineDataset(X_val, y_val)
testset = AirlineDataset(X_test, y_test)

print(f"Labeled Trainset Shape: {len(trainset)}, {trainset.x.shape[1]}")
print(f"Labeled Valset Shape: {len(valset)}, {valset.x.shape[1]}")
print(f"Labeled Testset Shape: {len(testset)}, {testset.x.shape[1]}")

Labeled Trainset Shape: 51797, 24
Labeled Valset Shape: 51797, 24
Labeled Testset Shape: 25893, 24


In [12]:
num_features =  trainset.x.shape[1]
num_classes =  trainset.y.shape[1]

num_features, num_classes 

(24, 1)

# Training

In [13]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Set the data configuration
data_config = DataConfig(dataset_name = 'airline_satisfaction', task_type='binary_classification', obs_shape=[num_features], label_size=num_classes)

#  Set the ML parameters
ml_params = MLParameters(ccnet_network = 'tabnet', encoder_network = 'none')
ml_params.training.num_epoch = 10

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False)

In [14]:
trainer_hub.train(trainset, valset)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Iterations:   0%|          | 0/809 [00:00<?, ?it/s]

[0/10][50/809][Time 9.93]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.3165	Gen: 0.7336	Rec: 0.7328	E: 0.2272	R: 0.2201	P: 1.5900
--------------------Test Metrics------------------------
accuracy: 0.8555
precision: 0.8741
recall: 0.8681
f1_score: 0.8711

[0/10][100/809][Time 9.00]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.1734	Gen: 0.4208	Rec: 0.4244	E: 0.0562	R: 0.0636	P: 0.5041
--------------------Test Metrics------------------------
accuracy: 0.8594
precision: 0.8707
recall: 0.8828
f1_score: 0.8767

[0/10][150/809][Time 9.19]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.1468	Gen: 0.3496	Rec: 0.3491	E: 0.0408	R: 0.0437	P: 0.3370
--------

Iterations:   0%|          | 0/809 [00:00<?, ?it/s]

[1/10][41/809][Time 9.14]
Unified LR across all optimizers: 0.0001923135991042739
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0728	Gen: 0.1908	Rec: 0.1910	E: 0.0090	R: 0.0099	P: 0.1077
--------------------Test Metrics------------------------
accuracy: 0.8789
precision: 0.8851
recall: 0.9034
f1_score: 0.8942

[1/10][91/809][Time 8.75]
Unified LR across all optimizers: 0.0001918712901002789
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0758	Gen: 0.1857	Rec: 0.1844	E: 0.0100	R: 0.0100	P: 0.0966
--------------------Test Metrics------------------------
accuracy: 0.9023
precision: 0.8816
recall: 0.9504
f1_score: 0.9147

[1/10][141/809][Time 8.38]
Unified LR across all optimizers: 0.00019142999837876384
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0723	Gen: 0.1848	Rec: 0.1834	E: 0.0096	R: 0.0090	P: 0.0976
----------

Iterations:   0%|          | 0/809 [00:00<?, ?it/s]

[2/10][32/809][Time 8.78]
Unified LR across all optimizers: 0.00018535742845428288
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0633	Gen: 0.1651	Rec: 0.1651	E: 0.0073	R: 0.0082	P: 0.0814
--------------------Test Metrics------------------------
accuracy: 0.8828
precision: 0.8940
recall: 0.9060
f1_score: 0.9000

[2/10][82/809][Time 8.91]
Unified LR across all optimizers: 0.00018493111819882223
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0658	Gen: 0.1610	Rec: 0.1588	E: 0.0074	R: 0.0071	P: 0.0717
--------------------Test Metrics------------------------
accuracy: 0.9219
precision: 0.9539
recall: 0.9177
f1_score: 0.9355

[2/10][132/809][Time 8.78]
Unified LR across all optimizers: 0.00018450578842974107
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0610	Gen: 0.1577	Rec: 0.1565	E: 0.0063	R: 0.0064	P: 0.0728
--------

Iterations:   0%|          | 0/809 [00:00<?, ?it/s]

[3/10][23/809][Time 9.12]
Unified LR across all optimizers: 0.00017865286928854052
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0556	Gen: 0.1484	Rec: 0.1466	E: 0.0055	R: 0.0050	P: 0.0641
--------------------Test Metrics------------------------
accuracy: 0.8945
precision: 0.9091
recall: 0.9028
f1_score: 0.9059

[3/10][73/809][Time 8.94]
Unified LR across all optimizers: 0.00017824197909125899
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0530	Gen: 0.1437	Rec: 0.1428	E: 0.0047	R: 0.0045	P: 0.0609
--------------------Test Metrics------------------------
accuracy: 0.8867
precision: 0.8631
recall: 0.9603
f1_score: 0.9091

[3/10][123/809][Time 9.00]
Unified LR across all optimizers: 0.00017783203391520723
--------------------Training Metrics--------------------
Cooperative Network(core):  Three Tabnet
Inf: 0.0544	Gen: 0.1493	Rec: 0.1473	E: 0.0055	R: 0.0050	P: 0.0676
--------

In [None]:
trainer_hub.test(testset)