# Airline Passenger Satisfaction Analysis

### <a href=https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction>Dataset</a>

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time




In [2]:
def handle_outliers(df, numeric_columns):
    def find_outliers_IQR(df, column):
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
        return outliers

    outliers_indices = set()
    for column in numeric_columns:
        outliers_indices.update(find_outliers_IQR(df, column))
    
    df_no_outliers = df.drop(index=outliers_indices)
    return df_no_outliers

In [3]:
def initialize_encoders(df, categorical_columns):
    label_encoders = {}
    for column in categorical_columns:
        le = LabelEncoder()
        le.fit(df[column].astype(str))
        label_encoders[column] = le
    return label_encoders

def apply_encoders(df, categorical_columns, label_encoders):
    for column in categorical_columns:
        le = label_encoders[column]
        df[column] = le.transform(df[column].astype(str))
    return df

def initialize_scaler(df, numeric_columns):
    scaler = RobustScaler()
    scaler.fit(df[numeric_columns])
    return scaler

def apply_scaler(df, numeric_columns, scaler):
    df[numeric_columns] = scaler.transform(df[numeric_columns])
    return df

In [4]:
def impute_missing_values(df, categorical_columns, numeric_columns):
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])
    
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])
    
    return df

In [5]:
def preprocess_data(df, label_encoders=None, scaler=None, is_train=True):
    df = df.drop(["Unnamed: 0", "id"], axis=1, errors='ignore')

    threshold = 10
    categorical_columns = [col for col in df.columns if df[col].nunique() < threshold and col != "satisfaction"]
    numeric_columns = [col for col in df.columns if col not in categorical_columns + ["satisfaction"]]

    # Handle outliers only if it is training data
    if is_train:
        df = handle_outliers(df, numeric_columns)

    # Encode categorical variables
    if is_train:
        label_encoders = initialize_encoders(df, categorical_columns)
    df = apply_encoders(df, categorical_columns, label_encoders)

    # Impute missing values
    df = impute_missing_values(df, categorical_columns, numeric_columns)

    # Scale numerical variables
    if is_train:
        scaler = initialize_scaler(df, numeric_columns)
    df = apply_scaler(df, numeric_columns, scaler)

    # Encode the target variable to 0 and 1
    target = None
    if 'satisfaction' in df.columns:
        target = df['satisfaction'].apply(lambda x: 1 if x == 'satisfied' else 0)
        df = df.drop(['satisfaction'], axis=1)

    return df, target, label_encoders, scaler

In [6]:
def evaluate_model(model_name, model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    cm = confusion_matrix(y, y_pred)
    
    print("Model:", model_name)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)

In [7]:
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')

In [8]:
X_train, y_train, label_encoders, scaler = preprocess_data(df_train, is_train=True)

In [9]:
X_test, y_test, _, _ = preprocess_data(df_test, label_encoders=label_encoders, scaler=scaler, is_train=False)

In [10]:
models = {
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500),
    "AdaBoost": AdaBoostClassifier(n_estimators=50),
    "LightGBM": lgb.LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}


In [11]:
for model_name, model in models.items():
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    evaluate_model(model_name, model, X_test, y_test)
    print(f"Training time for {model_name}: {training_time:.4f} seconds\n")

Model: Gaussian Naive Bayes
Accuracy: 0.8357329842931938
Precision: 0.8389220378915871
Recall: 0.8357329842931938
F1 Score: 0.8336894539937724
Confusion Matrix:
 [[13276  1297]
 [ 2970  8433]]
Training time for Gaussian Naive Bayes: 0.0587 seconds

Model: Logistic Regression
Accuracy: 0.8561364336310441
Precision: 0.8561692310140316
Recall: 0.8561364336310441
F1 Score: 0.8555825490151608
Confusion Matrix:
 [[13036  1537]
 [ 2200  9203]]
Training time for Logistic Regression: 0.4039 seconds

Model: K-Nearest Neighbors
Accuracy: 0.9136510625192485
Precision: 0.9147830962723809
Recall: 0.9136510625192485
F1 Score: 0.9132011216880185
Confusion Matrix:
 [[13878   695]
 [ 1548  9855]]
Training time for K-Nearest Neighbors: 0.0157 seconds

Model: Decision Tree
Accuracy: 0.941137973514013
Precision: 0.9411444691086673
Recall: 0.941137973514013
F1 Score: 0.9411410503472337
Confusion Matrix:
 [[13803   770]
 [  759 10644]]
Training time for Decision Tree: 0.4793 seconds

Model: Random Forest
Acc



Model: AdaBoost
Accuracy: 0.9266245765321836
Precision: 0.9265789924049591
Recall: 0.9266245765321836
F1 Score: 0.9265633467272038
Confusion Matrix:
 [[13703   870]
 [ 1036 10367]]
Training time for AdaBoost: 2.6899 seconds

[LightGBM] [Info] Number of positive: 37582, number of negative: 47787
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 485
[LightGBM] [Info] Number of data points in the train set: 85369, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.440230 -> initscore=-0.240228
[LightGBM] [Info] Start training from score -0.240228
Model: LightGBM
Accuracy: 0.9633122882660918
Precision: 0.96363453287548
Recall: 0.9633122882660918
F1 Score: 0.9632316012284768
Confusion Matrix:
 [[14300   273]
 [  680 10723]]
Training time for LightGBM: 0.3042 seco