In [1]:

import os
import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including deprecation notices raised
# by the libraries imported below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib


In [7]:
# ---------- Utility: try to load dataset from common filenames ----------
# Candidate CSV filenames that load_dataset() probes, in order, when it is
# called without an explicit path.
POSSIBLE_FILENAMES = ['dummy_transport_satisfaction.csv']
print(POSSIBLE_FILENAMES)

['dummy_transport_satisfaction.csv']


In [8]:


def load_dataset(path=None):
    """Read the dataset CSV into a DataFrame.

    Args:
        path: Optional location of the CSV. When it is truthy the file is read
            directly; otherwise each entry of ``POSSIBLE_FILENAMES`` is probed
            in order and the first one that exists on disk is read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If no path is given and none of the candidate
            filenames exists.
    """
    if not path:
        # Lazy lookup: stop probing the filesystem at the first existing candidate.
        candidates = (name for name in POSSIBLE_FILENAMES if os.path.exists(name))
        found = next(candidates, None)
        if found is None:
            raise FileNotFoundError(
                'No dataset found. Please place CSV in working directory or pass path to load_dataset(path)'
            )
        print(f"Loaded dataset file: {found}")
        return pd.read_csv(found)

    frame = pd.read_csv(path)
    print(f"Loaded dataset from provided path: {path}")
    return frame



In [12]:
# Load the CSV through the fallback filename list (no explicit path given)
# and echo the resulting frame as plain text.
df=load_dataset()
print(df)

Loaded dataset file: dummy_transport_satisfaction.csv
        id  Gender  Age Travel category Travel Class  Distance Travelled  \
0        1    Male   27        Personal      Premium                1693   
1        2    Male   35        Business      Premium                1229   
2        3  Female   71        Personal      Premium                 573   
3        4    Male   19        Business     Business                1640   
4        5    Male   39        Business     Business                 358   
...    ...     ...  ...             ...          ...                 ...   
4995  4996    Male   62        Business      Economy                1284   
4996  4997    Male   42        Business      Economy                 729   
4997  4998  Female   61        Business     Business                1158   
4998  4999  Female   67        Business      Premium                 677   
4999  5000    Male   62        Business     Business                1478   

      Departure/Arrival Rating  B

In [13]:
# ---------- Basic inspection & descriptive stats ----------

def basic_inspection(df):
    """Print a quick structural and statistical overview of ``df``.

    Sections are written to stdout in this order: ``df.info()``, the first
    rows, summary statistics from ``df.describe()``, and summary statistics
    restricted to object/category columns.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        None.
    """
    print('\n=== Basic Info ===')
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print('\n=== Head ===')
    print(df.head())
    print('\n=== Describe (numerical) ===')
    print(df.describe().T)
    print('\n=== Describe (categorical) ===')
    # describe(include=...) raises ValueError when no column matches, so check
    # first and report the absence instead of crashing on all-numeric frames.
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) == 0:
        print('No categorical columns found')
    else:
        print(df.describe(include=['object', 'category']).T)


In [14]:
# Print the info/head/describe overview for the loaded frame.
basic_inspection(df)


=== Basic Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        5000 non-null   int64 
 1   Gender                    5000 non-null   object
 2   Age                       5000 non-null   int64 
 3   Travel category           5000 non-null   object
 4   Travel Class              5000 non-null   object
 5   Distance Travelled        5000 non-null   int64 
 6   Departure/Arrival Rating  5000 non-null   int64 
 7   Booking Ease              5000 non-null   int64 
 8   Boarding Point            5000 non-null   object
 9   Food                      5000 non-null   int64 
 10  Seat Comfort              5000 non-null   int64 
 11  Entertainment             5000 non-null   int64 
 12  Leg Room                  5000 non-null   int64 
 13  Luggage Handling          5000 non-null   int64 
 14  Clea

In [15]:

# ---------- Descriptive questions implemented as functions ----------

def gender_distribution(df):
    """Print how passengers split by gender and, if possible, by satisfaction.

    Counts and proportions of the ``Gender`` column are printed with missing
    values included. When a ``Satisfaction`` column is present, a
    row-normalised Gender x Satisfaction crosstab follows.

    Args:
        df: DataFrame to summarise; it is not modified.

    Returns:
        None. A notice is printed instead when ``Gender`` is absent.
    """
    if 'Gender' in df.columns:
        gender = df['Gender']
        absolute = gender.value_counts(dropna=False)
        relative = gender.value_counts(normalize=True, dropna=False)
        print('\nGender counts:\n', absolute)
        print('\nGender proportions:\n', relative)
        if 'Satisfaction' in df.columns:
            # Each row sums to 1: the satisfaction share within one gender.
            share_by_gender = pd.crosstab(gender, df['Satisfaction'], normalize='index')
            print('\nCrosstab: Gender vs Satisfaction')
            print(share_by_gender)
    else:
        print('No Gender column found')



In [16]:
# Print gender counts, proportions and the gender-by-satisfaction crosstab.
gender_distribution(df)


Gender counts:
 Gender
Female    2530
Male      2470
Name: count, dtype: int64

Gender proportions:
 Gender
Female    0.506
Male      0.494
Name: proportion, dtype: float64

Crosstab: Gender vs Satisfaction
Satisfaction  dissatisfied  satisfied
Gender                               
Female            0.430830   0.569170
Male              0.412551   0.587449


In [17]:
def age_analysis(df):
    """Print basic age statistics and bucket passengers into age brackets.

    Side effect: an ``age_bracket`` categorical column is written onto the
    DataFrame that was passed in (the caller's frame is modified).

    Args:
        df: DataFrame expected to contain an ``Age`` column.

    Returns:
        None. A notice is printed instead when ``Age`` is absent.
    """
    if 'Age' in df.columns:
        ages = df['Age']
        print('\nAverage age:', ages.mean())
        print('Min age, Max age:', ages.min(), ages.max())
        # Right-closed intervals: (0, 30], (30, 45], (45, 60], (60, 200].
        bracket_edges = [0, 30, 45, 60, 200]
        bracket_names = ['18-30','31-45','46-60','61+']
        df['age_bracket'] = pd.cut(ages, bins=bracket_edges, labels=bracket_names, right=True)
        print('\nAge bracket counts:')
        print(df['age_bracket'].value_counts())
    else:
        print('No Age column found')


In [18]:
# Print age statistics; this call also adds an `age_bracket` column to `df`.
age_analysis(df)


Average age: 46.467
Min age, Max age: 18 75

Age bracket counts:
age_bracket
46-60    1318
61+      1290
31-45    1241
18-30    1151
Name: count, dtype: int64


In [19]:

def travel_category_analysis(df):
    """Print travel-category counts and the mean distance per category.

    Both sections need a ``Travel category`` column; the distance section
    additionally needs ``Distance Travelled``. Missing columns simply skip the
    corresponding section.

    Args:
        df: DataFrame to summarise; it is not modified.

    Returns:
        None.
    """
    columns = df.columns
    has_category = 'Travel category' in columns
    if has_category:
        category_counts = df['Travel category'].value_counts()
        print('\nTravel category counts:')
        print(category_counts)
    if has_category and 'Distance Travelled' in columns:
        mean_distance = df.groupby('Travel category')['Distance Travelled'].mean()
        print('\nAverage distance per travel category:')
        print(mean_distance)


In [20]:
# Print travel-category counts and the average distance per category.
travel_category_analysis(df)


Travel category counts:
Travel category
Business    2508
Personal    2492
Name: count, dtype: int64

Average distance per travel category:
Travel category
Business    1009.439793
Personal    1018.864366
Name: Distance Travelled, dtype: float64


In [21]:
def travel_class_ratings(df):
    """Print the mean seat-comfort and food ratings for each travel class.

    Column names are matched case-insensitively with surrounding whitespace
    ignored, because exported datasets often carry trailing spaces (for
    example ``'Food '``). The first matching column is used in each case.
    Nothing is printed when the frame has no travel-class column, and each
    rating section is skipped when its own rating column is missing.

    Args:
        df: DataFrame to summarise; it is not modified.

    Returns:
        None. The summaries are written to stdout.
    """
    def _first_column(predicate):
        # First column whose stripped, lower-cased name satisfies `predicate`.
        return next((c for c in df.columns if predicate(str(c).strip().lower())), None)

    # Resolve the grouping column once, up front, so that both sections can use
    # it independently of whether the other section's rating column exists.
    class_col = _first_column(lambda name: 'travel class' in name)
    if class_col is None:
        return

    seat_col = _first_column(lambda name: name == 'seat comfort')
    if seat_col is not None:
        print('\nAverage seat comfort per travel class:')
        print(df.groupby(class_col)[seat_col].mean())

    food_col = _first_column(lambda name: name == 'food')
    if food_col is not None:
        print('\nAverage food rating per travel class:')
        print(df.groupby(class_col)[food_col].mean())


In [22]:
# Print average seat-comfort and food ratings per travel class.
travel_class_ratings(df)


Average seat comfort per travel class:
Travel Class
Business    2.988533
Economy     3.002309
Premium     2.980137
Name: Seat Comfort, dtype: float64

Average food rating per travel class:
Travel Class
Business    3.024744
Economy     3.017321
Premium     3.008690
Name: Food, dtype: float64


In [23]:
def delay_analysis(df):
    """Print average delays overall and broken down by satisfaction level.

    The overall averages cover a fixed list of known delay column names. The
    per-satisfaction table covers every column whose name contains ``delay``
    (case-insensitive) and is printed only when a ``Satisfaction`` column and
    at least one such delay column exist.

    Args:
        df: DataFrame to summarise; it is not modified.

    Returns:
        None.
    """
    known_delay_cols = (
        'Departure Delay (min)', 'Arrival Delay (min)', 'Departure Delay', 'Arrival Delay'
    )
    for name in known_delay_cols:
        if name not in df.columns:
            continue
        # Missing values are excluded before averaging.
        average = df[name].dropna().mean()
        print(f"\nAverage of {name}: {average:.2f}")

    if 'Satisfaction' not in df.columns:
        return
    delay_like = [c for c in df.columns if 'delay' in c.lower()]
    if not delay_like:
        return
    print('\nMean delays by satisfaction level:')
    print(df.groupby('Satisfaction')[delay_like].mean())


In [24]:
# Print average delays overall and per satisfaction level.
delay_analysis(df)


Average of Departure Delay (min): 14.88

Average of Arrival Delay (min): 20.07

Mean delays by satisfaction level:
              Departure Delay (min)  Arrival Delay (min)
Satisfaction                                            
dissatisfied              15.324798            19.824561
satisfied                 14.551020            20.252508


In [25]:
# ---------- Data cleaning helpers ----------

def handle_missing(df, strategy='auto'):
    """Return a copy of ``df`` with missing values imputed or sparse columns dropped.

    Rules, applied per column that has at least one missing value:

    * numeric columns (dtype kind in ``'biufc'``): filled with the column
      median when less than 50% is missing, otherwise the column is dropped;
    * all other columns: filled with the label ``'Unknown'`` when less than
      70% is missing, otherwise the column is dropped.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: Currently unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the rules above applied. A message is printed for
        every dropped column.
    """
    df = df.copy()
    missing_summary = df.isna().mean()
    for col, frac in missing_summary.items():
        if frac == 0:
            continue
        is_numeric = df[col].dtype.kind in 'biufc'
        drop_threshold = 0.5 if is_numeric else 0.7
        if frac >= drop_threshold:
            df = df.drop(columns=[col])
            kind_label = 'numeric' if is_numeric else 'categorical'
            print(f'Dropped {kind_label} column {col} (missing {frac:.2%})')
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `df[col]`: the in-place form acts on an intermediate object and
        # does not update the frame under pandas Copy-on-Write.
        if is_numeric:
            df[col] = df[col].fillna(df[col].median())
        elif isinstance(df[col].dtype, pd.CategoricalDtype):
            # A categorical column rejects fill values that are not declared
            # categories, so register 'Unknown' before filling.
            column = df[col]
            if 'Unknown' not in column.cat.categories:
                column = column.cat.add_categories(['Unknown'])
            df[col] = column.fillna('Unknown')
        else:
            df[col] = df[col].fillna('Unknown')
    return df


In [26]:
# NOTE(review): handle_missing() works on a copy and returns it; the result is
# only displayed here and never assigned, so `df` itself is left unchanged.
handle_missing(df)

Unnamed: 0,id,Gender,Age,Travel category,Travel Class,Distance Travelled,Departure/Arrival Rating,Booking Ease,Boarding Point,Food,Seat Comfort,Entertainment,Leg Room,Luggage Handling,Cleanliness,Departure Delay (min),Arrival Delay (min),Satisfaction,age_bracket
0,1,Male,27,Personal,Premium,1693,3,3,New Samantha,4,4,2,4,4,3,19,13,satisfied,18-30
1,2,Male,35,Business,Premium,1229,5,5,Nicoleville,3,3,1,4,1,5,13,13,satisfied,31-45
2,3,Female,71,Personal,Premium,573,3,3,Wilkersonland,3,3,5,2,3,4,21,0,satisfied,61+
3,4,Male,19,Business,Business,1640,2,3,Brianville,5,4,4,5,1,5,30,15,satisfied,18-30
4,5,Male,39,Business,Business,358,2,5,South Donna,2,5,2,4,3,3,12,30,dissatisfied,31-45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Male,62,Business,Economy,1284,2,4,North Markside,1,4,3,5,2,5,14,39,satisfied,61+
4996,4997,Male,42,Business,Economy,729,3,5,Lake Barbaraland,3,5,5,3,4,5,22,0,satisfied,31-45
4997,4998,Female,61,Business,Business,1158,2,2,West Theresaborough,3,5,1,5,3,2,46,9,satisfied,61+
4998,4999,Female,67,Business,Premium,677,1,1,West Christopher,2,5,2,3,4,3,23,27,dissatisfied,61+


In [27]:
def handle_outliers(df, numeric_cols=None, iqr_multiplier=1.5, max_drop_frac=0.05):
    """Drop rows holding IQR outliers, one numeric column at a time.

    For each column the fences ``Q1 - k*IQR`` and ``Q3 + k*IQR`` are computed
    from the non-missing values of the current (possibly already filtered)
    frame. Rows outside the fences are removed only when they make up more
    than 0 and less than ``max_drop_frac`` of the rows; otherwise the column
    is left as is. Missing values are never treated as outliers.

    Args:
        df: Input DataFrame; it is not modified.
        numeric_cols: Columns to check. Defaults to every numeric column.
            A column named ``'id'`` is always skipped.
        iqr_multiplier: Fence width ``k`` in units of the IQR.
        max_drop_frac: Exclusive upper bound on the fraction of rows that may
            be dropped for a single column.

    Returns:
        ``(filtered_df, dropped)`` where ``dropped`` is a list of
        ``(column, fraction_removed)`` tuples for the columns that caused rows
        to be removed.
    """
    df = df.copy()
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    dropped = []
    for col in numeric_cols:
        if col == 'id':
            continue
        observed = df[col].dropna()
        if observed.empty:
            continue
        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr_multiplier * iqr
        upper = q3 + iqr_multiplier * iqr
        # between() is False for NaN, so without the notna() mask every row
        # with a missing value would be counted (and dropped) as an outlier.
        is_out = df[col].notna() & ~df[col].between(lower, upper)
        frac = is_out.mean()
        if 0 < frac < max_drop_frac:
            df = df[~is_out]
            dropped.append((col, frac))
            print(f'Dropped {frac:.2%} rows for outliers in {col}')
        else:
            print(f'Left outliers in {col} (fraction={frac:.2%})')
    return df, dropped


In [28]:
# NOTE(review): handle_outliers() returns a filtered copy plus a drop log; the
# tuple is only displayed here and never assigned, so `df` keeps all its rows.
handle_outliers(df)

Left outliers in Age (fraction=0.00%)
Left outliers in Distance Travelled (fraction=0.00%)
Left outliers in Departure/Arrival Rating (fraction=0.00%)
Left outliers in Booking Ease (fraction=0.00%)
Left outliers in Food (fraction=0.00%)
Left outliers in Seat Comfort (fraction=0.00%)
Left outliers in Entertainment (fraction=0.00%)
Left outliers in Leg Room (fraction=0.00%)
Left outliers in Luggage Handling (fraction=0.00%)
Left outliers in Cleanliness (fraction=0.00%)
Dropped 0.44% rows for outliers in Departure Delay (min)
Dropped 0.28% rows for outliers in Arrival Delay (min)


(        id  Gender  Age Travel category Travel Class  Distance Travelled  \
 0        1    Male   27        Personal      Premium                1693   
 1        2    Male   35        Business      Premium                1229   
 2        3  Female   71        Personal      Premium                 573   
 3        4    Male   19        Business     Business                1640   
 4        5    Male   39        Business     Business                 358   
 ...    ...     ...  ...             ...          ...                 ...   
 4994  4995  Female   58        Business     Business                 524   
 4995  4996    Male   62        Business      Economy                1284   
 4996  4997    Male   42        Business      Economy                 729   
 4998  4999  Female   67        Business      Premium                 677   
 4999  5000    Male   62        Business     Business                1478   
 
       Departure/Arrival Rating  Booking Ease     Boarding Point  Food  \


In [29]:
# ---------- Feature engineering ----------

def feature_engineering(df):
    """Return a copy of ``df`` with derived modelling columns added.

    Added columns (each only when its source column exists):

    * ``total_delay``: departure delay plus arrival delay, with missing values
      counted as 0; falls back to whichever of the two columns is present.
    * ``target``: 1 when the ``Satisfaction`` text, stripped and lower-cased,
      starts with ``'s'``, else 0.
    * ``travel_class_simple``: lower-cased text of the first column whose name
      contains ``travel class``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame including the derived columns.
    """
    df = df.copy()

    # Locate the delay columns by name; when several match, the last one wins.
    departure_col = None
    arrival_col = None
    for name in df.columns:
        lowered = name.lower()
        if 'delay' not in lowered:
            continue
        if 'departure' in lowered:
            departure_col = name
        if 'arrival' in lowered:
            arrival_col = name

    delay_parts = [df[name].fillna(0) for name in (departure_col, arrival_col) if name]
    if delay_parts:
        total = delay_parts[0]
        for part in delay_parts[1:]:
            total = total + part
        df['total_delay'] = total

    def _is_satisfied(value):
        # 'satisfied' -> 1; everything else, including 'dissatisfied', -> 0.
        return 1 if str(value).strip().lower().startswith('s') else 0

    if 'Satisfaction' in df.columns:
        df['target'] = df['Satisfaction'].apply(_is_satisfied)

    class_cols = [name for name in df.columns if 'travel class' in name.lower()]
    if class_cols:
        first_class_col = class_cols[0]
        df['travel_class_simple'] = df[first_class_col].astype(str).str.lower()
    return df



In [30]:
# NOTE(review): feature_engineering() returns a copy with the derived columns;
# the result is only displayed here and never assigned, so `df` does not gain
# `total_delay`, `target` or `travel_class_simple`.
feature_engineering(df)

Unnamed: 0,id,Gender,Age,Travel category,Travel Class,Distance Travelled,Departure/Arrival Rating,Booking Ease,Boarding Point,Food,...,Leg Room,Luggage Handling,Cleanliness,Departure Delay (min),Arrival Delay (min),Satisfaction,age_bracket,total_delay,target,travel_class_simple
0,1,Male,27,Personal,Premium,1693,3,3,New Samantha,4,...,4,4,3,19,13,satisfied,18-30,32,1,premium
1,2,Male,35,Business,Premium,1229,5,5,Nicoleville,3,...,4,1,5,13,13,satisfied,31-45,26,1,premium
2,3,Female,71,Personal,Premium,573,3,3,Wilkersonland,3,...,2,3,4,21,0,satisfied,61+,21,1,premium
3,4,Male,19,Business,Business,1640,2,3,Brianville,5,...,5,1,5,30,15,satisfied,18-30,45,1,business
4,5,Male,39,Business,Business,358,2,5,South Donna,2,...,4,3,3,12,30,dissatisfied,31-45,42,0,business
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Male,62,Business,Economy,1284,2,4,North Markside,1,...,5,2,5,14,39,satisfied,61+,53,1,economy
4996,4997,Male,42,Business,Economy,729,3,5,Lake Barbaraland,3,...,3,4,5,22,0,satisfied,31-45,22,1,economy
4997,4998,Female,61,Business,Business,1158,2,2,West Theresaborough,3,...,5,3,2,46,9,satisfied,61+,55,1,business
4998,4999,Female,67,Business,Premium,677,1,1,West Christopher,2,...,3,4,3,23,27,dissatisfied,61+,50,0,premium


In [33]:
# ---------- Preprocessing + modeling pipeline ----------

def build_pipeline(df, target_col='target'):
    """Build an unfitted preprocessing transformer for the feature columns of ``df``.

    Columns named ``id``, ``satisfaction`` or ``target`` (case-insensitive),
    plus ``target_col`` itself, are excluded from the features. The remaining
    columns are split by dtype:

    * numeric columns: median imputation followed by standard scaling;
    * object/category columns: constant ``'Unknown'`` imputation followed by
      one-hot encoding that ignores categories unseen during fitting.

    Columns of any other dtype are dropped by the transformer.

    Args:
        df: DataFrame whose columns determine the feature lists; not modified.
        target_col: Name of the label column to keep out of the features.

    Returns:
        ``(preprocessor, numeric_feats, categorical_feats)``: the unfitted
        ``ColumnTransformer`` and the two lists of column names it selects.
    """
    # Always exclude the identifier and the known label columns; also honour a
    # caller-supplied label name so that it cannot leak into the features.
    excluded = {'id', 'satisfaction', 'target'}
    if target_col is not None:
        excluded.add(str(target_col).lower())
    drop_cols = [c for c in df.columns if str(c).lower() in excluded]
    features = df.drop(columns=drop_cols, errors='ignore')
    numeric_feats = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_feats = features.select_dtypes(include=['object','category']).columns.tolist()

    print('\nNumeric features:', numeric_feats)
    print('Categorical features:', categorical_feats)

    # numeric pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    # categorical pipeline
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_feats),
            ('cat', categorical_transformer, categorical_feats)
        ],
        remainder='drop'
    )

    return preprocessor, numeric_feats, categorical_feats


In [37]:
# Build the unfitted preprocessing transformer and the feature-name lists.
# NOTE(review): `df` here is the frame as loaded plus the `age_bracket` column
# added by age_analysis(); the outputs of handle_missing / handle_outliers /
# feature_engineering above were never assigned back to it.
processor,numeric_feats,categorical_feats=build_pipeline(df)


Numeric features: ['Age', 'Distance Travelled', 'Departure/Arrival Rating', 'Booking Ease', 'Food', 'Seat Comfort', 'Entertainment', 'Leg Room', 'Luggage Handling', 'Cleanliness', 'Departure Delay (min)', 'Arrival Delay (min)']
Categorical features: ['Gender', 'Travel category', 'Travel Class', 'Boarding Point', 'age_bracket']


In [46]:
def train_and_evaluate(df, preprocessor, numeric_feats, categorical_feats, model=None):
    """Fit a preprocessing + classifier pipeline and report hold-out metrics.

    The ``Satisfaction`` column is used as the label. Data is split 80/20 with
    stratification on the label and ``random_state=42``. Accuracy, weighted
    precision/recall/F1, the confusion matrix and the classification report
    are printed for the test split. When the classifier exposes
    ``feature_importances_``, the 20 largest importances are printed as well.

    Args:
        df: DataFrame containing the feature columns and ``Satisfaction``.
        preprocessor: Transformer used as the first pipeline step; the
            importance report expects a ``'cat'`` transformer with an
            ``'onehot'`` step, as produced by ``build_pipeline``.
            NOTE(review): scikit-learn pipelines fit their steps in place, so
            this object is presumably fitted as a side effect — confirm.
        numeric_feats: Names of the numeric feature columns.
        categorical_feats: Names of the categorical feature columns.
        model: Classifier to train. Defaults to a 100-tree random forest with
            ``random_state=42``.

    Returns:
        The fitted ``Pipeline``.

    Raises:
        ValueError: If ``df`` has no ``Satisfaction`` column.
    """
    if 'Satisfaction' not in df.columns:
        raise ValueError('target column not found')
    X = df[numeric_feats + categorical_feats]
    y = df['Satisfaction']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    if model is None:
        model = RandomForestClassifier(n_estimators=100, random_state=42)

    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    print('\n=== Evaluation on test set ===')
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, average='weighted', zero_division=0))
    print('Recall:', recall_score(y_test, y_pred, average='weighted', zero_division=0))
    print('F1:', f1_score(y_test, y_pred, average='weighted', zero_division=0))
    print('\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('\nClassification report:\n', classification_report(y_test, y_pred, zero_division=0))

    # feature importances if applicable
    classifier = pipe.named_steps['classifier']
    if hasattr(classifier, 'feature_importances_'):
        # Recover the expanded one-hot column names. With no categorical
        # features the 'cat' transformer is never fitted, so only query the
        # encoder when there is something for it to have encoded.
        ohe_columns = []
        if categorical_feats:
            ohe = pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
            if hasattr(ohe, 'get_feature_names_out'):
                ohe_columns = list(ohe.get_feature_names_out(categorical_feats))
        feat_names = list(numeric_feats) + ohe_columns
        importances = classifier.feature_importances_
        if len(feat_names) == len(importances):
            fi = pd.Series(importances, index=feat_names)
        else:
            # Names could not be recovered for every transformed column; fall
            # back to positional labels rather than failing after training.
            fi = pd.Series(importances)
        fi = fi.sort_values(ascending=False)
        print('\nTop feature importances:')
        print(fi.head(20))

    return pipe



In [47]:
# Fit and evaluate the default random-forest model on `df`; the fitted pipeline
# that is returned is not bound to a name.
train_and_evaluate(df,processor,numeric_feats,categorical_feats)

Index(['id', 'Gender', 'Age', 'Travel category', 'Travel Class',
       'Distance Travelled', 'Departure/Arrival Rating', 'Booking Ease',
       'Boarding Point', 'Food', 'Seat Comfort', 'Entertainment', 'Leg Room',
       'Luggage Handling', 'Cleanliness', 'Departure Delay (min)',
       'Arrival Delay (min)', 'Satisfaction', 'age_bracket'],
      dtype='object')

=== Evaluation on test set ===
Accuracy: 0.571
Precision: 0.5448517990383022
Recall: 0.571
F1: 0.5237438451376677

Confusion matrix:
 [[ 89 333]
 [ 96 482]]

Classification report:
               precision    recall  f1-score   support

dissatisfied       0.48      0.21      0.29       422
   satisfied       0.59      0.83      0.69       578

    accuracy                           0.57      1000
   macro avg       0.54      0.52      0.49      1000
weighted avg       0.54      0.57      0.52      1000


Top feature importances:
Distance Travelled          0.060772
Departure Delay (min)       0.056691
Seat Comfort           