In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.layers import Concatenate, Dense
from tensorflow.keras.models import Model
import kagglehub


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset (example: historical ticket pricing data)
path = kagglehub.dataset_download("ibrahimelsayed182/plane-ticket-price")
print("Path to dataset files:", path)
data_path = fr'{path}\ticket_pricing_data.csv'
data = pd.read_csv(r'C:\Users\Alex\.cache\kagglehub\datasets\ibrahimelsayed182\plane-ticket-price\versions\1\Data_Train.csv')

Path to dataset files: C:\Users\Alex\.cache\kagglehub\datasets\ibrahimelsayed182\plane-ticket-price\versions\1


In [8]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Dep_hour,Dep_minute,Day_of_Week,Month,Day,Duration_Minutes,Arrival_hour,Arrival_minute
0,3,2019-03-24,0,5,18,1900-01-01 22:20:00,NaT,non-stop,8,3897,24,3,22,20,6,3,24,170,,
1,1,2019-05-01,3,0,84,1900-01-01 05:50:00,1900-01-01 13:15:00,2 stops,8,7662,1,5,5,50,2,5,1,445,13.0,15.0
2,4,2019-06-09,2,1,118,1900-01-01 09:25:00,NaT,2 stops,8,13882,9,6,9,25,6,6,9,1140,,
3,3,2019-05-12,3,0,91,1900-01-01 18:05:00,1900-01-01 23:30:00,1 stop,8,6218,12,5,18,5,6,5,12,325,23.0,30.0
4,3,2019-03-01,0,5,29,1900-01-01 16:50:00,1900-01-01 21:35:00,1 stop,8,13302,1,3,16,50,4,3,1,285,21.0,35.0


In [20]:


# Preprocessing Date and Time columns
data['Date_of_Journey'] = pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y')
data['Journey_day'] = data['Date_of_Journey'].dt.day
data['Journey_month'] = data['Date_of_Journey'].dt.month

# Function to clean and convert time
def clean_and_convert_time(time_str):
    if isinstance(time_str, str) and ' ' in time_str:  # Check if it's a string and contains a space
        return None  # Or assign a default time like '00:00'
    try:
        return pd.to_datetime(time_str, format='%H:%M', errors='coerce')
    except Exception:
        return None  # If conversion fails, return None

data['Dep_Time'] = data['Dep_Time'].apply(clean_and_convert_time)
data['Dep_hour'] = data['Dep_Time'].dt.hour
data['Dep_minute'] = data['Dep_Time'].dt.minute

data['Arrival_Time'] = data['Arrival_Time'].apply(clean_and_convert_time)
data['Arrival_hour'] = data['Arrival_Time'].dt.hour
data['Arrival_minute'] = data['Arrival_Time'].dt.minute

# Handle categorical features
label_encoders = {}
categorical_columns = ['Airline', 'Source', 'Destination', 'Route', 'Additional_Info']

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Save label encoder for possible inverse transformation later

# No need to parse 'Duration_Minutes' as it's already an integer
# Just make sure it's in the right format
if 'Duration_Minutes' in data.columns:
    data['Duration'] = data['Duration_Minutes']
else:
    print("Warning: 'Duration_Minutes' column not found in the DataFrame")

# Separate features and target variable
X = data.drop(['Price', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time'], axis=1)
y = data['Price']

# Normalize numerical columns
scaler = StandardScaler()
X[['Journey_day', 'Journey_month', 'Dep_hour', 'Dep_minute', 'Arrival_hour', 'Arrival_minute', 'Duration']] = scaler.fit_transform(
    X[['Journey_day', 'Journey_month', 'Dep_hour', 'Dep_minute', 'Arrival_hour', 'Arrival_minute', 'Duration']]
)

# Define a mapping dictionary for Total_Stops
stops_mapping = {
    'non-stop': 0,
    '1 stop': 1,
    '2 stops': 2,
    '3 stops': 3,
    '4 stops': 4
}

# Apply the mapping to the Total_Stops column
data['Total_Stops'] = data['Total_Stops'].map(stops_mapping)

# Check if there are any NaN values after mapping
if data['Total_Stops'].isnull().any():
    print("Warning: There are NaN values in the 'Total_Stops' column after mapping.")




In [21]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
def create_model_1(input_shape):
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=input_shape),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Output price
    ])
    model.compile(optimizer='adam', loss='mean_absolute_error')
    return model


In [13]:
def create_model_2(input_shape):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_shape=input_shape),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Output price
    ])
    model.compile(optimizer='adam', loss='mean_absolute_error')
    return model


In [14]:
def create_model_3(input_shape):
    model = models.Sequential([
        layers.Reshape((input_shape[0], 1), input_shape=input_shape),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(128, 3, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)  # Output price
    ])
    model.compile(optimizer='adam', loss='mean_absolute_error')
    return model


In [24]:
input_shape = (X_train.shape[1],)  # Shape of the input features

model1 = create_model_1(input_shape)
model2 = create_model_2(input_shape)
model3 = create_model_3(input_shape)

# Train the models
models_list = [model1, model2, model3]

for i, model in enumerate(models_list):
    print(f"Training Model {i+1}...")
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))


Training Model 1...
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 9099.1279 - val_loss: 9135.4531
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9052.8320 - val_loss: 9134.7041
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9020.8164 - val_loss: 9133.4492
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9023.2715 - val_loss: 9131.4189
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9081.6074 - val_loss: 9128.4219
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9188.1357 - val_loss: 9124.3721
Epoch 7/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9050.3848 - val_loss: 9119.2344
Epoch 8/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 9034.1221 - val_loss: 9113.0107
Epo

In [25]:
def averaging_ensemble(models, input_shape):
    inputs = tf.keras.Input(shape=input_shape)
    outputs = [model(inputs) for model in models]
    avg_output = tf.keras.layers.Average()(outputs)
    ensemble_model = tf.keras.Model(inputs=inputs, outputs=avg_output)
    return ensemble_model

ensemble_avg = averaging_ensemble(models_list, input_shape)
ensemble_avg.compile(optimizer='adam', loss='mean_absolute_error')

# Evaluate the ensemble model
ensemble_avg.evaluate(X_test, y_test)


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 7994.3066 


7919.904296875

In [26]:


def stacking_ensemble(models, input_shape):
    inputs = tf.keras.Input(shape=input_shape)
    outputs = [model(inputs) for model in models]
    concatenated = Concatenate()(outputs)
    
    meta_output = Dense(1)(concatenated)
    stacked_model = Model(inputs=inputs, outputs=meta_output)
    return stacked_model

ensemble_stack = stacking_ensemble(models_list, input_shape)
ensemble_stack.compile(optimizer='adam', loss='mean_absolute_error')

# Evaluate stacking model
ensemble_stack.evaluate(X_test, y_test)


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 8156.1250


8081.72216796875

In [27]:
# Predictions for ensemble models
y_pred_avg = ensemble_avg.predict(X_test)
y_pred_stack = ensemble_stack.predict(X_test)

# Calculate MAE for both models
mae_avg = mean_absolute_error(y_test, y_pred_avg)
mae_stack = mean_absolute_error(y_test, y_pred_stack)

print(f"MAE of Averaging Ensemble: {mae_avg}")
print(f"MAE of Stacking Ensemble: {mae_stack}")


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
MAE of Averaging Ensemble: 7919.908203125
MAE of Stacking Ensemble: 8081.7255859375
