In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib

# Load the dataset: read the Excel workbook into the DataFrame `df`.
# The path is relative, so the file is looked up in the current working directory.
df = pd.read_excel("Data_Train.xlsx")

2025-03-22 06:28:28.219980: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-22 06:28:28.376377: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-22 06:28:28.463170: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742624908.643365   52583 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742624908.693338   52583 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742624909.259154   52583 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [None]:
# Convert 'Duration' to minutes
def convert_duration_to_minutes(duration):
    """Convert a duration label such as ``'2h 50m'`` into whole minutes.

    Parameters
    ----------
    duration : str
        Text made of an optional hours part (``'<n>h'``) followed by an
        optional minutes part (``'<n>m'``), e.g. ``'2h 50m'``, ``'19h'``,
        ``'5m'``. Whitespace between and around the parts is optional.

    Returns
    -------
    int
        Total number of minutes. ``0`` is returned when the value is not a
        string (e.g. ``None`` or ``NaN``) or contains neither an ``h`` nor
        an ``m`` part.

    Raises
    ------
    ValueError
        If the text contains ``'h'`` or ``'m'`` but does not follow the
        ``'<n>h <n>m'`` layout.
    """
    import re  # local import so the function is usable on its own

    # Missing cells arrive as None/NaN; treat them like any other value
    # without a usable duration instead of failing on the `in` checks.
    if not isinstance(duration, str):
        return 0

    # One anchored pattern covers '2h 50m', '2h50m', '19h' and '5m'.
    match = re.fullmatch(r'(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?', duration.strip())
    if match is None:
        if 'h' in duration or 'm' in duration:
            # Looks like a duration but is malformed: fail loudly rather
            # than silently producing a wrong number.
            raise ValueError(f"Unrecognised duration: {duration!r}")
        return 0

    hours_text, minutes_text = match.groups()
    return int(hours_text or 0) * 60 + int(minutes_text or 0)

# --- Feature engineering -------------------------------------------------
# Every derived column is computed from the source columns first and then
# attached in a single assign(); the text columns they came from are dropped
# at the end, so `df` leaves this block holding model-ready columns only.

# Total flight time in minutes, parsed from the 'Duration' labels.
duration_minutes = df['Duration'].apply(convert_duration_to_minutes)

# Calendar features from 'Date_of_Journey' (day/month/year text).
journey_date = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')

# Clock features from 'Dep_Time' and 'Arrival_Time'.
# Only the leading HH:MM is parsed, so a cell with extra text after the time
# no longer makes the strict '%H:%M' parse raise.
# NOTE(review): presumably some arrival cells carry a trailing date such as
# '01:10 22 Mar' -- confirm against Data_Train.xlsx.
clock_pattern = r'(\d{1,2}:\d{2})'
departure_clock = pd.to_datetime(
    df['Dep_Time'].astype(str).str.extract(clock_pattern, expand=False),
    format='%H:%M',
)
arrival_clock = pd.to_datetime(
    df['Arrival_Time'].astype(str).str.extract(clock_pattern, expand=False),
    format='%H:%M',
)

# 'Total_Stops' is later imputed with a median and scaled, so it must be numeric.
# NOTE(review): assumes text labels look like 'non-stop' / '1 stop' / '2 stops'
# -- confirm against Data_Train.xlsx. Labels without a digit become NaN.
total_stops = df['Total_Stops']
if not pd.api.types.is_numeric_dtype(total_stops):
    stops_text = (
        total_stops.astype(str)
        .str.lower()
        .str.replace('non-stop', '0', regex=False)
    )
    total_stops = pd.to_numeric(
        stops_text.str.extract(r'(\d+)', expand=False), errors='coerce'
    )

# Attach the new features and drop unnecessary columns without mutating in place.
df = df.assign(
    Total_Stops=total_stops,
    Duration_Minutes=duration_minutes,
    Journey_Month=journey_date.dt.month,
    Journey_Day=journey_date.dt.day,
    Journey_DayOfWeek=journey_date.dt.dayofweek,
    Dep_Hour=departure_clock.dt.hour,
    Dep_Minute=departure_clock.dt.minute,
    Arrival_Hour=arrival_clock.dt.hour,
    Arrival_Minute=arrival_clock.dt.minute,
).drop(columns=['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration', 'Additional_Info'])

# Separate features and target
X = df.drop(columns=['Price'])
y = df['Price']

# Column groups: categorical columns are one-hot encoded, numerical ones scaled
categorical_features = ['Airline', 'Source', 'Destination']
numerical_features = ['Total_Stops', 'Duration_Minutes', 'Journey_Month', 'Journey_Day', 'Journey_DayOfWeek',
                      'Dep_Hour', 'Dep_Minute', 'Arrival_Hour', 'Arrival_Minute']

# Numerical branch: fill gaps with the median, then standardise
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are encoded as all zeros
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    # Always return a dense array so the network receives a plain NumPy matrix
    sparse_threshold=0,
)

# Split BEFORE fitting the preprocessor. Fitting the imputers, scaler and
# encoder on the full dataset would leak test-set statistics into training
# and make the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn preprocessing parameters from the training rows only, then reuse them
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full processed matrix (using the train-only fit), kept so any later cell
# that refers to X_processed continues to work
X_processed = preprocessor.transform(X)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across Restart & Run All
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Build the Advanced Deep Learning Model: a fully connected regressor that
# narrows 256 -> 128 -> 64 -> 32 -> 1, with batch normalisation and dropout
# after each of the first three hidden layers
model = Sequential([
    # Explicit Input layer instead of `input_shape=` on the first Dense layer
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # Single linear unit: the predicted price
])

# Compile the model: optimise MSE and additionally report MAE
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Callbacks, both driven by the validation loss:
#  - stop after 10 epochs without improvement and restore the best weights
#  - multiply the learning rate by 0.2 after 5 stagnant epochs (floor 1e-4)
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
]

# Train the model; 20% of the training data is held out for validation
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

# Save the trained model
model.save("advanced_flight_price_forecast_model.h5")

# Save the fitted preprocessor so the same transformation can be reapplied later
joblib.dump(preprocessor, "advanced_preprocessor.pkl")

# Run the trained network on the held-out rows
y_pred = model.predict(X_test)

# Regression metrics on the test set; RMSE is the square root of the MSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report one metric per line
print(f"RMSE: {rmse}", f"MAE: {mae}", f"R²: {r2}", sep="\n")

# Figure 1: actual vs predicted prices, with the y = x line marking a perfect fit
fig_prices, ax_prices = plt.subplots(figsize=(10, 6))
ax_prices.scatter(y_test, y_pred, alpha=0.5, color='blue', label='Predicted vs Actual')
price_bounds = [y_test.min(), y_test.max()]
ax_prices.plot(price_bounds, price_bounds, 'k--', lw=2, label='Ideal Line')
ax_prices.set_xlabel('Actual Price')
ax_prices.set_ylabel('Predicted Price')
ax_prices.set_title('Actual vs Predicted Flight Prices')
ax_prices.legend()
ax_prices.grid(True)
fig_prices.savefig("advanced_actual_vs_predicted_prices.png")  # Write the figure to disk
plt.show()

# Figure 2: training and validation loss per epoch
fig_loss, ax_loss = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax_loss.plot(history.history[history_key], label=curve_label)
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Learning Curves')
ax_loss.legend()
ax_loss.grid(True)
fig_loss.savefig("learning_curves.png")  # Write the figure to disk
plt.show()
