In [6]:
# Data Processing & Feature Engineering
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Tree-based Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Deep Learning (RNN, LSTM) - TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout

# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

# Custom Modules
from data_preprocessing import load_data, apply_feature_engineering

In [8]:
df = load_data()

In [9]:
df_eng = apply_feature_engineering(df)

Starting feature engineering...
Converting date columns...
Extracting travel duration...
Imputing missing travel distances...
Processing departure times...
Processing airline and cabin class codes...
Applying Label Encoding...
Label Encoding complete!
Calculating days to departure...
Processing holiday features...
Dropping unnecessary columns...
Feature engineering complete!


In [10]:
df_eng.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isRefundable,isNonStop,totalFare,seatsRemaining,airlineCode,cabinClass,travelDistance,departureTimeHour,daysToDeparture,departureDayOfWeek,isWeekend,isHoliday,nearHoliday
0,2022-04-16,2022-04-17,0,1,149,False,True,248.6,9,70,42,947.0,12,1,6,True,True,True
1,2022-04-16,2022-04-17,0,1,150,False,True,248.6,4,70,42,947.0,6,1,6,True,True,True
2,2022-04-16,2022-04-17,0,1,150,False,True,248.6,9,70,42,947.0,11,1,6,True,True,True
3,2022-04-16,2022-04-17,0,1,152,False,True,248.6,8,70,42,947.0,13,1,6,True,True,True
4,2022-04-16,2022-04-17,0,1,154,False,True,248.6,9,70,42,947.0,9,1,6,True,True,True


In [11]:
# Sorted DataFrame
sorted_df = df_eng.sort_values(
    by=['flightDate', 'departureTimeHour', 'startingAirport', 'destinationAirport', 'daysToDeparture'],
    ascending=[True, True, True, True, False]
)

sorted_df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isRefundable,isNonStop,totalFare,seatsRemaining,airlineCode,cabinClass,travelDistance,departureTimeHour,daysToDeparture,departureDayOfWeek,isWeekend,isHoliday,nearHoliday
1533,2022-04-16,2022-04-17,3,0,172,False,True,273.99,4,98,42,1207.0,0,1,6,True,True,True
1534,2022-04-16,2022-04-17,3,0,160,False,True,296.61,9,70,42,1207.0,0,1,6,True,True,True
1546,2022-04-16,2022-04-17,3,0,462,False,False,304.11,3,34,15,1574.0,0,1,6,True,True,True
1573,2022-04-16,2022-04-17,3,1,431,False,False,321.1,4,34,15,2074.0,0,1,6,True,True,True
1582,2022-04-16,2022-04-17,3,1,385,False,False,322.6,9,83,15,2154.0,0,1,6,True,True,True


In [12]:
# Unique combinations
unique_sorted_df = sorted_df.drop_duplicates(
    subset=['flightDate', 'departureTimeHour', 'startingAirport', 'destinationAirport']
)

In [13]:
sorted_df[['flightDate', 'departureTimeHour', 'startingAirport', 'destinationAirport', 'daysToDeparture']].head(60)

Unnamed: 0,flightDate,departureTimeHour,startingAirport,destinationAirport,daysToDeparture
1533,2022-04-17,0,3,0,1
1534,2022-04-17,0,3,0,1
1546,2022-04-17,0,3,0,1
1573,2022-04-17,0,3,1,1
1582,2022-04-17,0,3,1,1
1584,2022-04-17,0,3,1,1
1626,2022-04-17,0,3,2,1
1630,2022-04-17,0,3,2,1
1631,2022-04-17,0,3,2,1
1675,2022-04-17,0,3,4,1


In [14]:
sorted_df[['flightDate', 'departureTimeHour', 'startingAirport', 'destinationAirport', 'daysToDeparture']].iloc[40000000:40000030]

Unnamed: 0,flightDate,departureTimeHour,startingAirport,destinationAirport,daysToDeparture
30194666,2022-08-12,14,14,4,47
30724300,2022-08-12,14,14,4,46
31265936,2022-08-12,14,14,4,45
31817281,2022-08-12,14,14,4,44
32370436,2022-08-12,14,14,4,43
32925772,2022-08-12,14,14,4,42
33485408,2022-08-12,14,14,4,41
34035168,2022-08-12,14,14,4,40
34583271,2022-08-12,14,14,4,39
35253331,2022-08-12,14,14,4,38


In [15]:
from tqdm import tqdm

def add_historical_price_features(df):
    """
    Adding historical price features (t-1, t-2, ..., t-7) to the dataframe.
    These features represent the price of the same flight when searched 
    1, 2, ..., 7 days before the current search date.
    """
    
    # DF copy
    result_df = df.copy()
    
    # New columns with NaN
    for i in range(1, 8):
        result_df[f'price_t_minus_{i}'] = np.nan
    
    # Dictonary key (flightDate, departureTimeHour, startingAirport, destinationAirport) and prices as values
    flight_price_dict = {}
    
    # Filling the dictionary
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Building price dictionary"):
        flight_key = (
            row['flightDate'],
            row['departureTimeHour'],
            row['startingAirport'],
            row['destinationAirport']
        )
        
        search_date = row['searchDate']
        price = row['totalFare']
        
        if flight_key not in flight_price_dict:
            flight_price_dict[flight_key] = {}
        
        flight_price_dict[flight_key][search_date] = price
    
    #  Historical prices
    for idx, row in tqdm(result_df.iterrows(), total=len(result_df), desc="Filling historical prices"):
        flight_key = (
            row['flightDate'],
            row['departureTimeHour'],
            row['startingAirport'],
            row['destinationAirport']
        )
        
        current_search_date = row['searchDate']
        
        # Checking if we have price data for this flight
        if flight_key in flight_price_dict:
            # For each t-i, checking if we have data from i days ago
            for i in range(1, 8):
                prev_date = current_search_date - pd.Timedelta(days=i)
                
                if prev_date in flight_price_dict[flight_key]:
                    result_df.at[idx, f'price_t_minus_{i}'] = flight_price_dict[flight_key][prev_date]
    
    print("Historical price features added successfully!")
    return result_df

In [16]:
df_with_history = add_historical_price_features(df_eng)

Building price dictionary: 100%|██████████| 82138753/82138753 [24:32<00:00, 55770.66it/s]
Filling historical prices:   0%|          | 0/82138753 [00:00<?, ?it/s]

: 

In [None]:
# Checking 
columns_to_show = ['searchDate', 'flightDate', 'startingAirport', 'destinationAirport', 
                   'departureTimeHour', 'totalFare', 'price_t_minus_1', 'price_t_minus_2', 
                   'price_t_minus_3', 'price_t_minus_4', 'price_t_minus_5', 'price_t_minus_6', 
                   'price_t_minus_7']
print("\nSample data with historical prices:")
display(df_with_history[columns_to_show].head(10))

#  Number of on-null values for each historical price
print("\nCoverage statistics:")
non_null_counts = {}
for i in range(1, 8):
    col = f'price_t_minus_{i}'
    non_null_counts[col] = df_with_history[col].notna().sum()
    non_null_percentage = (non_null_counts[col] / len(df_with_history)) * 100
    print(f"{col}: {non_null_counts[col]} non-null values ({non_null_percentage:.2f}%)")