In [1]:
# Data Processing & Feature Engineering
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Tree-based Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor

# Deep Learning (RNN, LSTM) - TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout

# Visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

# Custom Modules
from data_preprocessing import load_data, apply_feature_engineering


In [2]:
df = pd.read_csv('data/fe_sample2.csv')
df = df.drop(columns= 'Unnamed: 0')

FileNotFoundError: [Errno 2] No such file or directory: 'data/fe_sample2.csv'

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   searchDate                800000 non-null  object 
 1   flightDate                800000 non-null  object 
 2   startingAirport           800000 non-null  object 
 3   destinationAirport        800000 non-null  object 
 4   travelDuration            800000 non-null  object 
 5   isBasicEconomy            800000 non-null  bool   
 6   isRefundable              800000 non-null  bool   
 7   isNonStop                 800000 non-null  bool   
 8   totalFare                 800000 non-null  float64
 9   seatsRemaining            800000 non-null  int64  
 10  totalTravelDistance       751962 non-null  float64
 11  segmentsDepartureTimeRaw  800000 non-null  object 
 12  segmentsAirlineCode       800000 non-null  object 
 13  segmentsCabinCode         800000 non-null  o

In [5]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeRaw,segmentsAirlineCode,segmentsCabinCode
0,2022-04-16,2022-04-17,ATL,BOS,PT2H29M,False,False,True,248.6,9,947.0,2022-04-17T12:57:00.000-04:00,DL,coach
1,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,4,947.0,2022-04-17T06:30:00.000-04:00,DL,coach
2,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,9,947.0,2022-04-17T11:35:00.000-04:00,DL,coach
3,2022-04-16,2022-04-17,ATL,BOS,PT2H32M,False,False,True,248.6,8,947.0,2022-04-17T13:59:00.000-04:00,DL,coach
4,2022-04-16,2022-04-17,ATL,BOS,PT2H34M,False,False,True,248.6,9,947.0,2022-04-17T09:59:00.000-04:00,DL,coach


In [4]:
print(df.head(3))

   searchDate  flightDate startingAirport destinationAirport travelDuration  \
0  2022-04-16  2022-04-17             ATL                BOS        PT2H29M   
1  2022-04-16  2022-04-17             ATL                BOS        PT2H30M   
2  2022-04-16  2022-04-17             ATL                BOS        PT2H30M   

   isBasicEconomy  isRefundable  isNonStop  totalFare  seatsRemaining  \
0           False         False       True      248.6               9   
1           False         False       True      248.6               4   
2           False         False       True      248.6               9   

   totalTravelDistance       segmentsDepartureTimeRaw segmentsAirlineCode  \
0                947.0  2022-04-17T12:57:00.000-04:00                  DL   
1                947.0  2022-04-17T06:30:00.000-04:00                  DL   
2                947.0  2022-04-17T11:35:00.000-04:00                  DL   

  segmentsCabinCode  
0             coach  
1             coach  
2             c

In [5]:
from data_preprocessing import load_data, apply_feature_engineering

In [54]:
df_eng = apply_feature_engineering(df)

Starting feature engineering...
Converting date columns...
Extracting travel duration...
Imputing missing travel distances...
Processing departure times...
Extracting departure hour and float...
Processing airline and cabin class codes...
Applying Label Encoding...
Label Encoding complete!
Calculating days to departure...
Processing holiday features...
Dropping unnecessary columns...
Feature engineering complete!


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   searchDate          800000 non-null  datetime64[ns]
 1   flightDate          800000 non-null  datetime64[ns]
 2   startingAirport     800000 non-null  int64         
 3   destinationAirport  800000 non-null  int64         
 4   travelDuration      800000 non-null  int64         
 5   isRefundable        800000 non-null  bool          
 6   isNonStop           800000 non-null  bool          
 7   totalFare           800000 non-null  float64       
 8   seatsRemaining      800000 non-null  int64         
 9   airlineCode         800000 non-null  int64         
 10  cabinClass          800000 non-null  int64         
 11  travelDistance      800000 non-null  float64       
 12  departureTimeHour   800000 non-null  int32         
 13  departureTimeFloat  800000 no

In [55]:
df = df_eng

In [56]:
# Step 1: Aggregate time into flightDate
# Convert departureTimeFloat into hours and minutes, then add it to flightDate
df['flightDate'] = df['flightDate'] + pd.to_timedelta(df['departureTimeFloat'] * 60, unit='m')

# Ensure datetime format only includes hours and minutes
df['flightDate'] = pd.to_datetime(df['flightDate']).dt.strftime('%Y-%m-%d %H:%M')

# Step 2: Group by flightDate (including time), startingAirport, and destinationAirport
# Step 3: Keep the cheapest fare for each group
df = df.loc[df.groupby(['flightDate', 'startingAirport', 'destinationAirport'])['totalFare'].idxmin()].reset_index(drop=True)

# Step 4: Sort in descending order of daysToDeparture
df = df.sort_values(by=['flightDate', 'daysToDeparture'], ascending=[True, False])

In [57]:
df.head(20)

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isRefundable,isNonStop,totalFare,seatsRemaining,airlineCode,cabinClass,travelDistance,departureTimeHour,departureTimeFloat,daysToDeparture,departureDayOfWeek,isWeekend,isHoliday,nearHoliday
0,2022-04-16,2022-04-17 00:00,1,3,282,False,True,313.6,7,41,25,1763.0,0,0.0,1,6,True,True,True
1,2022-04-16,2022-04-17 00:00,8,0,692,False,False,690.2,5,55,12,1087.0,0,0.0,1,6,True,True,True
2,2022-04-16,2022-04-17 00:00,8,3,283,False,True,238.6,3,47,25,1621.0,0,0.0,1,6,True,True,True
3,2022-04-16,2022-04-17 00:00,8,5,218,False,False,318.6,1,24,12,612.0,0,0.0,1,6,True,True,True
4,2022-04-16,2022-04-17 00:00,10,0,159,False,True,348.6,4,47,25,762.0,0,0.0,1,6,True,True,True
5,2022-04-16,2022-04-17 00:00,10,5,127,False,True,248.6,9,47,25,485.0,0,0.0,1,6,True,True,True
6,2022-04-16,2022-04-17 00:00,10,11,201,False,True,371.6,1,19,25,1104.0,0,0.0,1,6,True,True,True
7,2022-04-16,2022-04-17 00:00,10,12,1331,False,False,899.3,1,88,14,3140.0,0,0.0,1,6,True,True,True
8,2022-04-16,2022-04-17 00:00,10,14,263,False,False,698.6,9,55,12,932.0,0,0.0,1,6,True,True,True
9,2022-04-16,2022-04-17 00:00,11,3,972,False,False,1738.19,3,36,12,3100.0,0,0.0,1,6,True,True,True


In [46]:
df= df_eng

In [49]:
df['flightDate'] = df['flightDate'] + pd.to_timedelta(df['departureTimeFloat'].round(2) * 60, unit='m')
df['flightDate'] = df['flightDate'].dt.strftime('%Y-%m-%d %H:%M')

In [50]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isRefundable,isNonStop,totalFare,seatsRemaining,airlineCode,cabinClass,travelDistance,departureTimeHour,departureTimeFloat,daysToDeparture,departureDayOfWeek,isWeekend,isHoliday,nearHoliday
0,2022-04-16,2022-04-20 12:45,0,1,149,False,True,248.6,9,47,25,947.0,16,16.95,1,6,True,True,True
1,2022-04-16,2022-04-19 04:30,0,1,150,False,True,248.6,4,47,25,947.0,10,10.5,1,6,True,True,True
2,2022-04-16,2022-04-20 05:54,0,1,150,False,True,248.6,9,47,25,947.0,15,15.583333,1,6,True,True,True
3,2022-04-16,2022-04-20 17:54,0,1,152,False,True,248.6,8,47,25,947.0,17,17.983333,1,6,True,True,True
4,2022-04-16,2022-04-19 21:54,0,1,154,False,True,248.6,9,47,25,947.0,13,13.983333,1,6,True,True,True


In [51]:
df = df.sort_values(by=['totalFare']).groupby(['flightDate', 'startingAirport', 'destinationAirport']).first().reset_index()
df = df.sort_values(by=['daysToDeparture'], ascending=False)

In [52]:
df[['flightDate', 'startingAirport', 'destinationAirport', 'daysToDeparture']].head(20)

Unnamed: 0,flightDate,startingAirport,destinationAirport,daysToDeparture
389573,2022-06-20 23:54,11,13,60
371847,2022-06-16 20:00,15,8,60
371846,2022-06-16 20:00,15,6,60
371845,2022-06-16 20:00,15,0,60
371839,2022-06-16 20:00,9,14,60
371838,2022-06-16 20:00,9,13,60
384451,2022-06-19 01:00,11,8,60
384452,2022-06-19 01:00,11,13,60
384453,2022-06-19 01:00,11,15,60
371837,2022-06-16 20:00,9,10,60


In [15]:
df_sorted = df_eng.groupby(by=['flightDate', 'startingAirport', 'destinationAirport'], group_keys=False).apply(lambda x: x.sort_values(by='daysToDeparture'))


In [18]:
df_sorted[['flightDate', 'searchDate', 'startingAirport', 'destinationAirport', 'daysToDeparture', 'airlineCode']].head(20)

Unnamed: 0,flightDate,searchDate,startingAirport,destinationAirport,daysToDeparture,airlineCode
0,2022-04-17,2022-04-16,0,1,1,47
20,2022-04-17,2022-04-16,0,1,1,24
21,2022-04-17,2022-04-16,0,1,1,24
22,2022-04-17,2022-04-16,0,1,1,90
23,2022-04-17,2022-04-16,0,1,1,90
24,2022-04-17,2022-04-16,0,1,1,90
25,2022-04-17,2022-04-16,0,1,1,73
26,2022-04-17,2022-04-16,0,1,1,90
27,2022-04-17,2022-04-16,0,1,1,90
28,2022-04-17,2022-04-16,0,1,1,47


In [22]:
df_cheapest = df_eng.loc[
    df_eng.groupby(['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture'])['totalFare'].idxmin()
].reset_index(drop=True)

print("Shape after filtering:", df_cheapest.shape)
df_cheapest[['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture']].head(50)

Shape after filtering: (20841, 19)


Unnamed: 0,startingAirport,destinationAirport,flightDate,daysToDeparture
0,0,1,2022-04-17,1
1,0,1,2022-04-18,1
2,0,1,2022-04-18,2
3,0,1,2022-04-19,1
4,0,1,2022-04-19,2
5,0,1,2022-04-19,3
6,0,1,2022-04-20,2
7,0,1,2022-04-20,3
8,0,1,2022-04-20,4
9,0,1,2022-04-21,3


In [35]:
df_cheapgroup = df_cheapest.sort_values(by= ['flightDate', 'startingAirport', 'destinationAirport'])[['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture']]

In [36]:
df_cheapgroup

Unnamed: 0,startingAirport,destinationAirport,flightDate,daysToDeparture
0,0,1,2022-04-17,1
90,0,2,2022-04-17,1
180,0,3,2022-04-17,1
270,0,4,2022-04-17,1
360,0,5,2022-04-17,1
...,...,...,...,...
20484,15,10,2022-06-16,60
20573,15,11,2022-06-16,60
20662,15,12,2022-06-16,60
20751,15,13,2022-06-16,60


In [26]:
df_sorted[['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture']].head(20)

Unnamed: 0,startingAirport,destinationAirport,flightDate,daysToDeparture
0,0,1,2022-04-17,1
90,0,2,2022-04-17,1
180,0,3,2022-04-17,1
270,0,4,2022-04-17,1
360,0,5,2022-04-17,1
450,0,6,2022-04-17,1
540,0,7,2022-04-17,1
630,0,8,2022-04-17,1
720,0,9,2022-04-17,1
810,0,10,2022-04-17,1


In [8]:
print(df_eng.head(3))

  searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
0 2022-04-16 2022-04-17                0                   1             149   
1 2022-04-16 2022-04-17                0                   1             150   
2 2022-04-16 2022-04-17                0                   1             150   

   isRefundable  isNonStop  totalFare  seatsRemaining  airlineCode  \
0         False       True      248.6               9           47   
1         False       True      248.6               4           47   
2         False       True      248.6               9           47   

   cabinClass  travelDistance  departureTimeHour  departureTimeFloat  \
0          25           947.0                 16           16.950000   
1          25           947.0                 10           10.500000   
2          25           947.0                 15           15.583333   

   daysToDeparture  departureDayOfWeek  isWeekend  isHoliday  nearHoliday  
0                1               

In [10]:
# Sort the data before applying group-based operations
df_eng = df_eng.sort_values(by=['startingAirport', 'destinationAirport', 'flightDate', 'airlineCode', 'cabinClass', 'daysToDeparture'])
print(df_eng.head(10))

   searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
16 2022-04-16 2022-04-17                0                   1             158   
35 2022-04-16 2022-04-17                0                   1             164   
6  2022-04-16 2022-04-17                0                   1             252   
7  2022-04-16 2022-04-17                0                   1             318   
8  2022-04-16 2022-04-17                0                   1             332   
9  2022-04-16 2022-04-17                0                   1             398   
11 2022-04-16 2022-04-17                0                   1             345   
12 2022-04-16 2022-04-17                0                   1             359   
13 2022-04-16 2022-04-17                0                   1             438   
17 2022-04-16 2022-04-17                0                   1             257   

    isRefundable  isNonStop  totalFare  seatsRemaining  airlineCode  \
16         False       True     300.1

In [11]:
df_eng['daysToDeparture'].unique()

<IntegerArray>
[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
 58, 59, 60]
Length: 60, dtype: Int64

In [12]:
print(df_eng.shape)

(800000, 19)


In [13]:
# Define how many days of lag to create
lag_days = 7

# Create lag features within each flight group
for lag in range(1, lag_days + 1):
    df_eng[f'totalFare_T-{lag}'] = df_eng.groupby(
        ['startingAirport', 'destinationAirport', 'flightDate', 'airlineCode', 'cabinClass']
    )['totalFare'].shift(lag)

# Drop rows where any lag feature is missing (ensuring full 7-day history)
df_eng = df_eng.dropna(subset=[f'totalFare_T-{lag}' for lag in range(1, lag_days + 1)])

# Reset index after dropping rows
df_eng = df_eng.reset_index(drop=True)


In [14]:
print(df_eng.head(10))

  searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
0 2022-04-16 2022-04-17                0                   1             257   
1 2022-04-16 2022-04-17                0                   1             276   
2 2022-04-16 2022-04-17                0                   1             285   
3 2022-04-16 2022-04-17                0                   1             362   
4 2022-04-16 2022-04-17                0                   1             374   
5 2022-04-16 2022-04-17                0                   1             155   
6 2022-04-16 2022-04-17                0                   1             158   
7 2022-04-17 2022-04-18                0                   1             438   
8 2022-04-17 2022-04-18                0                   1             345   
9 2022-04-17 2022-04-18                0                   1             359   

   isRefundable  isNonStop  totalFare  seatsRemaining  airlineCode  ...  \
0         False      False     302.11       

In [16]:
print(df_eng.shape)

(299995, 26)


In [17]:
# Check number of unique airlines per (startingAirport, destinationAirport, flightDate)
airline_counts = df_eng.groupby(['startingAirport', 'destinationAirport', 'flightDate'])['airlineCode'].nunique()

# Check number of unique cabin classes per (startingAirport, destinationAirport, flightDate)
cabin_counts = df_eng.groupby(['startingAirport', 'destinationAirport', 'flightDate'])['cabinClass'].nunique()

# Display summary stats
print("Airline count per flight route:")
print(airline_counts.describe())  # Check min, max, mean, std

print("\nCabin class count per flight route:")
print(cabin_counts.describe())  # Check min, max, mean, std

# Check sample cases where there's only 1 airline per flight route
print("\nSample rows with only 1 airline per flight route:")
print(airline_counts[airline_counts == 1].head(10))

# Check sample cases where there's only 1 cabin class per flight route
print("\nSample rows with only 1 cabin class per flight route:")
print(cabin_counts[cabin_counts == 1].head(10))

Airline count per flight route:
count    12018.000000
mean         2.760359
std          1.599176
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         11.000000
Name: airlineCode, dtype: float64

Cabin class count per flight route:
count    12018.000000
mean         1.625395
std          0.701378
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          5.000000
Name: cabinClass, dtype: float64

Sample rows with only 1 airline per flight route:
startingAirport  destinationAirport  flightDate
0                1                   2022-06-04    1
                                     2022-06-07    1
                                     2022-06-10    1
                                     2022-06-11    1
                 2                   2022-05-09    1
                                     2022-05-12    1
                                     2022-05-17    1
                                     2022-

In [20]:
df_eng = apply_feature_engineering(df)

Starting feature engineering...
Converting date columns...
Extracting travel duration...
Imputing missing travel distances...
Processing departure times...
Extracting departure hour and float...
Processing airline and cabin class codes...
Applying Label Encoding...
Label Encoding complete!
Calculating days to departure...
Processing holiday features...
Dropping unnecessary columns...
Feature engineering complete!


In [None]:
# df_eng_copy = df_eng
# df_eng= df_eng_copy

In [65]:
df_eng = df_eng.sort_values(by=['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture', 'totalFare'])
print(df_eng.head(10))  # Show first 10 rows to confirm sorting

  searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
0 2022-04-16 2022-04-17                0                   1             149   
1 2022-04-16 2022-04-17                0                   1             150   
2 2022-04-16 2022-04-17                0                   1             150   
3 2022-04-16 2022-04-17                0                   1             152   
4 2022-04-16 2022-04-17                0                   1             154   
5 2022-04-16 2022-04-17                0                   1             158   
6 2022-04-16 2022-04-17                0                   1             252   
7 2022-04-16 2022-04-17                0                   1             318   
8 2022-04-16 2022-04-17                0                   1             332   
9 2022-04-16 2022-04-17                0                   1             398   

   isRefundable  isNonStop  totalFare  seatsRemaining  airlineCode  \
0         False       True      248.6            

In [66]:
df_cheapest = df_eng.loc[
    df_eng.groupby(['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture'])['totalFare'].idxmin()
].reset_index(drop=True)

print("Shape after filtering:", df_cheapest.shape)
print(df_cheapest.head(10))


Shape after filtering: (20841, 19)
  searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
0 2022-04-16 2022-04-17                0                   1             149   
1 2022-04-17 2022-04-18                0                   1             152   
2 2022-04-16 2022-04-18                0                   1             150   
3 2022-04-18 2022-04-19                0                   1             149   
4 2022-04-17 2022-04-19                0                   1             164   
5 2022-04-16 2022-04-19                0                   1             164   
6 2022-04-18 2022-04-20                0                   1             318   
7 2022-04-17 2022-04-20                0                   1             318   
8 2022-04-16 2022-04-20                0                   1             252   
9 2022-04-18 2022-04-21                0                   1             318   

   isRefundable  isNonStop  totalFare  seatsRemaining  airlineCode  \
0         Fals

In [67]:
df_cheapest = df_cheapest.drop(columns=['airlineCode', 'cabinClass'])

print("Shape after dropping airline & cabin class:", df_cheapest.shape)
print(df_cheapest.head(10))


Shape after dropping airline & cabin class: (20841, 17)
  searchDate flightDate  startingAirport  destinationAirport  travelDuration  \
0 2022-04-16 2022-04-17                0                   1             149   
1 2022-04-17 2022-04-18                0                   1             152   
2 2022-04-16 2022-04-18                0                   1             150   
3 2022-04-18 2022-04-19                0                   1             149   
4 2022-04-17 2022-04-19                0                   1             164   
5 2022-04-16 2022-04-19                0                   1             164   
6 2022-04-18 2022-04-20                0                   1             318   
7 2022-04-17 2022-04-20                0                   1             318   
8 2022-04-16 2022-04-20                0                   1             252   
9 2022-04-18 2022-04-21                0                   1             318   

   isRefundable  isNonStop  totalFare  seatsRemaining  travelDi

In [68]:
df_cheapest.groupby(['startingAirport', 'destinationAirport', 'flightDate'])['daysToDeparture'].median().describe()

count      14270.0
mean     29.927786
std      17.643629
min            1.0
25%           14.5
50%           30.0
75%           45.0
max           60.0
Name: daysToDeparture, dtype: Float64

In [69]:
df_cheapest.groupby(['startingAirport', 'destinationAirport'])['daysToDeparture'].nunique().describe()

count    234.000000
mean      59.982906
std        0.129900
min       59.000000
25%       60.000000
50%       60.000000
75%       60.000000
max       60.000000
Name: daysToDeparture, dtype: float64

In [70]:
print(df_cheapest.groupby(['startingAirport', 'destinationAirport', 'flightDate'])['daysToDeparture'].describe())

                                               count  mean       std   min  \
startingAirport destinationAirport flightDate                                
0               1                  2022-04-17    1.0   1.0      <NA>   1.0   
                                   2022-04-18    2.0   1.5  0.707107   1.0   
                                   2022-04-19    3.0   2.0       1.0   1.0   
                                   2022-04-20    3.0   3.0       1.0   2.0   
                                   2022-04-21    3.0   4.0       1.0   3.0   
...                                              ...   ...       ...   ...   
15              14                 2022-06-12    1.0  56.0      <NA>  56.0   
                                   2022-06-13    1.0  57.0      <NA>  57.0   
                                   2022-06-14    1.0  58.0      <NA>  58.0   
                                   2022-06-15    1.0  59.0      <NA>  59.0   
                                   2022-06-16    1.0  60.0      

In [71]:
# Group by (startingAirport, destinationAirport, flightDate)
df_routedays = df_cheapest.groupby(['startingAirport', 'destinationAirport', 'flightDate'])

# Filter out groups where min(daysToDeparture) is 30 or less
valid_routes = df_routedays.filter(lambda x: x['daysToDeparture'].min() > 30).reset_index()

# Print the number of valid routes and some sample data
print("Number of valid routes:", valid_routes.shape[0])
print(valid_routes.head(20))


Number of valid routes: 7016
    index searchDate flightDate  startingAirport  destinationAirport  \
0      60 2022-04-17 2022-05-18                0                   1   
1      61 2022-04-17 2022-05-19                0                   1   
2      62 2022-04-17 2022-05-20                0                   1   
3      63 2022-04-17 2022-05-21                0                   1   
4      64 2022-04-17 2022-05-22                0                   1   
5      65 2022-04-17 2022-05-23                0                   1   
6      66 2022-04-17 2022-05-24                0                   1   
7      67 2022-04-17 2022-05-25                0                   1   
8      68 2022-04-17 2022-05-26                0                   1   
9      69 2022-04-17 2022-05-27                0                   1   
10     70 2022-04-17 2022-05-28                0                   1   
11     71 2022-04-17 2022-05-29                0                   1   
12     72 2022-04-17 2022-05-30    

In [72]:
# Check unique daysToDeparture per (startingAirport, destinationAirport)
df_days_per_route = valid_routes.groupby(['startingAirport', 'destinationAirport'])['daysToDeparture'].nunique()

# Summary statistics
print("📊 Unique daysToDeparture per (route):")
print(df_days_per_route.describe())

# Check how many routes have 7+ unique daysToDeparture
print("\n🔍 Routes with at least 7 daysToDeparture:")
print(df_days_per_route[df_days_per_route >= 7])


📊 Unique daysToDeparture per (route):
count    234.000000
mean      29.982906
std        0.129900
min       29.000000
25%       30.000000
50%       30.000000
75%       30.000000
max       30.000000
Name: daysToDeparture, dtype: float64

🔍 Routes with at least 7 daysToDeparture:
startingAirport  destinationAirport
0                1                     30
                 2                     30
                 3                     30
                 4                     30
                 5                     30
                                       ..
15               10                    30
                 11                    30
                 12                    30
                 13                    30
                 14                    30
Name: daysToDeparture, Length: 234, dtype: int64


In [73]:
# Keep only the 234 valid routes
df_lags = valid_routes[
    valid_routes.set_index(['startingAirport', 'destinationAirport']).index.isin(df_days_per_route[df_days_per_route >= 30].index)
].copy()

# Ensure sorting before computing lags
df_lags = df_lags.sort_values(by=['startingAirport', 'destinationAirport', 'flightDate', 'daysToDeparture'])

# Compute lagged price features
lag_days = 7
for lag in range(1, lag_days + 1):
    df_lags[f'totalFare_T-{lag}'] = df_lags.groupby(
        ['startingAirport', 'destinationAirport']
    )['totalFare'].shift(lag)

# Check NaNs per lag column
print("NaN counts per lag feature AFTER FIX:")
print(df_lags[[f'totalFare_T-{lag}' for lag in range(1, 8)]].isna().sum())

# Display a sample of the dataset with lagged features
df_lags[['flightDate', 'startingAirport', 'destinationAirport', 'daysToDeparture', 'totalFare'] + [f'totalFare_T-{lag}' for lag in range(1, 8)]].head(20)


NaN counts per lag feature AFTER FIX:
totalFare_T-1     230
totalFare_T-2     460
totalFare_T-3     690
totalFare_T-4     920
totalFare_T-5    1150
totalFare_T-6    1380
totalFare_T-7    1610
dtype: int64


Unnamed: 0,flightDate,daysToDeparture,totalFare,totalFare_T-1,totalFare_T-2,totalFare_T-3,totalFare_T-4,totalFare_T-5,totalFare_T-6,totalFare_T-7
0,2022-05-18,31,66.97,,,,,,,
1,2022-05-19,32,87.59,66.97,,,,,,
2,2022-05-20,33,87.59,87.59,66.97,,,,,
3,2022-05-21,34,91.68,87.59,87.59,66.97,,,,
4,2022-05-22,35,130.59,91.68,87.59,87.59,66.97,,,
5,2022-05-23,36,160.59,130.59,91.68,87.59,87.59,66.97,,
6,2022-05-24,37,66.97,160.59,130.59,91.68,87.59,87.59,66.97,
7,2022-05-25,38,115.78,66.97,160.59,130.59,91.68,87.59,87.59,66.97
8,2022-05-26,39,114.78,115.78,66.97,160.59,130.59,91.68,87.59,87.59
9,2022-05-27,40,161.58,114.78,115.78,66.97,160.59,130.59,91.68,87.59


In [74]:
# Drop rows where we don’t have a full 7-day price history
df_lags = df_lags.dropna(subset=[f'totalFare_T-{lag}' for lag in range(1, 8)])

# Check shape after dropping NaNs
print("Shape after final lag calculation:", df_lags.shape)

# Display a sample to confirm the data looks correct
print(df_lags[['flightDate', 'daysToDeparture', 'totalFare'] + [f'totalFare_T-{lag}' for lag in range(1, 8)]].head(20))

Shape after final lag calculation: (5290, 25)
   flightDate  daysToDeparture  totalFare  totalFare_T-1  totalFare_T-2  \
7  2022-05-25               38     115.78          66.97         160.59   
8  2022-05-26               39     114.78         115.78          66.97   
9  2022-05-27               40     161.58         114.78         115.78   
10 2022-05-28               41     138.58         161.58         114.78   
11 2022-05-29               42     160.59         138.58         161.58   
12 2022-05-30               43     220.59         160.59         138.58   
13 2022-05-31               44     130.59         220.59         160.59   
14 2022-06-01               45      87.59         130.59         220.59   
15 2022-06-02               46      87.59          87.59         130.59   
16 2022-06-03               47     130.59          87.59          87.59   
17 2022-06-04               48      87.59         130.59          87.59   
18 2022-06-05               49     130.59          87.

In [75]:
# Count the number of unique (startingAirport, destinationAirport, flightDate) groups
num_flights = df_lags.groupby(['startingAirport', 'destinationAirport', 'flightDate']).ngroups

print(f"Number of unique flights in the dataset: {num_flights}")


Number of unique flights in the dataset: 5290
