In [1]:
#Import libraries
import pandas as pd           ##Pandas/Numpy:- Data manipulation
import numpy as np              

In [None]:
from sklearn.model_selection import TimeSeriesSplit      ##TimeSeriesSplit:- time aware splitting
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor                         ##Strong for non- linear pricing patterns

In [None]:
# Load the flight price data; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="flight_prices.csv")

In [None]:
# Parse both date columns into datetimes so the `.dt` accessor and the date
# arithmetic used by the later feature cells work.
for date_col in ("timestamp", "departure_date"):
    df[date_col] = pd.to_datetime(df[date_col])

# Order rows by route, then chronologically within each route: the lag and
# rolling features built later assume consecutive rows of a route are
# consecutive observations in time.
sort_keys = ["route_id", "timestamp"]
df = df.sort_values(by=sort_keys)
df = df.reset_index(drop=True)

In [None]:
#Create basic time features

In [None]:
# Calendar features derived from the observation timestamp.
observed_at = df["timestamp"].dt
df["hour"] = observed_at.hour
df["day_of_week"] = observed_at.weekday
df["week_of_year"] = observed_at.isocalendar()["week"].astype(int)

# Booking horizon: whole days between the observation and the departure
# (VERY important in flight pricing).
lead_time = df["departure_date"].sub(df["timestamp"])
df["days_to_departure"] = lead_time.dt.days

# Rationale: airlines price differently by time of day/week and by booking
# horizon, so these columns let the model learn seasonality and urgency pricing.

In [None]:
#Create lag features (Past Memory)

In [None]:
# Lagged prices expose each route's recent price history to the model.
lags = [1, 2, 6, 12, 24]

# Group once and reuse: every lag shifts the same per-route price series, so
# a lag never reaches back into a different route.
price_by_route = df.groupby("route_id")["price"]
for lag in lags:
    lag_col = f"price_lag_{lag}"
    df[lag_col] = price_by_route.shift(lag)

# A supervised model has no memory of earlier rows; explicit lag columns turn
# the time series into a tabular supervised-learning format.


In [None]:
# Rolling windows, counted in rows (i.e. hours only if the data is hourly).
windows = [6, 12, 24]

# BUG FIX: `groupby(...).shift(1)` returns a plain (ungrouped) Series, so
# chaining `.rolling()` onto it slid the window over the whole frame and mixed
# the last prices of one route into the first windows of the next route.
# Running shift + rolling inside `transform` keeps every window within a
# single route; the first `w` rows of each route are NaN as a result.
price_by_route = df.groupby("route_id")["price"]

for w in windows:
    # shift(1) removes the current row's price from its own window, so each
    # statistic only summarises prices observed strictly before this row.
    df[f"rolling_mean_{w}"] = price_by_route.transform(
        lambda s: s.shift(1).rolling(window=w).mean()
    )

    df[f"rolling_std_{w}"] = price_by_route.transform(
        lambda s: s.shift(1).rolling(window=w).std()
    )

# Why shift(1)? It keeps the rolling statistics strictly historical, so they
# cannot leak the value of the row they are attached to.

In [None]:
### Create price change dynamics
# Relative change versus the previous observation of the same route.
df["pct_change_1"] = (
    df.groupby("route_id")["price"].pct_change()
)

# 1 when the price moved at all, else 0. A NaN change (e.g. the first row of
# a route) compares as False and therefore becomes 0.
df["price_change_flag"] = (
    df["pct_change_1"].abs() > 0
).astype(int)

# Number of price moves in the 6 rows preceding the current one.
# BUG FIX: the previous `groupby(...).shift(1).rolling(6)` rolled over the
# ungrouped result of `shift`, so counts at the start of a route included
# flags from the end of the previous route. Shift + rolling now run per route
# inside `transform`.
df["price_change_count_6h"] = (
    df.groupby("route_id")["price_change_flag"]
    .transform(lambda flags: flags.shift(1).rolling(6).sum())
)


### Volatility is more about movement frequency than absolute price.
# This captures how aggressively a route is being repriced.

In [None]:
# Create target variable (future volatility)
# Target = standard deviation of the NEXT 24 prices of the same route
# (i.e. the next 24 hours if the data is hourly).


def _forward_std(prices, horizon=24):
    """Return, for each row, the std of the `horizon` prices that follow it.

    `rolling` only looks backwards, so the series is reversed, rolled, and
    reversed again: that yields std(p[t .. t+horizon-1]) at row t. The final
    `shift(-1)` moves the window one step ahead to p[t+1 .. t+horizon], which
    excludes the current price. Rows without `horizon` future observations
    are NaN.
    """
    reversed_std = prices.iloc[::-1].rolling(horizon).std()
    return reversed_std.iloc[::-1].shift(-1)


# BUG FIX: the previous `shift(-1).rolling(24).std()` was a backward-looking
# window over rows t-22 .. t+1 (almost entirely PAST prices) and, because
# `shift` returns an ungrouped Series, it also ran across route boundaries.
# The target is now forward-looking and computed separately for every route.
df["target_vol_24h"] = (
    df.groupby("route_id")["price"].transform(_forward_std)
)



In [None]:
# Drop invalid rows (required)
# Keep only rows in which every column is populated; the lag, rolling and
# target columns are NaN near the edges of each series and cannot be used.
complete_rows = df.notna().all(axis=1)
df_model = df.loc[complete_rows].reset_index(drop=True)

In [None]:
### Train/test split (time aware)

# Re-sort chronologically so a positional cut separates past from future.
df_model = df_model.sort_values(by="timestamp")

# Every column is a feature except identifiers, raw dates and the target.
non_feature_cols = {"timestamp", "route_id", "target_vol_24h", "departure_date"}
features = [name for name in df_model.columns if name not in non_feature_cols]

X, y = df_model[features], df_model["target_vol_24h"]

# Earliest 80% of rows for training, most recent 20% held out for testing.
split_idx = int(0.8 * len(df_model))

X_train = X.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test = y.iloc[split_idx:]

# Why no random split? Future data must never influence predictions about
# the past, so the split follows time order instead of shuffling.

In [None]:
#Train XGBoost model

In [None]:
# Hyperparameters for the gradient-boosted regressor; the fixed seed makes
# the row/column subsampling repeatable.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)

model.fit(X_train, y_train)

### XGBoost handles: 1. Non-linearity 2. Feature interactions 3. Sparse signals (common in pricing)

In [None]:
# Evaluate model

preds = model.predict(X_test)

# BUG FIX: `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError. Taking
# the square root of the MSE gives the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
