In [None]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from catboost import CatBoostRegressor, Pool
import pyarrow.parquet as pq
import warnings
warnings.filterwarnings('ignore')

  import pynvml


In [3]:
fs = s3fs.S3FileSystem()

print("Reading with Polars...")
with fs.open('s3://airlines-sic-v2/cleaned_data/flight_data_processed.parquet', 'rb') as f:
    df = pl.read_parquet(f)
print("Done")

Reading with Polars...
Done


In [4]:
bool_cols = ['isBasicEconomy', 'isRefundable']
for col in bool_cols:
    df = df.with_columns(pl.col(col).cast(pl.Int8))

df_pandas = df.to_pandas()

str_cols = ['startingAirport', 'destinationAirport', 'passenger_type', 'carrier_category']
encoders = {}

for col in str_cols:
    print(f"   -> Encoding {col}...")
    le = LabelEncoder()
    df_pandas[col] = le.fit_transform(df_pandas[col].astype(str))
    encoders[col] = le

print(df_pandas.dtypes)

X = df_pandas.drop(columns=['totalFare'])
y = df_pandas['totalFare']

print("Splitting data")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

   -> Encoding startingAirport...
   -> Encoding destinationAirport...
   -> Encoding passenger_type...
   -> Encoding carrier_category...
startingAirport             int64
destinationAirport          int64
elapsedDays                 int64
isBasicEconomy               int8
isRefundable                 int8
totalFare                 float64
seatsRemaining              int64
totalTravelDistance       float64
num_stops                  uint32
arrival_hour                 int8
departure_hour               int8
advance_purchase_days       int32
passenger_type              int64
cabin_category_encoded      int32
seasonality_encoded         int32
travelDuration_minutes      int32
total_layover_minutes     float64
carrier_category            int64
days_until_flight           int16
flight_weekday               int8
flight_month                 int8
is_weekend                   int8
dtype: object
Splitting data


In [5]:
print("\n1️Training XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    early_stopping_rounds=50
)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
p1 = xgb_model.predict(X_test)
print(f"   -> XGB Score: {r2_score(y_test, p1)*100:.2f}%")


1️Training XGBoost...
   -> XGB Score: 91.23%


In [6]:
print("\nEvaluating Model...")
predictions = xgb_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(f"--------------------------------")
print(f"R2 Score (Accuracy): {r2*100:.2f}%")
print(f"MAE (Average Error): ${mae:.2f}")
print(f"RMSE: ${rmse:.2f}")
print(f"--------------------------------")


Evaluating Model...
--------------------------------
R2 Score (Accuracy): 91.23%
MAE (Average Error): $34.61
RMSE: $51.72
--------------------------------


In [16]:
print("\n2️Training LightGBM...")
lgb_model = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.1,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    n_jobs=-1,
    random_state=42
)
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)] 
)
p2 = lgb_model.predict(X_test)
print(f"   -> LGBM Score: {r2_score(y_test, p2)*100:.2f}%")


2️Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.213118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 6798140, number of used features: 21
[LightGBM] [Info] Start training from score 342.177716
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 3243.27
   -> LGBM Score: 89.36%


In [17]:
print("\nEvaluating Model...")
predictions = lgb_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(f"--------------------------------")
print(f"R2 Score (Accuracy): {r2*100:.2f}%")
print(f"MAE (Average Error): ${mae:.2f}")
print(f"RMSE: ${rmse:.2f}")
print(f"--------------------------------")


Evaluating Model...
--------------------------------
R2 Score (Accuracy): 89.36%
MAE (Average Error): $39.32
RMSE: $56.95
--------------------------------


In [18]:
print("\n3️Training CatBoost...")
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.1,
    depth=10,
    loss_function='RMSE',
    random_seed=42,
    verbose=100,
    allow_writing_files=False
)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50)
p3 = cat_model.predict(X_test)
print(f"   -> CatBoost Score: {r2_score(y_test, p3)*100:.2f}%")


3️Training CatBoost...
0:	learn: 164.1344840	test: 163.9631629	best: 163.9631629 (0)	total: 539ms	remaining: 17m 57s
100:	learn: 80.2591952	test: 80.1968162	best: 80.1968162 (100)	total: 53.5s	remaining: 16m 45s
200:	learn: 73.9791559	test: 73.9247119	best: 73.9247119 (200)	total: 1m 46s	remaining: 15m 49s
300:	learn: 70.3908112	test: 70.3766163	best: 70.3766163 (300)	total: 2m 38s	remaining: 14m 55s
400:	learn: 68.1158894	test: 68.1415982	best: 68.1415982 (400)	total: 3m 31s	remaining: 14m 3s
500:	learn: 66.3448625	test: 66.4133187	best: 66.4133187 (500)	total: 4m 25s	remaining: 13m 12s
600:	learn: 64.9498103	test: 65.0567611	best: 65.0567611 (600)	total: 5m 19s	remaining: 12m 23s
700:	learn: 63.8001864	test: 63.9446031	best: 63.9446031 (700)	total: 6m 13s	remaining: 11m 31s
800:	learn: 62.7874715	test: 62.9658227	best: 62.9658227 (800)	total: 7m 7s	remaining: 10m 40s
900:	learn: 61.9950009	test: 62.2061104	best: 62.2061104 (900)	total: 8m	remaining: 9m 46s
1000:	learn: 61.2952231	te

In [19]:
print("\nEvaluating Model...")
predictions = cat_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(f"--------------------------------")
print(f"R2 Score (Accuracy): {r2*100:.2f}%")
print(f"MAE (Average Error): ${mae:.2f}")
print(f"RMSE: ${rmse:.2f}")
print(f"--------------------------------")


Evaluating Model...
--------------------------------
R2 Score (Accuracy): 89.21%
MAE (Average Error): $39.29
RMSE: $57.36
--------------------------------


In [27]:
w1, w2, w3 = 0.6, 0.25, 0.15

final_preds = (p1 * w1) + (p2 * w2) + (p3 * w3)

ensemble_r2 = r2_score(y_test, final_preds)
ensemble_mae = mean_absolute_error(y_test, final_preds)

print(f"---------------------------------------")
print(f"Ensemble R2 Score: {ensemble_r2*100:.2f}%")
print(f"Ensemble MAE:      ${ensemble_mae:.2f}")

---------------------------------------
Ensemble R2 Score: 90.79%
Ensemble MAE:      $35.70
