In [3]:
import os
import joblib
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Load and clean data
df_flight = pd.read_csv("data/Flight_Price.csv").dropna()
df_flight.drop_duplicates(inplace=True)

# Feature engineering
df_flight['Journey_day'] = pd.to_datetime(df_flight['Date_of_Journey'], dayfirst=True).dt.day
df_flight['Journey_month'] = pd.to_datetime(df_flight['Date_of_Journey'], dayfirst=True).dt.month
df_flight['Dep_Time_hour'] = pd.to_datetime(df_flight['Dep_Time'], format='%H:%M').dt.hour
df_flight['Dep_Time_minute'] = pd.to_datetime(df_flight['Dep_Time'], format='%H:%M').dt.minute
df_flight['Arrival_Time_hour'] = pd.to_datetime(df_flight['Arrival_Time'], format='%H:%M', errors='coerce').dt.hour
df_flight['Arrival_Time_minute'] = pd.to_datetime(df_flight['Arrival_Time'], format='%H:%M', errors='coerce').dt.minute

def duration_to_minutes(duration):
    h, m = 0, 0
    parts = duration.strip().split()
    for part in parts:
        if 'h' in part:
            h = int(part.replace('h', ''))
        elif 'm' in part:
            m = int(part.replace('m', ''))
    return h * 60 + m

df_flight['Duration_mins'] = df_flight['Duration'].apply(duration_to_minutes)

# Label encoding
le = LabelEncoder()
for col in ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops', 'Additional_Info']:
    df_flight[col] = le.fit_transform(df_flight[col])

# Features and target
Xr = df_flight[['Airline', 'Source', 'Destination', 'Route', 'Total_Stops',
                'Journey_day', 'Journey_month', 'Dep_Time_hour', 'Dep_Time_minute',
                'Arrival_Time_hour', 'Arrival_Time_minute', 'Duration_mins']]
yr = df_flight['Price']

# Train-test split
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.2, random_state=42)
Xr_train.dropna(inplace=True)
yr_train = yr_train[Xr_train.index]
Xr_test.dropna(inplace=True)
yr_test = yr_test[Xr_test.index]

# Models
regressors = {
    "linear_regression": LinearRegression(),
    "random_forest": RandomForestRegressor(),
    "gradient_boosting": GradientBoostingRegressor(),
    "xgboost": XGBRegressor(),
    "knn_regressor": KNeighborsRegressor()
}

# Save models (only compressed)
model_dir = Path("flight_price_prediction/models")
model_dir.mkdir(parents=True, exist_ok=True)

for name, model in regressors.items():
    print(f"\n🔧 Training and saving: {name} ...")
    model.fit(Xr_train, yr_train)

    model_path = model_dir / f"{name}_compressed.pkl"
    joblib.dump(model, model_path, compress=("xz", 3))

    size_mb = model_path.stat().st_size / (1024 * 1024)
    print(f"✅ Saved {model_path.name} - Size: {size_mb:.2f} MB")
    if size_mb > 25:
        print(f"⚠️ Warning: {model_path.name} exceeds 25MB")



🔧 Training and saving: linear_regression ...
✅ Saved linear_regression_compressed.pkl - Size: 0.00 MB

🔧 Training and saving: random_forest ...
✅ Saved random_forest_compressed.pkl - Size: 3.82 MB

🔧 Training and saving: gradient_boosting ...
✅ Saved gradient_boosting_compressed.pkl - Size: 0.05 MB

🔧 Training and saving: xgboost ...
✅ Saved xgboost_compressed.pkl - Size: 0.09 MB

🔧 Training and saving: knn_regressor ...
✅ Saved knn_regressor_compressed.pkl - Size: 0.06 MB
