In [3]:
# flight_price_model_training.py
import os
import joblib
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor  # Ensure xgboost==1.7.6

# üì• Load and clean data
df = pd.read_csv("data/Flight_Price.csv").dropna().drop_duplicates()

# üß† Feature engineering
df['Journey_day'] = pd.to_datetime(df['Date_of_Journey'], dayfirst=True).dt.day
df['Journey_month'] = pd.to_datetime(df['Date_of_Journey'], dayfirst=True).dt.month
df['Dep_Time_hour'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.hour
df['Dep_Time_minute'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.minute
df['Arrival_Time_hour'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M', errors='coerce').dt.hour
df['Arrival_Time_minute'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M', errors='coerce').dt.minute

def duration_to_minutes(duration):
    h, m = 0, 0
    for part in duration.strip().split():
        if 'h' in part:
            h = int(part.replace('h', ''))
        elif 'm' in part:
            m = int(part.replace('m', ''))
    return h * 60 + m

df['Duration_mins'] = df['Duration'].apply(duration_to_minutes)

# üî§ Encode categorical features
le = LabelEncoder()
for col in ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops', 'Additional_Info']:
    df[col] = le.fit_transform(df[col])

# üß™ Define features and target
features = ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops',
            'Journey_day', 'Journey_month', 'Dep_Time_hour', 'Dep_Time_minute',
            'Arrival_Time_hour', 'Arrival_Time_minute', 'Duration_mins']
X = df[features]
y = df['Price']

# ‚úÇÔ∏è Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# üßº Drop NaNs from training data safely
train_data = pd.concat([X_train, y_train], axis=1).dropna()
X_train = train_data[features]
y_train = train_data['Price']

# ü§ñ Models
regressors = {
    "linear_regression": LinearRegression(),
    "random_forest": RandomForestRegressor(),
    "gradient_boosting": GradientBoostingRegressor(),
    "xgboost": XGBRegressor(),
    "knn_regressor": KNeighborsRegressor()
}

# üíæ Save all models
model_dir = Path("flight_price_prediction/models")
model_dir.mkdir(parents=True, exist_ok=True)

for name, model in regressors.items():
    print(f"\nüîß Training and saving: {name} ...")
    model.fit(X_train, y_train)

    model_path = model_dir / f"{name}_compressed.pkl"
    joblib.dump(model, model_path, compress=("xz", 3))

    size_mb = model_path.stat().st_size / (1024 * 1024)
    print(f"‚úÖ Saved {model_path.name} - Size: {size_mb:.2f} MB")
    if size_mb > 25:
        print(f"‚ö†Ô∏è Warning: {model_path.name} exceeds 25MB")



üîß Training and saving: linear_regression ...
‚úÖ Saved linear_regression_compressed.pkl - Size: 0.00 MB

üîß Training and saving: random_forest ...
‚úÖ Saved random_forest_compressed.pkl - Size: 3.82 MB

üîß Training and saving: gradient_boosting ...
‚úÖ Saved gradient_boosting_compressed.pkl - Size: 0.05 MB

üîß Training and saving: xgboost ...
‚úÖ Saved xgboost_compressed.pkl - Size: 0.12 MB

üîß Training and saving: knn_regressor ...
‚úÖ Saved knn_regressor_compressed.pkl - Size: 0.06 MB
