In [1]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures


In [2]:
### LOAD & PREPROCESS DATA ###
# Read the processed/cleaned parquet file into the working frame `df`.
# The path is relative to the notebook's working directory.
df = pd.read_parquet(path='../data/03_processed/cleaned_data_snappy.parquet')

In [3]:
# IQR-based winsorization helper (outlier capping)
def cap_outliers(series):
    """Clip a numeric series to its Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``series`` and ``IQR = Q3 - Q1``.
    Values outside the fences are replaced by the nearest fence; values inside
    are returned unchanged.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap. The input is not modified.

    Returns
    -------
    pandas.Series
        A clipped copy of ``series``.
    """
    # One quantile call yields both quartiles, in the order requested.
    first_quartile, third_quartile = series.quantile([0.25, 0.75])
    whisker = 1.5 * (third_quartile - first_quartile)
    return np.clip(series, first_quartile - whisker, third_quartile + whisker)

In [4]:
# Columns whose extreme values are capped with the IQR rule.
cap_columns = [
    'totalFare',
    'totalTravelDistance',
    'pricePerMile',
    'daysToDeparture',
    'totalLayoverTime',
    'totalAirtime',
]

In [5]:
# Cap every listed column. Each column's bounds come from its own quartiles,
# so the columns are independent of one another.
# NOTE(review): bounds are computed over ALL rows (including rows that later
# end up in the test split) and the target 'totalFare' is capped as well --
# confirm this is intended.
df = df.assign(**{name: cap_outliers(df[name]) for name in cap_columns})

In [6]:
# Feature Engineering: Calculate Duration-to-Distance Ratio
# (airtime divided by travel distance).
#
# A zero distance produces +/-inf (or NaN for 0/0). Those are mapped to NaN
# and the affected rows are dropped.
#
# The cleaned series is assigned back to the column rather than calling
# `df[col].replace(..., inplace=True)`. That chained in-place call operates
# on a temporary selection: pandas 2.x warns about it, and with Copy-on-Write
# it leaves `df` untouched, so the infinities would survive.
df['durationToDistanceRatio'] = (
    df['totalAirtime'] / df['totalTravelDistance']
).replace([np.inf, -np.inf], np.nan)
df.dropna(subset=['durationToDistanceRatio'], inplace=True)

In [7]:
# Put the target on a log1p scale, i.e. log(1 + fare), for better linearity.
# Every metric computed from this column afterwards is in log units.
# NOTE: the column is overwritten in place, so re-running only this cell
# applies the transform a second time.
df['totalFare'] = df['totalFare'].pipe(np.log1p)

In [8]:
# Predictor columns used by every model below (feature list from the proposal).
# NOTE(review): 'pricePerMile' looks like it may be derived from the fare
# itself; if so it leaks the target into the features -- confirm.
features = [
    'daysToDeparture',
    'pricePerMile',
    'isHoliday',
    'preHolidayFlight',
    'postHolidayFlight',
    'totalLayoverTime',
    'durationToDistanceRatio',
    'cabin_classes_ordinal',
]


In [9]:
# Design matrix (selected predictors) and target (log-transformed fare).
X, y = df[features], df['totalFare']

In [10]:
# Min-Max scaling of the predictors. With MinMaxScaler's default settings
# each column is mapped onto the [0, 1] range.
# Note: the scaler here is fitted on every row of X.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)


In [11]:
# Train-Test Split (80-20)
#
# Split the UNSCALED predictors first, then fit the scaler on the training
# rows only. Splitting the already-scaled matrix (scaler fitted on all rows)
# lets the test set's min/max leak into preprocessing. The same random_state
# and row count give the same row partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `scaler` is rebound to the train-only fit so it matches the models trained
# below. X_train / X_test stay NumPy arrays, as downstream cells expect.
# Test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [12]:
### TRAIN BASELINE LINEAR REGRESSION MODEL ###
# Ordinary least squares baseline; fit() returns the fitted estimator itself.
linear_model = LinearRegression().fit(X_train, y_train)


In [13]:
# Baseline predictions on the held-out rows (same log scale as the target).
y_pred = linear_model.predict(X=X_test)


In [14]:
# Score the baseline on the hold-out split. Both metrics are computed on the
# log-transformed target.
linear_mae = mean_absolute_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)

print("Linear Regression Results:")
print("MAE:", linear_mae)
print("R² Score:", linear_r2)


Linear Regression Results:
MAE: 0.22502562654014166
R² Score: 0.4166798390303552


In [15]:
### APPLY RIDGE & LASSO TO HANDLE MULTICOLLINEARITY ###
# Ridge: alpha is the regularization strength (tuned in a later cell).
# Lasso: its penalty can zero out coefficients, so it also acts as feature selection.
ridge_model, lasso_model = Ridge(alpha=1.0), Lasso(alpha=0.01)


In [16]:
# Fit both regularized models on the same training split.
for estimator in (ridge_model, lasso_model):
    estimator.fit(X_train, y_train)


In [17]:
# Hold-out predictions for Ridge and Lasso, in that order.
y_pred_ridge, y_pred_lasso = (
    fitted.predict(X_test) for fitted in (ridge_model, lasso_model)
)


In [18]:
# Hold-out metrics for Ridge (log-target scale).
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)

print("\nRidge Regression Results:")
print("MAE:", ridge_mae)
print("R² Score:", ridge_r2)



Ridge Regression Results:
MAE: 0.22502553860898475
R² Score: 0.416679126886786


In [19]:
# Hold-out metrics for Lasso (log-target scale).
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)

print("\nLasso Regression Results:")
print("MAE:", lasso_mae)
print("R² Score:", lasso_r2)



Lasso Regression Results:
MAE: 0.2927642322882081
R² Score: 0.11580328993732059


In [20]:
### CROSS-VALIDATION TO COMPARE MODELS ###
# Shuffled 5-fold splitter; the fixed seed keeps the folds reproducible.
kf = KFold(5, shuffle=True, random_state=42)


In [21]:
# Cross-validate on the RAW predictors with scaling done inside a pipeline.
# cross_val_score refits the whole pipeline per fold, so MinMaxScaler only
# sees that fold's training rows. Scoring a matrix that was scaled once on
# all rows would let each validation fold's min/max leak into preprocessing.
# Scores come back as negative MAE, so the sign is flipped when reporting.
ridge_cv_scores = cross_val_score(
    make_pipeline(MinMaxScaler(), ridge_model),
    X, y, cv=kf, scoring='neg_mean_absolute_error',
)
lasso_cv_scores = cross_val_score(
    make_pipeline(MinMaxScaler(), lasso_model),
    X, y, cv=kf, scoring='neg_mean_absolute_error',
)

print("\nCross-Validation Results (MAE):")
print(f"Ridge: {-ridge_cv_scores.mean()}")
print(f"Lasso: {-lasso_cv_scores.mean()}")



Cross-Validation Results (MAE):
Ridge: 0.22509429348596335
Lasso: 0.2925385412438609


In [22]:
from sklearn.model_selection import GridSearchCV

# Grid-search Ridge's alpha with 5-fold CV on the training split, selecting
# by (negative) mean absolute error.
ridge_params = {'alpha': [0.1, 0.5, 1, 5, 10, 50]}
ridge_grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=ridge_params,
    scoring='neg_mean_absolute_error',
    cv=5,
)
ridge_grid.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on the full training split.
best_ridge = ridge_grid.best_estimator_
print("Best Ridge Alpha:", ridge_grid.best_params_)

# Hold-out evaluation of the tuned model.
y_pred_ridge_best = best_ridge.predict(X_test)
tuned_ridge_mae = mean_absolute_error(y_test, y_pred_ridge_best)
tuned_ridge_r2 = r2_score(y_test, y_pred_ridge_best)

print("Tuned Ridge Regression Results:")
print("MAE:", tuned_ridge_mae)
print("R² Score:", tuned_ridge_r2)


Best Ridge Alpha: {'alpha': 10}
Tuned Ridge Regression Results:
MAE: 0.2250251009603355
R² Score: 0.4166714929617633


In [23]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Ridge on the original features plus their pairwise interaction terms.
# interaction_only=True adds no squared terms; include_bias=False adds no
# constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
ridge_poly = make_pipeline(poly, Ridge(alpha=1.0)).fit(X_train, y_train)

# Hold-out evaluation.
y_pred_poly = ridge_poly.predict(X_test)
poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)

print("Ridge (Polynomial Features) Results:")
print("MAE:", poly_mae)
print("R² Score:", poly_r2)


Ridge (Polynomial Features) Results:
MAE: 0.21689291361614324
R² Score: 0.4801224169935113


In [24]:
from sklearn.feature_selection import RFE

# Recursive feature elimination with a Ridge estimator, keeping the top 5 features.
ridge = Ridge(alpha=1.0)
rfe = RFE(estimator=ridge, n_features_to_select=5).fit(X_train, y_train)

# rfe.support_ is a boolean mask over the columns, in the order of `features`.
support_mask = rfe.support_
selected_features = np.array(features)[support_mask]
print("Selected Features:", selected_features)

# Keep only the retained columns in both splits, then refit a fresh Ridge on them.
X_train_selected = X_train[:, support_mask]
X_test_selected = X_test[:, support_mask]

ridge_selected = Ridge(alpha=1.0).fit(X_train_selected, y_train)

# Hold-out evaluation of the reduced model.
y_pred_selected = ridge_selected.predict(X_test_selected)
selected_mae = mean_absolute_error(y_test, y_pred_selected)
selected_r2 = r2_score(y_test, y_pred_selected)

print("Ridge (Selected Features) Results:")
print("MAE:", selected_mae)
print("R² Score:", selected_r2)


Selected Features: ['pricePerMile' 'postHolidayFlight' 'totalLayoverTime'
 'durationToDistanceRatio' 'cabin_classes_ordinal']
Ridge (Selected Features) Results:
MAE: 0.2254298223931492
R² Score: 0.4149892316445053


In [None]:
# Persist the final model to the 'models' folder (relative to the working directory).
# NOTE: despite the file name "linear_regression.pkl", the object written is
# `ridge_selected` -- the Ridge model fitted on the RFE-selected columns --
# not the plain LinearRegression baseline. It expects inputs that are already
# scaled and reduced to those selected columns.

models_dir = "models"
model_filename = os.path.join(models_dir, "linear_regression.pkl")

os.makedirs(models_dir, exist_ok=True)  # create the folder on first run
joblib.dump(ridge_selected, model_filename)
print(f"Model saved as {model_filename}")


In [None]:
# Persist the hold-out split used to evaluate the saved model.
# X_test_selected holds only the RFE-selected (already scaled) columns, so it
# matches the inputs `ridge_selected` was trained on.
lr_test_data_dir = "test_data/LinearRegression"
os.makedirs(lr_test_data_dir, exist_ok=True)  # create the folder on first run

for payload, file_name in (
    (X_test_selected, "X_test_lr.pkl"),
    (y_test, "y_test_lr.pkl"),
):
    joblib.dump(payload, os.path.join(lr_test_data_dir, file_name))
print("Test data saved in 'test_data/LinearRegression/' for Linear Regression.")