# Lasso Regression Model

## Depedencies + Loading in Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Load in pre-cleaned data set (see the Data Cleaning notebook)
data = pd.read_csv("/work/filtered_data.csv")
data.head()

Unnamed: 0,startingAirport,destinationAirport,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,seatsRemaining,totalTravelDistance,segmentsArrivalAirportCode,...,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,bookingClassCode,daysTillFlight,departureDayOfWeek,departureHour,arrivalDayOfWeek,arrivalHour
0,LAX,ORD,1,False,False,True,236.28,9,1745.0,ORD,...,Boeing 737-900,14640,1745,coach,W,27,5,23,6,6
1,LAX,ORD,0,False,False,True,305.12,6,1745.0,ORD,...,Boeing 757-300,14880,1745,coach,Q,3,1,10,1,16
2,LAX,ORD,0,False,False,True,322.79,5,1745.0,ORD,...,Boeing 737-800,14700,1745,coach,M,1,1,13,1,19
3,LAX,ORD,0,False,False,True,459.53,7,1745.0,ORD,...,Boeing 737-800,14640,1745,coach,H,8,0,7,0,13
4,LAX,DFW,0,False,False,True,252.09,7,1238.0,DFW,...,Airbus A321,10860,1238,coach,L,5,5,18,5,23


## Dummy Encoding Data + Train/Test Split

In [3]:
# Dummy encoding data + train/test split
df_encoded = pd.get_dummies(data)
df_encoded = df_encoded.dropna()
df_encoded = df_encoded.astype('float64')
X = df_encoded.drop('baseFare', axis=1)
y = df_encoded['baseFare']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=142)

## Basic Lasso Model

In [4]:
# Lasso with arbitrarily chosen alpha as 0.1 => corresponds to 0.80 R2 accuracy

from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


In [5]:
# Calculating R2 of Lasso model
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

R² Score: 0.8105480182933995


## Cross-Validated Lasso Model

In [6]:
# Performing cross-validation with the Lasso Model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit LassoCV again on the scaled data
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)
print("Optimal Alpha (scaled):", lasso_cv.alpha_)

Optimal Alpha (scaled): 0.05671289954355441


In [7]:
# Seems like CV-selected alpha achieves the same R² score as the original model with randomly choosing 0.1 as the alpha

lasso_model_CV_alpha = Lasso(alpha=0.05668193869844624)
lasso_model_CV_alpha.fit(X_train, y_train)
y_pred_CV_alpha = lasso_model_CV_alpha.predict(X_test)
r2_CV_alpha = r2_score(y_test, y_pred_CV_alpha)
print(f"R² Score: {r2_CV_alpha}")

  model = cd_fast.enet_coordinate_descent(
R² Score: 0.8143844526404496


## Performance Metrics + Analysis

In [13]:
SSE = np.sum((y_test - y_pred_CV_alpha) ** 2)

# Calculate SST (using the mean of y_train)
SST = np.sum((y_test - np.mean(y_train)) ** 2)

# Compute OSR^2
OSR2 = 1 - SSE / SST

print(f"OSR²: {OSR2}")

OSR²: 0.8143970285952638


In [14]:
rmse = np.sqrt(np.mean((y_test - y_pred_CV_alpha) ** 2))
print("RMSE:", rmse)

RMSE: 62.94605030724557


In [8]:
features = X_train.columns
coefficients = lasso_model.coef_

In [9]:
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': coefficients
})

# Filter dropped features (coefficients that are zero)
dropped_features = feature_importance[feature_importance['Coefficient'] == 0]
print("Dropped Features:")
print(dropped_features)

# Features retained
retained_features = feature_importance[feature_importance['Coefficient'] != 0]
print("\nRetained Features:")
print(retained_features)

Dropped Features:
                                              Feature  Coefficient
0                                         elapsedDays         -0.0
2                                        isRefundable          0.0
3                                           isNonStop          0.0
13                                startingAirport_LAX         -0.0
14                                startingAirport_SFO          0.0
15                             destinationAirport_ATL         -0.0
16                             destinationAirport_BOS         -0.0
17                             destinationAirport_CLT          0.0
18                             destinationAirport_DEN          0.0
20                             destinationAirport_DTW          0.0
21                             destinationAirport_EWR         -0.0
22                             destinationAirport_IAD          0.0
23                             destinationAirport_JFK         -0.0
24                             destinationAi