# Modeling

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Ensure plots are displayed within the notebook
%matplotlib inline

In [13]:
# Load the dataset
df = pd.read_csv("data/data_cleaned.csv")

In [14]:
df.head()

Unnamed: 0,AIRLINE,DAY_OF_WEEK,MONTH,DEST,ORIGIN,DEP_HOUR,DISTANCE,AIR_TIME,ELAPSED_TIME,MONTHLY_DELAY_INDICATOR,ROUTE_DELAY_INDICATOR,ARR_DELAY
0,United Air Lines Inc.,2,1,EWR,FLL,11,1065.0,153.0,176.0,-1.0,0.263236,-14.0
1,Spirit Air Lines,6,3,IAH,DEN,15,862.0,110.0,139.0,-0.877541,0.232799,-13.0
2,American Airlines Inc.,4,6,DFW,SLC,6,989.0,127.0,151.0,1.0,0.148767,-11.0
3,American Airlines Inc.,5,11,ORD,DFW,18,801.0,106.0,131.0,-0.814371,0.187652,-9.0
4,Delta Air Lines Inc.,5,6,ATL,DTW,13,594.0,100.0,116.0,1.0,-0.023691,-3.0


In [15]:
df = df.sample(frac=0.01, random_state=42)
print(df.size)

53136


In [16]:
# Separate features and target variable
X = df.drop(columns=['ARR_DELAY'])  # Features
y = df['ARR_DELAY']  # Target variable

# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [18]:
# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=1)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

In [19]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Absolute Error: 19.61242663656885
R-squared: 0.07474442336936693


In [20]:
# Step 1: Calculate the mean of the target variable
mean_arr_delay = df['ARR_DELAY'].mean()

# Step 2: Create predictions based on the mean
df['PREDICTION_SIMPLE_AVG'] = mean_arr_delay

# Step 3: Calculate evaluation metrics
mae_simple_avg = mean_absolute_error(df['ARR_DELAY'], df['PREDICTION_SIMPLE_AVG'])
r2_simple_avg = r2_score(df['ARR_DELAY'], df['PREDICTION_SIMPLE_AVG'])

# Output the results
print(f"Mean Arrival Delay (Simple Average): {mean_arr_delay:.2f}")
print(f"Mean Absolute Error of Simple Average Model: {mae_simple_avg:.2f}")
print(f"R-squared of Simple Average Model: {r2_simple_avg:.2f}")


Mean Arrival Delay (Simple Average): 1.11
Mean Absolute Error of Simple Average Model: 20.20
R-squared of Simple Average Model: 0.00
