# Multiple Models: Separate Random Forest Regressors for ARR_DELAY < 50 and ARR_DELAY >= 50

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# Read the cleaned dataset and keep a reproducible 1% random sample of its rows
# (fixed random_state so the same rows are drawn on every run).
df = pd.read_csv("data/data_cleaned.csv").sample(frac=0.01, random_state=42)

In [3]:
# Step 1: Partition the sampled rows into two subsets around an ARR_DELAY of 50.
# Rows with a NaN ARR_DELAY compare False on both conditions, so they land in
# neither subset.
df_below_50 = df.loc[df['ARR_DELAY'].lt(50)]
df_above_50 = df.loc[df['ARR_DELAY'].ge(50)]

In [4]:
# Step 2: Define a function to train a Random Forest model and evaluate it
def train_and_evaluate_model(data, target_column, test_size=0.2,
                             n_estimators=100, random_state=42):
    """Train a Random Forest regressor on ``data`` and score it on a held-out split.

    Every column other than ``target_column`` is used as a feature. Features are
    one-hot encoded with ``pd.get_dummies(..., drop_first=True)`` before the
    train/test split, so both splits share the same set of dummy columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to model; must contain ``target_column``.
    target_column : str
        Name of the column to predict.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; the default holds out 20% of rows.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed used for both the train/test split and the forest.

    Returns
    -------
    tuple
        ``(r2, mae)``: the R² score and the mean absolute error, both computed
        on the test split.
    """
    features = data.drop(columns=[target_column])
    target = data[target_column]

    # One-hot encode categorical variables
    features = pd.get_dummies(features, drop_first=True)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Initialize and train the Random Forest model
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Score the predictions made on the held-out rows only
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return r2, mae

In [5]:
# Step 3: Fit and score a model on the subset with arrival delays below 50 minutes.
# The header is printed before training starts, the metrics once it finishes.
print("Model evaluation for ARR_DELAY < 50:")
r2_below_50, mae_below_50 = train_and_evaluate_model(
    data=df_below_50,
    target_column='ARR_DELAY',
)
print(f"R²: {r2_below_50:.2f}, MAE: {mae_below_50:.2f}")

# Step 4: Repeat for the subset with arrival delays of 50 minutes or more.
print("Model evaluation for ARR_DELAY >= 50:")
r2_above_50, mae_above_50 = train_and_evaluate_model(
    data=df_above_50,
    target_column='ARR_DELAY',
)
print(f"R²: {r2_above_50:.2f}, MAE: {mae_above_50:.2f}")


Model evaluation for ARR_DELAY < 50:
R²: 0.12, MAE: 11.67
Model evaluation for ARR_DELAY >= 50:
R²: -0.19, MAE: 33.34
