<a href="https://colab.research.google.com/github/ccspen21/greenland-fishery-nowcast-2025/blob/main/model_fitting_diagnostics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas sqlite3 scikit-learn matplotlib
import os
import pandas as pd
import sqlite3
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display

# Ensure compatibility with Colab and GitHub
!apt-get update && apt-get install -y iputils-ping

# Define a configurable database path
DB_PATH = os.getenv("DB_PATH", "greenland_fishery.db")

In [None]:
# Connect to the database
# sqlite3.connect() creates a new, empty database file when the path does not
# exist, so a wrong DB_PATH would only surface later as a "no such table"
# error. Fail early with an actionable message instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"Database file not found at {DB_PATH}. "
        "Run setup_dataset.ipynb first or set the DB_PATH environment variable."
    )

try:
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    print(f"Connected to SQLite database at {DB_PATH}")
except Exception as e:
    # Report the problem in the cell output, then re-raise so execution stops here.
    print(f"Error connecting to database: {e}")
    raise

# Helper function to validate DataFrame against schema
def validate_dataframe(df, expected_columns, dtypes):
    """Validate ``df`` against a simple schema and cast its columns in place.

    Checks run before any casting so that nulls cannot be hidden by the cast
    (``astype(str)`` can turn a null into a literal string) and so that a null
    in an integer column is reported by column name rather than as a cast
    failure.

    Args:
        df: DataFrame to validate. Columns listed in ``dtypes`` are replaced
            in place with their cast versions.
        expected_columns: Column names that must all be present in ``df``.
        dtypes: Mapping of column name -> dtype handed to ``Series.astype``.
            Entries for columns that are not in ``df`` are ignored.

    Raises:
        ValueError: If an expected column is missing, the frame is empty, or
            any cell is null.
    """
    missing = [col for col in expected_columns if col not in df.columns]
    if missing:
        raise ValueError(f"DataFrame missing expected columns: {missing}")

    if df.empty:
        raise ValueError("DataFrame is empty. Ensure setup_dataset.ipynb has populated the database correctly.")

    # Count nulls per column on the raw data and report only offending columns.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    if not null_counts.empty:
        raise ValueError(f"DataFrame contains NaN values in columns: {null_counts.to_dict()}")

    for col, dtype in dtypes.items():
        if col in df.columns:
            df[col] = df[col].astype(dtype)

# Load data from tables with validation
# Schema registry: table name -> (required columns, dtype to cast each column to).
tables = {
    "total_catch": (
        ["Year", "Quarter", "Unit", "Total_Catch"],
        {"Year": int, "Quarter": str, "Unit": str, "Total_Catch": int},
    ),
    "fish_exports": (
        ["Year", "Quarter", "Fish_Export_Value_Million_Kr"],
        {"Year": int, "Quarter": str, "Fish_Export_Value_Million_Kr": int},
    ),
    "sst_west": (
        ["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"],
        {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_West": float, "Melt_Active_West": int, "Melt_Index_West": float},
    ),
    "sst_east": (
        ["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"],
        {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_East": float, "Melt_Active_East": int, "Melt_Index_East": float},
    ),
    "sst_south": (
        ["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"],
        {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_South": float, "Melt_Active_South": int, "Melt_Index_South": float},
    ),
    "foreign_catch": (
        ["Year", "Quarter", "Unit", "Foreign_Catch"],
        {"Year": int, "Quarter": str, "Unit": str, "Foreign_Catch": int},
    ),
}

# Read every registered table in full, validate/cast it, and preview it.
dataframes = {}
for table_name, schema in tables.items():
    required_cols, column_types = schema
    try:
        table_df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
        validate_dataframe(table_df, required_cols, column_types)
        dataframes[table_name] = table_df
        print(f"Loaded {table_name}:")
        display(table_df.head())
    except Exception as e:
        # Name the failing table in the output, then abort so the issue gets fixed.
        print(f"Error loading {table_name}: {e}")
        raise

# Bind the loaded frames to the names the later cells rely on.
(
    df_clean,
    df_fish_clean,
    df_sst_west_clean,
    df_sst_east_clean,
    df_sst_south_clean,
    df_foreign_clean,
) = (
    dataframes[name]
    for name in ("total_catch", "fish_exports", "sst_west", "sst_east", "sst_south", "foreign_catch")
)

# 3-Way Interaction Term

In [None]:
# Add the three-way interaction column (melt-active flag x melt index x sea
# surface temperature) to each regional SST frame, in place.
for region, sst_frame in (
    ("West", df_sst_west_clean),
    ("East", df_sst_east_clean),
    ("South", df_sst_south_clean),
):
    melt_active = sst_frame[f"Melt_Active_{region}"]
    melt_index = sst_frame[f"Melt_Index_{region}"]
    sea_temp = sst_frame[f"Sea_Surface_Temp_C_{region}"]
    sst_frame[f"Melt_SST_Interaction_{region}"] = melt_active * melt_index * sea_temp

# Merged Dataset

In [None]:
# Normalise the join keys on every source frame so the merges below compare
# like with like (Year as int, Quarter as str such as "Q1").
for source_frame in (
    df_clean,
    df_fish_clean,
    df_foreign_clean,
    df_sst_west_clean,
    df_sst_east_clean,
    df_sst_south_clean,
):
    source_frame["Year"] = source_frame["Year"].astype(int)
    source_frame["Quarter"] = source_frame["Quarter"].astype(str)

# Start from a copy of the total-catch table and inner-join the export values.
df_merged_with_interactions = df_clean.copy().merge(
    df_fish_clean, on=["Year", "Quarter"], how="inner"
)

# Inner-join the interaction column of each SST region (West, East, South in that order).
for sst_frame, interaction_col in (
    (df_sst_west_clean, "Melt_SST_Interaction_West"),
    (df_sst_east_clean, "Melt_SST_Interaction_East"),
    (df_sst_south_clean, "Melt_SST_Interaction_South"),
):
    df_merged_with_interactions = df_merged_with_interactions.merge(
        sst_frame[["Year", "Quarter", interaction_col]],
        on=["Year", "Quarter"],
        how="inner",
    )

# Left-join foreign catch without its Unit column; quarters with no match get
# NaN here and are removed by the dropna further down.
foreign_without_unit = df_foreign_clean.drop(columns=["Unit"])
df_merged_with_interactions = df_merged_with_interactions.merge(
    foreign_without_unit, on=["Year", "Quarter"], how="left"
)

# Remove every remaining column whose name contains "Unit".
unit_columns = [name for name in df_merged_with_interactions.columns if "Unit" in name]
df_merged_with_interactions = df_merged_with_interactions.drop(columns=unit_columns, errors="ignore")

# Drop incomplete rows and report how many were lost.
rows_before_dropna = len(df_merged_with_interactions)
df_merged_with_interactions = df_merged_with_interactions.dropna()
rows_removed = rows_before_dropna - len(df_merged_with_interactions)
if rows_removed > 0:
    print(f"Dropped {rows_removed} rows due to missing values after merging")

# Make Quarter an ordered categorical so sorting (and later "<" comparisons)
# follow Q1 < Q2 < Q3 < Q4, then order the rows chronologically.
quarter_order = ["Q1", "Q2", "Q3", "Q4"]
df_merged_with_interactions["Quarter"] = pd.Categorical(
    df_merged_with_interactions["Quarter"], categories=quarter_order, ordered=True
)
df_merged_with_interactions = (
    df_merged_with_interactions
    .sort_values(by=["Year", "Quarter"])
    .reset_index(drop=True)
)

print("✅ Final merged dataset shape:", df_merged_with_interactions.shape)
display(df_merged_with_interactions.head())

# LASSO Regression

In [None]:
# Step 1: Pick the raw predictors -- every column that is not a key, not the
# target, and not already one of the generated lag columns.
lag_suffixes = ("_lag1", "_lag2", "_lag3", "_lag4")
non_predictor_cols = ["Year", "Quarter", "Total_Catch"]
predictor_cols = [
    name
    for name in df_merged_with_interactions.columns
    if name not in non_predictor_cols and not name.endswith(lag_suffixes)
]

print("🔍 Predictors to lag:", predictor_cols)

# Steps 2-3: Add lags 1-4 for each predictor, then for the target itself.
# shift() moves values by row position, so this relies on the frame being
# sorted chronologically.
# NOTE(review): assumes one row per consecutive quarter with no gaps -- confirm.
for base_col in predictor_cols + ["Total_Catch"]:
    for lag in (1, 2, 3, 4):
        df_merged_with_interactions[f"{base_col}_lag{lag}"] = (
            df_merged_with_interactions[base_col].shift(lag)
        )

# Steps 4-5: Keep only lagged predictors (drop the contemporaneous ones) and
# discard the leading rows whose lags are undefined.
df_model_with_interactions = (
    df_merged_with_interactions
    .drop(columns=predictor_cols)
    .dropna()
    .reset_index(drop=True)
)

# Step 6: Split into target and design matrix.
y = df_model_with_interactions["Total_Catch"]
X = df_model_with_interactions.drop(columns=["Year", "Quarter", "Total_Catch"])

# ✅ Final shape check
print("✅ Clean setup with lags 1–4 — X shape:", X.shape, "y shape:", y.shape)

# Selected Variables

In [None]:
# Step 1: Build the pipeline -- standardise features, then LASSO with a
# cross-validated penalty.
# NOTE(review): an integer cv means plain K-fold; for time-ordered rows a
# forward-chaining splitter may be more appropriate -- confirm intent.
cv_lasso = LassoCV(cv=5, random_state=42, max_iter=50000)
lasso_pipeline = make_pipeline(StandardScaler(), cv_lasso)

# Step 2: Fit on the full lagged design matrix
lasso_pipeline.fit(X, y)

# Step 3: Label each coefficient with its feature name. The coefficients are
# on the standardised feature scale because of the StandardScaler step.
lasso_model = lasso_pipeline.named_steps["lassocv"]
coef = pd.Series(lasso_model.coef_, index=X.columns)

# Step 4: Report the variables LASSO kept (non-zero coefficients)
selected = coef.loc[coef != 0]
print("✅ Selected variables with 4 lags:\n", selected)

# Step 5: Horizontal bar chart of every coefficient with a zero reference line
fig, ax = plt.subplots(figsize=(12, 6))
coef.plot(kind='barh', ax=ax)
ax.set_title("LASSO Coefficients (with lags 1–4)")
ax.axvline(0, color='gray', linestyle='--')
fig.tight_layout()
plt.show()


# **Evaluation**

# LASSO Coefficients

In [None]:
# Extract fitted model from pipeline
lasso_model = lasso_pipeline.named_steps['lassocv']

# Variables to report (the subset used by the backtest models)
feature_names = [
    "Total_Catch_lag4",
    "Foreign_Catch_lag4",
    "Melt_SST_Interaction_West_lag4",
    "Melt_SST_Interaction_East_lag1",
    "Fish_Export_Value_Million_Kr_lag2"
]

# coef_ holds one entry per column of X (every lagged feature), so it cannot
# be paired positionally with the five names above: the lengths differ and the
# order would not match. Label the coefficients by X.columns first, then
# select the reported features by name.
all_coefficients = pd.Series(lasso_model.coef_, index=X.columns)

missing_features = [name for name in feature_names if name not in all_coefficients.index]
if missing_features:
    raise KeyError(f"Features not present in the fitted design matrix: {missing_features}")

# Create a DataFrame of coefficients (standardised-feature scale, because the
# pipeline scales X before the LASSO step)
coef_df = all_coefficients.loc[feature_names].to_frame(name="LASSO")

# Round for cleaner display
coef_df = coef_df.round(0).astype(int)

# Display
display(coef_df)


# Out-of-Sample Performance - Backtests

# Q4 2024

In [None]:
# Features used by the Q4 2024 backtest model (same order for fit and predict)
feature_cols_q4 = [
    "Total_Catch_lag4",
    "Melt_SST_Interaction_West_lag4",
    "Foreign_Catch_lag4",
    "Melt_SST_Interaction_East_lag1",
    "Fish_Export_Value_Million_Kr_lag2"
]

# Step 1: Filter training data (up to Q3 2024 only)
# Quarter is an ordered categorical, so "<" follows Q1 < Q2 < Q3 < Q4.
train_q4 = df_model_with_interactions[
    (df_model_with_interactions["Year"] < 2024) |
    ((df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] < "Q4"))
]

X_train_q4 = train_q4[feature_cols_q4]

y_train_q4 = train_q4["Total_Catch"]
print("✅ Training set size for Q4 2024 (Top 5 Vars):", X_train_q4.shape)

# Step 2: Create nowcast input row for Q4 2024
# Confirm the preceding quarter is present (the most recent observation
# available at nowcast time).
try:
    latest_row_q3 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q3")
    ].iloc[0]
except IndexError:
    print("Error: Q3 2024 data not found. Ensure periodic_update.ipynb has been run to fetch the latest data.")
    raise

# Each *_lagK column of a row already holds the value observed K quarters
# before that row's own quarter, so the Q4 2024 row carries exactly the inputs
# available for nowcasting Q4 2024. Reading the lag columns from earlier rows
# would apply the lag twice, and positional offsets from the end of the table
# break as soon as newer quarters are appended.
try:
    target_row_q4 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q4")
    ].iloc[0]
except IndexError:
    print("Error: Q4 2024 row not found. Cannot build nowcast inputs.")
    raise

X_nowcast_q4 = pd.DataFrame([{col: target_row_q4[col] for col in feature_cols_q4}])

print("✅ Nowcast input row for Q4 2024 (5 vars):")
display(X_nowcast_q4)

# Step 3: Fit LASSO model
lasso_pipeline_q4 = make_pipeline(
    StandardScaler(),
    Lasso(alpha=1.0, max_iter=10000)
)

lasso_pipeline_q4.fit(X_train_q4, y_train_q4)

# Step 4: Predict Q4 2024
y_pred_q4_2024_lasso = lasso_pipeline_q4.predict(X_nowcast_q4)[0]
print(f"📈 🧪 Nowcast for Q4 2024 (LASSO, Fish Export included): {round(y_pred_q4_2024_lasso):,.0f} tons")

# Step 5: Compare with actual
try:
    actual_q4 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q4")
    ]["Total_Catch"].values[0]
except IndexError:
    print("Error: Q4 2024 actual data not found. Cannot compute forecast error.")
    raise

error_q4 = y_pred_q4_2024_lasso - actual_q4
print(f"🎯 Actual Q4 2024: {round(actual_q4):,.0f} tons")
print(f"🔍 Forecast Error: {round(error_q4):,.0f} tons ({round(100 * error_q4 / actual_q4, 1)}%)")

# Q3 2024

In [None]:
# Features used by the Q3 2024 backtest model (same order for fit and predict)
feature_cols_q3 = [
    "Total_Catch_lag4",
    "Melt_SST_Interaction_West_lag4",
    "Foreign_Catch_lag4",
    "Melt_SST_Interaction_East_lag1",
    "Fish_Export_Value_Million_Kr_lag2"
]

# Step 1: Filter training data up to Q2 2024 (exclude Q3)
# Quarter is an ordered categorical, so "<" follows Q1 < Q2 < Q3 < Q4.
train_q3 = df_model_with_interactions[
    (df_model_with_interactions["Year"] < 2024) |
    ((df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] < "Q3"))
]

X_train_q3 = train_q3[feature_cols_q3]
y_train_q3 = train_q3["Total_Catch"]
print("✅ Training set size for Q3 2024 (5 vars):", X_train_q3.shape)

# Step 2: Create nowcast input row for Q3 2024
# Confirm the preceding quarter is present (the most recent observation
# available at nowcast time).
try:
    latest_row_q2 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q2")
    ].iloc[0]
except IndexError:
    print("Error: Q2 2024 data not found. Ensure periodic_update.ipynb has been run to fetch the latest data.")
    raise

# Each *_lagK column of a row already holds the value observed K quarters
# before that row's own quarter, so the Q3 2024 row carries exactly the inputs
# available for nowcasting Q3 2024. Reading the lag columns from earlier rows
# would apply the lag twice, and positional offsets from the end of the table
# break as soon as newer quarters are appended.
try:
    target_row_q3 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q3")
    ].iloc[0]
except IndexError:
    print("Error: Q3 2024 row not found. Cannot build nowcast inputs.")
    raise

X_nowcast_q3 = pd.DataFrame([{col: target_row_q3[col] for col in feature_cols_q3}])

print("✅ Nowcast input row for Q3 2024 (5 vars):")
display(X_nowcast_q3)

# Step 3: Fit LASSO model
lasso_pipeline_q3 = make_pipeline(
    StandardScaler(),
    Lasso(alpha=1.0, max_iter=10000)
)

lasso_pipeline_q3.fit(X_train_q3, y_train_q3)

# Step 4: Predict Q3 2024
y_pred_q3_2024_lasso = lasso_pipeline_q3.predict(X_nowcast_q3)[0]
print(f"📈 🧪 Nowcast for Q3 2024 (LASSO, Fish Export included): {round(y_pred_q3_2024_lasso):,.0f} tons")

# Step 5: Compare with actual
try:
    actual_q3 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] == "Q3")
    ]["Total_Catch"].values[0]
except IndexError:
    print("Error: Q3 2024 actual data not found. Cannot compute forecast error.")
    raise

error_q3 = y_pred_q3_2024_lasso - actual_q3
print(f"🎯 Actual Q3 2024: {round(actual_q3):,.0f} tons")
print(f"🔍 Forecast Error: {round(error_q3):,.0f} tons ({round(100 * error_q3 / actual_q3, 1)}%)")

# Q2 2024

In [None]:
# Features used by the Q2 2024 backtest model (same order for fit and predict)
q2_vars = [
    "Total_Catch_lag4",
    "Foreign_Catch_lag4",
    "Melt_SST_Interaction_West_lag4",
    "Melt_SST_Interaction_East_lag1",
    "Fish_Export_Value_Million_Kr_lag2"
]

# Step 1: Training data up to Q1 2024 (exclude Q2)
# Quarter is an ordered categorical, so "<" follows Q1 < Q2 < Q3 < Q4.
train_q2 = df_model_with_interactions[
    (df_model_with_interactions["Year"] < 2024) |
    ((df_model_with_interactions["Year"] == 2024) & (df_model_with_interactions["Quarter"] < "Q2"))
]

X_train_q2 = train_q2[q2_vars]
y_train_q2 = train_q2["Total_Catch"]

print("✅ Training set size for Q2 2024 (LASSO):", X_train_q2.shape)

# Step 2: Create nowcast input row for Q2 2024
# Confirm the preceding quarter is present (the most recent observation
# available at nowcast time).
try:
    latest_row_q1 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) &
        (df_model_with_interactions["Quarter"] == "Q1")
    ].iloc[0]
except IndexError:
    print("Error: Q1 2024 data not found. Ensure periodic_update.ipynb has been run to fetch the latest data.")
    raise

# Each *_lagK column of a row already holds the value observed K quarters
# before that row's own quarter, so the Q2 2024 row carries exactly the inputs
# available for nowcasting Q2 2024. Reading the lag columns from earlier rows
# would apply the lag twice, and positional offsets from the end of the table
# break as soon as newer quarters are appended.
try:
    target_row_q2 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) &
        (df_model_with_interactions["Quarter"] == "Q2")
    ].iloc[0]
except IndexError:
    print("Error: Q2 2024 row not found. Cannot build nowcast inputs.")
    raise

X_nowcast_q2 = pd.DataFrame([{col: target_row_q2[col] for col in q2_vars}])

print("✅ Nowcast input for Q2 2024 (LASSO):")
display(X_nowcast_q2)

# Step 3: Fit LASSO pipeline
lasso_pipeline_q2 = make_pipeline(
    StandardScaler(),
    Lasso(alpha=1.0, max_iter=10000)
)

lasso_pipeline_q2.fit(X_train_q2, y_train_q2)
y_pred_q2_2024_lasso = lasso_pipeline_q2.predict(X_nowcast_q2)[0]

# Step 4: Compare with actual
try:
    actual_q2 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) &
        (df_model_with_interactions["Quarter"] == "Q2")
    ]["Total_Catch"].values[0]
except IndexError:
    print("Error: Q2 2024 actual data not found. Cannot compute forecast error.")
    raise

error_q2_lasso = y_pred_q2_2024_lasso - actual_q2
print(f"📊 LASSO Nowcast for Q2 2024: {round(y_pred_q2_2024_lasso):,.0f} tons")
print(f"🎯 Actual Q2 2024: {round(actual_q2):,.0f} tons")
print(f"🔍 Forecast Error: {round(error_q2_lasso):,.0f} tons ({round(100 * error_q2_lasso / actual_q2, 1)}%)")

# Q1 2024

In [None]:
# Features used by the Q1 2024 backtest model (same order for fit and predict)
feature_cols_q1 = [
    "Total_Catch_lag4",
    "Foreign_Catch_lag4",
    "Melt_SST_Interaction_West_lag4",
    "Melt_SST_Interaction_East_lag1",
    "Fish_Export_Value_Million_Kr_lag2"
]

# Step 1: Training data up to Q4 2023 (exclude Q1 2024)
train_q1 = df_model_with_interactions[
    (df_model_with_interactions["Year"] < 2024)
]

X_train_q1 = train_q1[feature_cols_q1]
y_train_q1 = train_q1["Total_Catch"]

print("✅ Training set size for Q1 2024 (LASSO):", X_train_q1.shape)

# Step 2: Create nowcast input row for Q1 2024
# Confirm the preceding quarter is present (the most recent observation
# available at nowcast time).
try:
    latest_row_q4_2023 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2023) &
        (df_model_with_interactions["Quarter"] == "Q4")
    ].iloc[0]
except IndexError:
    print("Error: Q4 2023 data not found. Ensure setup_dataset.ipynb or periodic_update.ipynb has been run to fetch the latest data.")
    raise

# Each *_lagK column of a row already holds the value observed K quarters
# before that row's own quarter, so the Q1 2024 row carries exactly the inputs
# available for nowcasting Q1 2024. Reading the lag columns from earlier rows
# would apply the lag twice, and positional offsets from the end of the table
# break as soon as newer quarters are appended.
try:
    target_row_q1 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) &
        (df_model_with_interactions["Quarter"] == "Q1")
    ].iloc[0]
except IndexError:
    print("Error: Q1 2024 row not found. Cannot build nowcast inputs.")
    raise

X_nowcast_q1 = pd.DataFrame([{col: target_row_q1[col] for col in feature_cols_q1}])

print("✅ Nowcast input for Q1 2024 (LASSO):")
display(X_nowcast_q1)

# Step 3: Fit LASSO model
lasso_pipeline_q1 = make_pipeline(
    StandardScaler(),
    Lasso(alpha=1.0, max_iter=10000)
)

lasso_pipeline_q1.fit(X_train_q1, y_train_q1)
y_pred_q1_2024_lasso = lasso_pipeline_q1.predict(X_nowcast_q1)[0]

# Step 4: Compare with actual
try:
    actual_q1 = df_model_with_interactions[
        (df_model_with_interactions["Year"] == 2024) &
        (df_model_with_interactions["Quarter"] == "Q1")
    ]["Total_Catch"].values[0]
except IndexError:
    print("Error: Q1 2024 actual data not found. Cannot compute forecast error.")
    raise

error_q1_lasso = y_pred_q1_2024_lasso - actual_q1
print(f"📊 LASSO Nowcast for Q1 2024: {round(y_pred_q1_2024_lasso):,.0f} tons")
print(f"🎯 Actual Q1 2024: {round(actual_q1):,.0f} tons")
print(f"🔍 Forecast Error: {round(error_q1_lasso):,.0f} tons ({round(100 * error_q1_lasso / actual_q1, 1)}%)")

# MAE for Backtests

In [None]:
# Absolute error of each single-quarter backtest (Q1 through Q4 2024).
# With one forecast per quarter, each "MAE" is simply |prediction - actual|.
mae_q1_lasso, mae_q2_lasso, mae_q3_lasso, mae_q4_lasso = (
    abs(predicted - observed)
    for predicted, observed in (
        (y_pred_q1_2024_lasso, actual_q1),
        (y_pred_q2_2024_lasso, actual_q2),
        (y_pred_q3_2024_lasso, actual_q3),
        (y_pred_q4_2024_lasso, actual_q4),
    )
)

# Mean of the four quarterly absolute errors
quarterly_abs_errors = (mae_q1_lasso, mae_q2_lasso, mae_q3_lasso, mae_q4_lasso)
avg_mae_lasso = sum(quarterly_abs_errors) / len(quarterly_abs_errors)

# Print the comparison table
print("📊 MAE Comparison Table (Q1–Q4 2024 Backtests)")
print("------------------------------------------------")
print(f"Q1 2024 MAE - LASSO:        {mae_q1_lasso:,.0f}")
print(f"Q2 2024 MAE - LASSO:        {mae_q2_lasso:,.0f}")
print(f"Q3 2024 MAE - LASSO:        {mae_q3_lasso:,.0f}")
print(f"Q4 2024 MAE - LASSO:        {mae_q4_lasso:,.0f}")
print("------------------------------------------------")
print(f"Avg MAE     - LASSO:        {avg_mae_lasso:,.0f}")

# In-Sample Performance

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Predict on the same rows the Q1 2024 backtest model was trained on
# (all quarters before 2024) to measure in-sample fit.
y_pred_lasso_train = lasso_pipeline_q1.predict(X_train_q1)

# Mean absolute error between the training targets and those fitted values
in_sample_mae_lasso = mean_absolute_error(y_train_q1, y_pred_lasso_train)

print("🔍 LASSO In-Sample Performance:")
print("MAE:", in_sample_mae_lasso)

Given the nature of fish catch in Greenland, which is heavily influenced by policy shocks, quota trading, and environmental volatility, MAE is a more appropriate metric than RMSE or R². It offers a stable, interpretable measure of predictive accuracy that reflects the average deviation in real-world units (tons), without over-penalizing policy-driven anomalies.

In [None]:
# Close the database connection
try:
    conn.close()
    print("Database connection closed.")
except Exception as e:
    print(f"Error closing database connection: {e}")