In [67]:
# ==========================================
# Investigation A — Revisited (Linear Regression with Feature Engineering)
# ==========================================

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --------------------------------------------------
# 1. Load datasets
# --------------------------------------------------
df1 = pd.read_csv("datasets/dataset1.csv")
df2 = pd.read_csv("datasets/dataset2.csv")

# Optional: merge datasets if a key exists (you can edit this line if you know the key)
# For now, we use df1 for Investigation A
#df = pd.concat([df1, df2], ignore_index=True)
df = df1.copy()

# Output directory
OUT = Path("outputs")
OUT.mkdir(exist_ok=True)

# --------------------------------------------------
# 2. Select Response Variable
# --------------------------------------------------
# Choose the main behaviour-related continuous variable
response = "bat_landing_to_food"
assert response in df.columns, f"{response} not found in dataset."

# --------------------------------------------------
# 3. Feature Engineering
# --------------------------------------------------
# Drop rows with missing response
df = df.dropna(subset=[response]).copy()

df['rat_present'] = np.nan
if 'seconds_after_rat_arrival' in df1.columns:
    df1.loc[df1['seconds_after_rat_arrival'].notna(), 'rat_present'] = (df1.loc[df1['seconds_after_rat_arrival'].notna(), 'seconds_after_rat_arrival'] >= 0).astype(int)
if set(['start_time','rat_period_start','rat_period_end']).issubset(df1.columns):
    mask = df1['rat_present'].isna() & df1['start_time'].notna() & df1['rat_period_start'].notna() & df1['rat_period_end'].notna()
    df1.loc[mask, 'rat_present'] = ((df1.loc[mask,'start_time'] >= df1.loc[mask,'rat_period_start']) & (df1.loc[mask,'start_time'] <= df1.loc[mask,'rat_period_end'])).astype(int)
df['rat_present'] = df1['rat_present'].fillna(0).astype(int)

# Basic variable transformations
if 'hours_after_sunset' in df.columns:
    df['hours_after_sunset_sq'] = df['hours_after_sunset']**2

if 'seconds_after_rat_arrival' in df.columns and 'hours_after_sunset' in df.columns:
    df['interaction_sec_x_hours'] = df['seconds_after_rat_arrival'] * df['hours_after_sunset']
df['risk_reward_interaction'] = df['risk'] * df['reward']


df['efficiency_ratio'] = df['reward'] / (df['bat_landing_to_food'] + 1)
df['log_seconds_after_rat_arrival'] = np.log1p(df['seconds_after_rat_arrival'])
# Convert categorical columns


df['hours_after_sunset_sq'] = df['hours_after_sunset'] ** 2
df['log_seconds'] = np.log1p(df['seconds_after_rat_arrival'])
df['interaction_rat_risk'] = df['rat_present'] * df['risk']
df['interaction_time_risk'] = df['hours_after_sunset'] * df['risk']
df['rat_time_effect'] = df['rat_present'] * df['seconds_after_rat_arrival']
df['reward_to_risk_ratio'] = df['reward'] / (df['risk'] + 1)
df['nocturnal_index'] = np.sin((df['hours_after_sunset'] / 24) * 2 * np.pi)


# Log-transform skewed numeric variables
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    if (df[col] > 0).all():  # log1p safe only for positive
        if abs(df[col].skew()) > 1:
            df[f'{col}_log1p'] = np.log1p(df[col])

# Polynomial features (non-linear transformation)
poly_features = []
if 'hours_after_sunset' in df.columns:
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_df = pd.DataFrame(poly.fit_transform(df[['hours_after_sunset']]),
                           columns=['hours_after_sunset', 'hours_after_sunset_sq'])
    df = pd.concat([df, poly_df], axis=1)
    poly_features = poly_df.columns.tolist()

# --------------------------------------------------
# 4. Predictor Selection
# --------------------------------------------------
predictors = ["log_seconds_after_rat_arrival","hours_after_sunset_sq","risk_reward_interaction", "efficiency_ratio", 'interaction_rat_risk','reward_to_risk_ratio','nocturnal_index'  ]
print("Initial numeric predictors:", predictors)

# --------------------------------------------------
# 5. Multicollinearity Check and Reduction (VIF)
# --------------------------------------------------
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Initial model predictors
X = df[predictors].select_dtypes(include=[np.number]).copy().fillna(0)
y = df[response]

# Remove high-VIF features (>10)
vif = calculate_vif(X)
while vif["VIF"].max() > 10:
    drop_col = vif.sort_values("VIF", ascending=False)["feature"].iloc[0]
    print(f"Dropping '{drop_col}' due to high VIF ({vif['VIF'].max():.2f})")
    X = X.drop(columns=[drop_col])
    vif = calculate_vif(X)

print("\nFinal predictors after VIF filtering:")
print(list(X.columns))

# --------------------------------------------------
# 6. Fit Linear Regression (OLS)
# --------------------------------------------------
X_const = sm.add_constant(X)
ols_model = sm.OLS(y, X_const).fit()
print(ols_model.summary())

# Save model summary
with open(OUT / "InvestigationA_OLS_summary.txt", "w") as f:
    f.write(ols_model.summary().as_text())

# --------------------------------------------------
# 7. Diagnostics
# --------------------------------------------------
residuals = ols_model.resid
fitted = ols_model.fittedvalues

# Residuals vs Fitted
plt.figure(figsize=(6,4))
sns.scatterplot(x=fitted, y=residuals, alpha=0.6)
plt.axhline(0, color='r', linestyle='--')
plt.title("Investigation A: Residuals vs Fitted")
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.tight_layout()
plt.savefig(OUT / "A_resid_vs_fitted.png")
plt.close()

# QQ plot
sm.qqplot(residuals, line='45')
plt.title("Investigation A: QQ Plot")
plt.tight_layout()
plt.savefig(OUT / "A_qq.png")
plt.close()

# --------------------------------------------------
# 8. Train-Test Evaluation
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)


print(r2)
print("\nEvaluation Metrics:")
print(f"Test R²:   {r2:.3f}")
print(f"Test RMSE: {rmse:.3f}")
print(f"Test MAE:  {mae:.3f}")

# Cross-validation (5-fold)
cv_scores = cross_val_score(lr, X, y, cv=5, scoring='r2')
print(f"Mean CV R²: {cv_scores.mean():.3f} (± {cv_scores.std():.3f})")

# --------------------------------------------------
# 9. Save Cleaned Data & Outputs
# --------------------------------------------------
df.to_csv(OUT / "InvestigationA_cleaned.csv", index=False)
vif.to_csv(OUT / "InvestigationA_VIF.csv", index=False)
print("\nAll outputs saved in:", OUT.resolve())


Initial numeric predictors: ['log_seconds_after_rat_arrival', 'hours_after_sunset_sq', 'risk_reward_interaction', 'efficiency_ratio', 'interaction_rat_risk', 'reward_to_risk_ratio', 'nocturnal_index']
Dropping 'hours_after_sunset_sq' due to high VIF (inf)
Dropping 'log_seconds_after_rat_arrival' due to high VIF (10.58)

Final predictors after VIF filtering:
['risk_reward_interaction', 'efficiency_ratio', 'interaction_rat_risk', 'reward_to_risk_ratio', 'nocturnal_index']
                             OLS Regression Results                            
Dep. Variable:     bat_landing_to_food   R-squared:                       0.153
Model:                             OLS   Adj. R-squared:                  0.149
Method:                  Least Squares   F-statistic:                     32.62
Date:                 Wed, 15 Oct 2025   Prob (F-statistic):           1.23e-30
Time:                         02:14:17   Log-Likelihood:                -4221.7
No. Observations:                  907   AIC:

  vif = 1. / (1. - r_squared_i)


Mean CV R²: 0.080 (± 0.163)

All outputs saved in: /Users/maze/Documents/FODS/outputs


In [None]:
# Investigation A: Enhanced Linear Regression Model

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

# --- Feature Engineering ---

dfA = df1.copy()

# Log-transform (avoid zero/negatives)
dfA['log_seconds_after_rat_arrival'] = np.log1p(dfA['seconds_after_rat_arrival'])

# Standard scaling for continuous predictors
for col in ['hours_after_sunset', 'temperature', 'humidity']:
    if col in dfA.columns:
        dfA[f'{col}_scaled'] = (dfA[col] - dfA[col].mean()) / dfA[col].std()

# Interaction terms
dfA['interaction_risk_reward'] = dfA['risk'] * dfA['reward']
dfA['rat_risk_interaction'] = dfA['rat_present'] * dfA['risk']

# Non-linear temporal feature
dfA['night_intensity'] = np.exp(-abs(dfA['hours_after_sunset'] - 3))

# Define predictors (final refined set)
predictorsA = [
    'rat_present', 'risk', 'reward', 'hours_after_sunset_scaled',
    'log_seconds_after_rat_arrival', 'temperature_scaled', 'humidity_scaled',
    'interaction_risk_reward', 'rat_risk_interaction', 'night_intensity'
]

# Drop missing
dfA = dfA.dropna(subset=[response] + predictorsA)

# --- Remove multicollinearity ---
X = dfA[predictorsA].assign(Intercept=1)
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["feature"] = X.columns
print("VIF Table:\n", vif)

# Drop variables with VIF > 10 if needed
predictorsA = [v for v in predictorsA if v not in vif.loc[vif["VIF"] > 10, "feature"].tolist()]

# --- Model Formula ---
formulaA = response + ' ~ ' + ' + '.join(predictorsA)
print('Final Formula A:', formulaA)

# --- Fit Model ---
modelA = smf.ols(formulaA, data=dfA).fit()

# --- Evaluation Metrics ---
from sklearn.metrics import mean_squared_error, r2_score

y_true = dfA[response]
y_pred = modelA.fittedvalues
r2 = r2_score(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(modelA.summary())
print(f"R²: {r2:.4f} | RMSE: {rmse:.4f}")

# --- Diagnostics ---
resid = modelA.resid
fitted = modelA.fittedvalues

plt.figure(figsize=(6,3))
plt.scatter(fitted, resid, alpha=0.5)
plt.axhline(0, color='r', linestyle='--')
plt.title('A: Residuals vs Fitted')
plt.tight_layout()
plt.savefig(OUT/'A_resid_vs_fitted.png')
plt.close()

sm.qqplot(resid, line='45')
plt.title('A: QQ Plot (Normality)')
plt.tight_layout()
plt.savefig(OUT/'A_qq.png')
plt.close()

sns.histplot(resid, kde=True)
plt.title("A: Residual Distribution")
plt.tight_layout()
plt.savefig(OUT/'A_resid_hist.png')
plt.close()

print('Enhanced Investigation A completed and plots saved.')


KeyError: ['temperature_scaled', 'humidity_scaled']

Initial numeric predictors (safe): ['rat_present', 'hours_after_sunset', 'hours_after_sunset_sq', 'risk', 'reward', 'risk_reward_interaction', 'interaction_rat_risk', 'interaction_time_risk', 'reward_to_risk_ratio', 'nocturnal_index', 'month_log1p', 'season_log1p', 'risk_reward_interaction_log1p', 'hours_after_sunset_poly']
Dropping constant columns (no variance): ['rat_present']
Dropping 'hours_after_sunset' due to high VIF (inf)
Dropping 'risk' due to high VIF (inf)
Dropping 'reward' due to high VIF (inf)
Dropping 'risk_reward_interaction' due to high VIF (inf)
Dropping 'hours_after_sunset_poly' due to high VIF (14263.50)
Dropping 'month_log1p' due to high VIF (43.05)
Dropping 'interaction_time_risk' due to high VIF (10.42)

Final predictors after VIF filtering:
['hours_after_sunset_sq', 'interaction_rat_risk', 'reward_to_risk_ratio', 'nocturnal_index', 'season_log1p', 'risk_reward_interaction_log1p']
                             OLS Regression Results                            
Dep

  st = pd.to_datetime(df['start_time'], errors='coerce')
  r0 = pd.to_datetime(df['rat_period_start'], errors='coerce')
  r1 = pd.to_datetime(df['rat_period_end'], errors='coerce')
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


In [76]:
# ================================================================
# Investigation A (Revisited) — Polished & Optimised Linear Regression
# - Winsorize outliers
# - Gaussian (Yeo-Johnson) transform
# - Remove multicollinearity (VIF)
# - Backward feature selection (p-value)
# - Compare OLS, RidgeCV, LassoCV
# - Diagnostics & outputs saved to ./outputs
# ================================================================
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -------------------------
# Utilities
# -------------------------
def safe_load(paths):
    for p in paths:
        if Path(p).exists():
            print(f"Loading {p}")
            return pd.read_csv(p)
    raise FileNotFoundError(f"None of the candidate paths exist: {paths}")

def winsorize_series(s, lower_q=0.01, upper_q=0.99):
    lo = s.quantile(lower_q)
    hi = s.quantile(upper_q)
    return s.clip(lo, hi)

def calculate_vif(df):
    X = sm.add_constant(df)
    vif = pd.DataFrame({
        "feature": df.columns,
        "VIF": [variance_inflation_factor(X.values, i + 1) for i in range(len(df.columns))]
    }).sort_values("VIF", ascending=False)
    return vif

def backward_elimination(X, y, p_thresh=0.05, verbose=True):
    X_sm = sm.add_constant(X)
    model = sm.OLS(y, X_sm).fit()
    while True:
        pvals = model.pvalues.drop('const')
        max_p = pvals.max()
        if max_p > p_thresh:
            drop_feat = pvals.idxmax()
            if verbose:
                print(f"Dropping {drop_feat} (p={max_p:.4f})")
            X = X.drop(columns=[drop_feat])
            X_sm = sm.add_constant(X)
            model = sm.OLS(y, X_sm).fit()
        else:
            break
    return X, model

# -------------------------
# Paths & output directory
# -------------------------
CANDIDATE_PATHS = [
    "dataset1.csv", "Dataset1.csv",
    "datasets/dataset1.csv", "/mnt/data/dataset1.csv", "data/dataset1.csv"
]
OUT = Path("outputs")
OUT.mkdir(exist_ok=True)

# -------------------------
# Load data (df1 is main for Investigation A)
# -------------------------
df1 = safe_load(CANDIDATE_PATHS)

# -------------------------
# Basic cleaning & selection
# -------------------------
# Standardise column names
df1.columns = df1.columns.str.strip().str.lower()

# Check response candidate(s)
# Choose response according to your instruction: continuous variable approximating bats' behaviour
response = "bat_landing_to_food"
if response not in df1.columns:
    raise KeyError(f"Response variable '{response}' not found in dataset columns: {df1.columns.tolist()}")

# Keep only rows with valid response
df = df1.copy()
df = df.dropna(subset=[response]).reset_index(drop=True)

# Candidate predictor list (choose meaningful ones — do NOT include every column blindly)
# Start with continuous / ratio variables that are present in the file:
candidate_continuous = [
    "seconds_after_rat_arrival",
    "hours_after_sunset",
    "rat_minutes",
    "rat_arrival_number",
    "bat_landing_number",
    "food_availability"
]
# Add engineered continuous candidates if the raw columns exist
engineered_candidates = [
    "rat_presence_intensity",    # we'll compute if needed
    "foraging_efficiency"        # we'll compute if needed
]

# Build list of predictors that actually exist in df (and are numeric-capable)
predictors = []
for c in candidate_continuous:
    if c in df.columns:
        predictors.append(c)

# create engineered predictors if possible
if "seconds_after_rat_arrival" in df.columns:
    df["rat_presence_intensity"] = 1 / (1 + df["seconds_after_rat_arrival"].astype(float).replace(0, np.nan))
    df["rat_presence_intensity"] = df["rat_presence_intensity"].fillna(0)
    predictors.append("rat_presence_intensity")

# foraging_efficiency: reward / (landing_time + 1) if reward exists
if ("reward" in df.columns):
    df["foraging_efficiency"] = df["reward"].astype(float) / (df[response].astype(float) + 1)
    predictors.append("foraging_efficiency")

# Include binary/binary-like predictors that are meaningful
# e.g., risk, rat presence (if provided)
if "risk" in df.columns:
    # ensure numeric
    df["risk"] = pd.to_numeric(df["risk"], errors='coerce').fillna(0)
    predictors.append("risk")
if "rat_present" in df.columns:
    # convert to numeric
    df["rat_present"] = pd.to_numeric(df["rat_present"], errors='coerce').fillna(0)
    predictors.append("rat_present")
elif "seconds_after_rat_arrival" in df.columns:
    # derive rat_present from seconds_after_rat_arrival >= 0
    df["rat_present"] = (~df["seconds_after_rat_arrival"].isna()) & (df["seconds_after_rat_arrival"] >= 0)
    df["rat_present"] = df["rat_present"].astype(int)
    if "rat_present" not in predictors:
        predictors.append("rat_present")

# Remove duplicates and ensure predictors exist
predictors = [p for p in predictors if p in df.columns]
print("Predictor candidates used:", predictors)

# Drop any predictors that are constant or non-numeric after coercion
clean_preds = []
for p in predictors:
    df[p] = pd.to_numeric(df[p], errors="coerce")
    if df[p].nunique(dropna=True) > 1:
        clean_preds.append(p)
    else:
        print(f"Removing constant or invalid predictor: {p}")
predictors = clean_preds

# Final dataset for modeling
model_df = df[[response] + predictors].copy()
# Fill numeric NaNs by median (safer than mean)
for col in model_df.columns:
    if model_df[col].isna().any():
        model_df[col] = model_df[col].fillna(model_df[col].median())

# -------------------------
# Winsorize outliers at 1% / 99%
# -------------------------
for col in model_df.select_dtypes(include=[np.number]).columns:
    model_df[col] = winsorize_series(model_df[col], 0.01, 0.99)

# -------------------------
# Log / Yeo-Johnson (Gaussianize) transform for skewed predictors
# -------------------------
# Use Yeo-Johnson (works with zero/negative)
pt = PowerTransformer(method="yeo-johnson", standardize=False)
X_raw = model_df[predictors].astype(float)
# detect skewness and transform only if skew > 0.75 (heuristic)
skews = X_raw.skew().abs()
to_transform = skews[skews > 0.75].index.tolist()
print("Features selected for Yeo-Johnson transform (skew>0.75):", to_transform)
if to_transform:
    X_trans = X_raw.copy()
    X_trans[to_transform] = pt.fit_transform(X_trans[to_transform])
else:
    X_trans = X_raw.copy()

# -------------------------
# Remove multicollinearity via iterative VIF removal (threshold 5.0)
# -------------------------
X_vif = X_trans.copy()
def iterative_vif_removal(X_df, thresh=5.0):
    X_work = X_df.copy()
    while True:
        vif = calculate_vif(X_work)
        max_vif = vif["VIF"].max()
        if max_vif > thresh:
            drop_feat = vif.sort_values("VIF", ascending=False)["feature"].iloc[0]
            print(f"Dropping {drop_feat} (VIF={max_vif:.2f})")
            X_work = X_work.drop(columns=[drop_feat])
        else:
            break
    return X_work

X_nmv = iterative_vif_removal(X_vif, thresh=5.0)
print("Predictors after VIF reduction:", list(X_nmv.columns))

# -------------------------
# Standardize predictors
# -------------------------
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_nmv), columns=X_nmv.columns, index=X_nmv.index)

# -------------------------
# Optionally add polynomial / interaction terms (only a few to avoid explosion)
# We'll add pairwise interactions for a small subset (top 4 features by variance)
# -------------------------
# choose up to 4 most variable features
vars_by_var = X_scaled.var().sort_values(ascending=False).index.tolist()
interaction_feats = vars_by_var[:4]
print("Creating interactions among:", interaction_feats)
for i in range(len(interaction_feats)):
    for j in range(i+1, len(interaction_feats)):
        a = interaction_feats[i]
        b = interaction_feats[j]
        name = f"{a}_x_{b}"
        X_scaled[name] = X_scaled[a] * X_scaled[b]

# -------------------------
# Backward feature selection by p-value on OLS (start with current X_scaled)
# -------------------------
X_for_model = X_scaled.copy()
y_final = model_df[response].astype(float)

X_selected, ols_full = backward_elimination(X_for_model, y_final, p_thresh=0.05, verbose=True)
print("Final features after backward elimination:", list(X_selected.columns))

# -------------------------
# Train/Test split
# -------------------------
X_tr, X_te, y_tr, y_te = train_test_split(X_selected, y_final, test_size=0.2, random_state=42)

# -------------------------
# Fit OLS (final) and evaluate
# -------------------------
X_tr_sm = sm.add_constant(X_tr)
final_ols = sm.OLS(y_tr, X_tr_sm).fit()
print("\nFinal OLS summary:")
print(final_ols.summary())

X_te_sm = sm.add_constant(X_te)
y_pred_ols = final_ols.predict(X_te_sm)

r2_ols = r2_score(y_te, y_pred_ols)
rmse_ols = np.sqrt(mean_squared_error(y_te, y_pred_ols))
mae_ols = mean_absolute_error(y_te, y_pred_ols)
print(f"\nFinal OLS Test metrics: R²={r2_ols:.4f}, RMSE={rmse_ols:.4f}, MAE={mae_ols:.4f}")

# -------------------------
# Regularized models (RidgeCV, LassoCV) for comparison & stability
# -------------------------
ridge = RidgeCV(alphas=np.logspace(-4, 4, 50), cv=5).fit(X_tr, y_tr)
y_pred_ridge = ridge.predict(X_te)
r2_ridge = r2_score(y_te, y_pred_ridge); rmse_ridge = np.sqrt(mean_squared_error(y_te, y_pred_ridge))

lasso = LassoCV(alphas=np.logspace(-4, 2, 50), cv=5, max_iter=10000).fit(X_tr, y_tr)
y_pred_lasso = lasso.predict(X_te)
r2_lasso = r2_score(y_te, y_pred_lasso); rmse_lasso = np.sqrt(mean_squared_error(y_te, y_pred_lasso))

print(f"Ridge Test metrics: R²={r2_ridge:.4f}, RMSE={rmse_ridge:.4f}, alpha={ridge.alpha_}")
print(f"Lasso Test metrics: R²={r2_lasso:.4f}, RMSE={rmse_lasso:.4f}, alpha={lasso.alpha_}")

# -------------------------
# Cross-validation for final OLS (10-fold)
# -------------------------
cv_scores = cross_val_score(LinearRegression(), X_selected, y_final, cv=10, scoring="r2")
print(f"10-fold CV R²: mean={cv_scores.mean():.4f}, std={cv_scores.std():.4f}")

# -------------------------
# Diagnostic plots: residuals vs fitted and QQ and Actual vs Predicted
# -------------------------
OUT.mkdir(exist_ok=True)

# OLS diagnostics on test set
residuals = y_te - y_pred_ols
fitted = y_pred_ols

plt.figure(figsize=(7,4))
sns.scatterplot(x=fitted, y=residuals, alpha=0.6)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted (Final OLS)")
plt.tight_layout()
plt.savefig(OUT / "A_resid_vs_fitted_two.png")
plt.close()

# QQ plot
plt.figure(figsize=(6,6))
sm.qqplot(residuals, line="45", fit=True)
plt.title("QQ plot (Final OLS residuals)")
plt.tight_layout()
plt.savefig(OUT / "A_qq_two_final.png")
plt.close()

# Actual vs predicted
plt.figure(figsize=(6,6))
sns.scatterplot(x=y_te, y=y_pred_ols, alpha=0.6)
plt.plot([y_te.min(), y_te.max()], [y_te.min(), y_te.max()], color="red", linestyle="--")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Actual vs Predicted (Final OLS)")
plt.tight_layout()
plt.savefig(OUT / "A_actual_vs_predicted_final.png")
plt.close()

# -------------------------
# Save outputs & summary
# -------------------------
model_summary = final_ols.summary().as_text()
with open(OUT / "InvestigationA_final_OLS_summary.txt", "w") as f:
    f.write(model_summary)

results = {
    "final_ols_test_r2": float(r2_ols),
    "final_ols_rmse": float(rmse_ols),
    "ridge_test_r2": float(r2_ridge),
    "ridge_rmse": float(rmse_ridge),
    "lasso_test_r2": float(r2_lasso),
    "lasso_rmse": float(rmse_lasso),
    "cv_r2_mean": float(cv_scores.mean()),
    "cv_r2_std": float(cv_scores.std()),
    "num_obs": int(model_df.shape[0]),
    "final_predictors": list(X_selected.columns)
}

print(results)

pd.DataFrame([results]).to_csv(OUT / "InvestigationA_results_summary.csv", index=False)
model_df.to_csv(OUT / "InvestigationA_model_data.csv", index=False)

print("\nOutputs saved to folder:", OUT.resolve())
print("Final model performance summary:")
print(pd.DataFrame([results]).T)


Loading datasets/dataset1.csv
Predictor candidates used: ['seconds_after_rat_arrival', 'hours_after_sunset', 'rat_presence_intensity', 'foraging_efficiency', 'risk', 'rat_present']
Removing constant or invalid predictor: rat_present
Features selected for Yeo-Johnson transform (skew>0.75): ['seconds_after_rat_arrival', 'rat_presence_intensity', 'foraging_efficiency']
Dropping rat_presence_intensity (VIF=5.97)
Predictors after VIF reduction: ['seconds_after_rat_arrival', 'hours_after_sunset', 'foraging_efficiency', 'risk']
Creating interactions among: ['hours_after_sunset', 'foraging_efficiency', 'seconds_after_rat_arrival', 'risk']
Dropping foraging_efficiency_x_seconds_after_rat_arrival (p=0.9064)
Dropping risk (p=0.5934)
Dropping hours_after_sunset (p=0.4216)
Dropping seconds_after_rat_arrival (p=0.3859)
Dropping hours_after_sunset_x_foraging_efficiency (p=0.3344)
Dropping hours_after_sunset_x_risk (p=0.5334)
Dropping hours_after_sunset_x_seconds_after_rat_arrival (p=0.2702)
Dropping 

<Figure size 600x600 with 0 Axes>