In [1]:
# 00_data_understanding_eda.ipynb
# Purpose: Deep EDA for WQD (virtual sensor training & label source) and Monteria (time series for forecasting).
# Output:
#   - /POSEIDON/data/interim/wqd_clean.csv
#   - /POSEIDON/data/interim/mon_clean.csv
#   - /POSEIDON/reports/eda_summary.md
#   - /POSEIDON/images/*.png (plots)

from pathlib import Path
import os, json, math, textwrap, warnings
warnings.filterwarnings("ignore")

cwd = Path.cwd().resolve()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Shared matplotlib defaults so every figure gets the same size, grid and resolution.
plt.rcParams.update({
    "figure.figsize": (7, 4),
    "axes.grid": True,
    "figure.dpi": 120,
})

# Locate the project root:
#   * when running from a folder named "notebooks", use its parent;
#   * otherwise use the nearest of cwd/its ancestors named "poseidon" (case-insensitive);
#   * otherwise fall back to the current working directory.
# To pin it by hand instead, assign e.g. ROOT = Path("./POSEIDON").
if cwd.name.lower() == "notebooks":
    ROOT = cwd.parent
else:
    ROOT = next(
        (folder for folder in (cwd, *cwd.parents) if folder.name.lower() == "poseidon"),
        cwd,
    )

DATA = ROOT / "data"
INTERIM = DATA / "interim"
REPORTS = ROOT / "reports"
IMAGES = ROOT / "images"

# Create any missing folder up front (no error if it already exists).
for d in (DATA, INTERIM, REPORTS, IMAGES):
    d.mkdir(parents=True, exist_ok=True)

# Raw input workbooks.
wqd_path = DATA / "WQD.xlsx"
mon_path = DATA / "Monteria_Aquaculture_Data.xlsx"

# Echo the resolved root and whether both inputs are present.
print("ROOT:", ROOT)
print("WQD path exists:", wqd_path.exists())
print("Monteria path exists:", mon_path.exists())


ROOT: C:\Users\PC\Documents\Machine_Learning\Capstone_Project\Poseidon
WQD path exists: True
Monteria path exists: True


In [2]:
# Read both workbooks as-is; renaming and type coercion happen in the next cell.
wqd_raw, mon_raw = pd.read_excel(wqd_path), pd.read_excel(mon_path)

print("WQD raw shape:", wqd_raw.shape)
print("Monteria raw shape:", mon_raw.shape)

print("\nWQD columns:", wqd_raw.columns.tolist())
print("Monteria columns:", mon_raw.columns.tolist())

# Preview the first five rows of each raw table.
display(wqd_raw.head(5), mon_raw.head(5))


WQD raw shape: (4300, 15)
Monteria raw shape: (4345, 7)

WQD columns: ['Temp', 'Turbidity (cm)', 'DO(mg/L)', 'BOD (mg/L)', 'CO2', 'pH`', 'Alkalinity (mg L-1 )', 'Hardness (mg L-1 )', 'Calcium (mg L-1 )', 'Ammonia (mg L-1 )', 'Nitrite (mg L-1 )', 'Phosphorus (mg L-1 )', 'H2S (mg L-1 )', 'Plankton (No. L-1)', 'Water Quality']
Monteria columns: ['DateTime', 'Temperature', 'Dissolved_Oxygen', 'pH', 'Turbidity', 'Date', 'Hour']


Unnamed: 0,Temp,Turbidity (cm),DO(mg/L),BOD (mg/L),CO2,pH`,Alkalinity (mg L-1 ),Hardness (mg L-1 ),Calcium (mg L-1 ),Ammonia (mg L-1 ),Nitrite (mg L-1 ),Phosphorus (mg L-1 ),H2S (mg L-1 ),Plankton (No. L-1),Water Quality
0,67.448725,10.127148,0.208153,7.473607,10.181084,4.751657,218.364855,300.12508,337.178226,0.286054,4.35531,0.005984,0.066793,6069.624017,2
1,64.626666,94.015595,11.434463,10.859998,14.860521,3.085154,273.939692,8.426776,363.66074,0.09604,2.182753,0.004906,0.023428,250.995959,2
2,65.121842,90.653462,12.430865,12.80997,12.31998,9.648515,220.81273,11.726274,309.370934,0.974501,4.90176,0.006979,0.065041,7218.927473,2
3,1.640334,0.066344,10.963529,8.508023,12.955209,4.819988,266.571628,6.627655,8.180468,0.884865,3.571842,3.174473,0.026018,1230.062252,2
4,64.863434,2.119173,1.361736,13.335372,13.603197,10.244034,252.108,339.891514,253.996871,0.801695,4.655898,3.854701,0.060995,1035.05482,2


Unnamed: 0,DateTime,Temperature,Dissolved_Oxygen,pH,Turbidity,Date,Hour
0,2024-01-01 00:00:00,27.598028,6.916388,7.937212,3.677379,2024-01-01,0
1,2024-01-01 01:00:00,27.217041,6.871578,7.872365,4.110987,2024-01-01,1
2,2024-01-01 02:00:00,27.688613,5.950694,7.941378,3.039369,2024-01-01,2
3,2024-01-01 03:00:00,28.213818,6.335761,7.804893,3.611809,2024-01-01,3
4,2024-01-01 04:00:00,27.159508,6.563543,7.822742,3.885563,2024-01-01,4


In [3]:
# We standardize to a canonical schema used across the whole project
# WQD: Temp, Turbidity (cm Secchi), DO(mg/L), pH`, Ammonia (mg L-1 ), Water Quality
# Monteria: DateTime, Temperature, Dissolved_Oxygen, pH, Turbidity (NTU), Date, Hour

def canonicalize_wqd(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the WQD table restricted to the project's canonical columns.

    Raw headers are renamed to ``temperature``, ``turbidity_cm``, ``do``, ``pH``,
    ``ammonia`` and ``water_quality``; only those that exist are kept (in that
    order) and each is coerced to numeric, with unparseable values becoming NaN.
    When ``turbidity_cm`` is present, ``turbidity_proxy`` is appended as
    ``1 / turbidity_cm`` with the denominator floored at 1e-3.

    The input frame is not modified.
    """
    column_map = {
        "Temp": "temperature",
        "Turbidity (cm)": "turbidity_cm",     # Secchi depth (cm)
        "DO(mg/L)": "do",
        "pH`": "pH",
        "Ammonia (mg L-1 )": "ammonia",
        "Water Quality": "water_quality",
    }
    renamed = df.rename(columns=column_map)

    # Keep only the canonical columns that actually exist, in canonical order.
    present = [name for name in column_map.values() if name in renamed.columns]
    out = renamed[present].copy()

    # Force numeric dtypes; anything that cannot be parsed turns into NaN.
    for name in present:
        out[name] = pd.to_numeric(out[name], errors="coerce")

    # Secchi depth grows with clarity, so invert it to get a turbidity-like proxy.
    # The floor keeps zero / near-zero depths from dividing by zero.
    if "turbidity_cm" in out.columns:
        floored_depth = out["turbidity_cm"].clip(lower=1e-3)
        out["turbidity_proxy"] = 1.0 / floored_depth

    return out

def canonicalize_mon(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the Monteria table with canonical column names and dtypes.

    ``DateTime`` becomes ``timestamp`` (parsed to datetime), and ``Temperature``,
    ``Dissolved_Oxygen`` and ``Turbidity`` become ``temperature``, ``do`` and
    ``turbidity``; ``pH`` keeps its name. The sensor columns that exist are
    coerced to numeric. Unparseable timestamps become NaT and unparseable numbers
    become NaN. All other columns are carried through untouched, and the input
    frame is not modified.
    """
    out = df.rename(
        columns={
            "DateTime": "timestamp",
            "Temperature": "temperature",
            "Dissolved_Oxygen": "do",
            "pH": "pH",
            "Turbidity": "turbidity",
        }
    ).copy()

    if "timestamp" in out.columns:
        out["timestamp"] = pd.to_datetime(out["timestamp"], errors="coerce")

    # Coerce whichever sensor columns are present.
    sensor_cols = [name for name in ("temperature", "pH", "turbidity", "do") if name in out.columns]
    for name in sensor_cols:
        out[name] = pd.to_numeric(out[name], errors="coerce")

    return out

# Build the canonical frames used by every later cell; the raw frames stay untouched.
wqd = canonicalize_wqd(wqd_raw)
mon = canonicalize_mon(mon_raw)

# Shapes plus a three-row preview of each canonical table.
print("WQD canonical shape:", wqd.shape)
print("Monteria canonical shape:", mon.shape)
display(wqd.head(3))
display(mon.head(3))


WQD canonical shape: (4300, 7)
Monteria canonical shape: (4345, 7)


Unnamed: 0,temperature,turbidity_cm,do,pH,ammonia,water_quality,turbidity_proxy
0,67.448725,10.127148,0.208153,4.751657,0.286054,2,0.098744
1,64.626666,94.015595,11.434463,3.085154,0.09604,2,0.010637
2,65.121842,90.653462,12.430865,9.648515,0.974501,2,0.011031


Unnamed: 0,timestamp,temperature,do,pH,turbidity,Date,Hour
0,2024-01-01 00:00:00,27.598028,6.916388,7.937212,3.677379,2024-01-01,0
1,2024-01-01 01:00:00,27.217041,6.871578,7.872365,4.110987,2024-01-01,1
2,2024-01-01 02:00:00,27.688613,5.950694,7.941378,3.039369,2024-01-01,2


Cell 4 — Missingness overview

In [4]:
def missing_report(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Summarise missing values per column.

    Returns one row per column of ``df`` with the dataset label ``name``, the
    column name, its dtype, the number of missing values and the percentage of
    missing values (0-100).
    """
    null_mask = df.isna()  # computed once, reused for both the count and the share
    return pd.DataFrame({
        "dataset": name,
        "column": df.columns,
        "dtype": df.dtypes.tolist(),
        "n_missing": null_mask.sum().to_numpy(),
        "pct_missing": null_mask.mean().mul(100).to_numpy(),
    })

# Per-column missingness for each canonical dataset.
wqd_missing = missing_report(wqd, "WQD")
mon_missing = missing_report(mon, "Monteria")

display(wqd_missing)
display(mon_missing)


Unnamed: 0,dataset,column,dtype,n_missing,pct_missing
0,WQD,temperature,float64,0,0.0
1,WQD,turbidity_cm,float64,0,0.0
2,WQD,do,float64,0,0.0
3,WQD,pH,float64,0,0.0
4,WQD,ammonia,float64,0,0.0
5,WQD,water_quality,int64,0,0.0
6,WQD,turbidity_proxy,float64,0,0.0


Unnamed: 0,dataset,column,dtype,n_missing,pct_missing
0,Monteria,timestamp,datetime64[ns],0,0.0
1,Monteria,temperature,float64,0,0.0
2,Monteria,do,float64,0,0.0
3,Monteria,pH,float64,0,0.0
4,Monteria,turbidity,float64,0,0.0
5,Monteria,Date,datetime64[ns],0,0.0
6,Monteria,Hour,int64,0,0.0


Cell 5 — Basic descriptive statistics & quantiles

In [5]:
# Show floats with thousands separators and four decimals from here on.
pd.options.display.float_format = "{:,.4f}".format

# Extra tail percentiles (1/5/95/99) alongside the quartiles, so extreme values
# show up in the summary tables.
tail_percentiles = [.01, .05, .25, .5, .75, .95, .99]
wqd_desc = wqd.describe(percentiles=tail_percentiles).T
mon_desc = mon.describe(percentiles=tail_percentiles).T

print("WQD describe:")
display(wqd_desc)
print("Monteria describe:")
display(mon_desc)


WQD describe:


Unnamed: 0,count,mean,std,min,1%,5%,25%,50%,75%,95%,99%,max
temperature,4300.0,25.6957,9.6702,0.194,4.5456,15.6496,19.7759,25.0418,30.2777,34.46,73.8074,84.2515
turbidity_cm,4300.0,39.0467,20.9427,0.0514,6.712,15.8343,22.2241,30.2057,55.9457,76.918,92.6659,99.7977
do,4300.0,5.3003,1.8327,0.1339,1.2308,3.1212,3.9784,5.0008,6.5212,7.8013,12.1925,14.9701
pH,4300.0,7.7135,1.5803,0.0039,2.3107,6.0566,6.443,7.7432,9.0353,9.444,12.1789,14.8512
ammonia,4300.0,0.0483,0.1229,0.0,0.0006,0.0029,0.0127,0.0263,0.039,0.0498,0.7961,0.9993
water_quality,4300.0,1.0233,0.821,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0
turbidity_proxy,4300.0,0.0551,0.5454,0.01,0.0108,0.013,0.0179,0.0331,0.045,0.0632,0.149,19.4464


Monteria describe:


Unnamed: 0,count,mean,min,1%,5%,25%,50%,75%,95%,99%,max,std
timestamp,4345.0,2024-03-31 12:00:00,2024-01-01 00:00:00,2024-01-02 19:26:24,2024-01-10 01:12:00,2024-02-15 06:00:00,2024-03-31 12:00:00,2024-05-15 18:00:00,2024-06-20 22:48:00,2024-06-28 04:33:36,2024-06-30 00:00:00,
temperature,4345.0,27.3060,25.3552,25.9141,26.3355,26.9077,27.3076,27.6998,28.3001,28.6772,29.6557,0.5987
do,4345.0,6.8920,4.5980,5.4747,5.8742,6.4783,6.8900,7.3128,7.8981,8.3685,9.0174,0.6141
pH,4345.0,7.8277,7.0063,7.3404,7.4924,7.6915,7.8275,7.9644,8.1633,8.2961,8.5393,0.2051
turbidity,4345.0,3.3238,1.7469,2.2800,2.6247,3.0294,3.3246,3.6180,4.0357,4.3346,5.2360,0.434
Date,4345.0,2024-03-31 00:30:09.528193280,2024-01-01 00:00:00,2024-01-02 00:00:00,2024-01-10 00:00:00,2024-02-15 00:00:00,2024-03-31 00:00:00,2024-05-15 00:00:00,2024-06-20 00:00:00,2024-06-28 00:00:00,2024-06-30 00:00:00,
Hour,4345.0,11.4974,0.0000,0.0000,1.0000,5.0000,11.0000,17.0000,22.0000,23.0000,23.0000,6.9244


Cell 6 — Domain plausible ranges + QC flags

In [6]:
# Broad plausibility ranges per canonical column as (low, high) bounds (refined later).
# We don't drop here; we only flag: qc_flag adds one boolean column per entry.
# qc_flag checks the bounds with Series.between, so both ends count as in range.
# Columns without an entry here (e.g. turbidity) are not range-checked.
ranges = {
    "temperature": (0, 40),   # °C
    "pH": (5.0, 9.5),         # pond comfort envelope
    "do": (0, 15),            # mg/L
    "ammonia": (0, 1.5)       # mg/L
}

def qc_flag(df: pd.DataFrame, name: str, limits=None) -> pd.DataFrame:
    """Return a copy of ``df`` with boolean ``<col>_out_of_range`` flag columns.

    Parameters
    ----------
    df : table to check; it is not modified.
    name : dataset label stored in the ``__dataset__`` column of the result.
    limits : optional mapping ``column -> (low, high)``. Defaults to the
        module-level ``ranges`` dict. Bounds are inclusive.

    Returns
    -------
    A copy of ``df`` with one flag column per range-checked column that exists
    in ``df``, plus ``__dataset__``.

    Missing values are never flagged: a NaN is "unknown", not "out of range",
    and counting it would inflate the out-of-range percentages. Missingness is
    reported separately.
    """
    if limits is None:
        limits = ranges

    out = df.copy()
    for col, (lo, hi) in limits.items():
        if col in out.columns:
            values = out[col]
            # between() is False for NaN, so a bare negation would flag every
            # missing value; require the value to be present as well.
            out[f"{col}_out_of_range"] = values.notna() & ~values.between(lo, hi)
    out["__dataset__"] = name
    return out

# Flagged copies of both datasets; wqd and mon themselves are left unchanged.
wqd_qc = qc_flag(wqd, "WQD")
mon_qc = qc_flag(mon, "Monteria")

# Summaries
def range_issue_summary(qcdf: pd.DataFrame) -> pd.DataFrame:
    """Percentage of flagged rows for every ``*_out_of_range`` column of ``qcdf``.

    Returns a frame with columns ``flag`` and ``pct_out_of_range`` (0-100),
    sorted from the most to the least affected flag. When ``qcdf`` has no flag
    columns the result is an empty frame with the same two columns.
    """
    # str() keeps non-string column labels from breaking the suffix test.
    flag_cols = [c for c in qcdf.columns if str(c).endswith("_out_of_range")]
    rows = [{"flag": c, "pct_out_of_range": qcdf[c].mean() * 100} for c in flag_cols]
    # Naming the columns explicitly keeps sort_values from raising KeyError
    # when there are no flags at all.
    summary = pd.DataFrame(rows, columns=["flag", "pct_out_of_range"])
    return summary.sort_values("pct_out_of_range", ascending=False)

# Share of rows flagged per variable, most affected first.
print("WQD range flags (%):")
display(range_issue_summary(wqd_qc))
print("Monteria range flags (%):")
display(range_issue_summary(mon_qc))


WQD range flags (%):


Unnamed: 0,flag,pct_out_of_range
1,pH_out_of_range,4.4186
0,temperature_out_of_range,2.3256
2,do_out_of_range,0.0
3,ammonia_out_of_range,0.0


Monteria range flags (%):


Unnamed: 0,flag,pct_out_of_range
0,temperature_out_of_range,0.0
1,pH_out_of_range,0.0
2,do_out_of_range,0.0


Cell 7 — Histograms (WQD)

In [7]:
# Write one 40-bin histogram PNG per available WQD variable into IMAGES
# (temperature, pH, turbidity_proxy, do, ammonia).
cols = [c for c in ["temperature","pH","turbidity_proxy","do","ammonia"] if c in wqd.columns]
for c in cols:
    fig, ax = plt.subplots()
    wqd[c].dropna().hist(bins=40, ax=ax)
    ax.set_title(f"WQD Distribution: {c}")
    ax.set_xlabel(c)
    ax.set_ylabel("count")
    fig.tight_layout()
    fig.savefig(IMAGES / f"wqd_hist_{c}.png")
    plt.close(fig)  # the figure is only saved, never shown inline
len(cols), "histograms saved in /images/"


(5, 'histograms saved in /images/')

Cell 8 — Histograms (Monteria)

In [8]:
# Write one 40-bin histogram PNG per available Monteria sensor into IMAGES.
cols = [c for c in ["temperature","pH","turbidity","do"] if c in mon.columns]
for c in cols:
    fig, ax = plt.subplots()
    mon[c].dropna().hist(bins=40, ax=ax)
    ax.set_title(f"Monteria Distribution: {c}")
    ax.set_xlabel(c)
    ax.set_ylabel("count")
    fig.tight_layout()
    fig.savefig(IMAGES / f"mon_hist_{c}.png")
    plt.close(fig)  # the figure is only saved, never shown inline
len(cols), "histograms saved in /images/"


(4, 'histograms saved in /images/')

Cell 9 — Correlations (Spearman)

In [9]:
def spearman_corr_plot(df: pd.DataFrame, cols, title, filename):
    """Compute a Spearman correlation matrix and save it as a heatmap.

    Rows with a missing value in any of ``cols`` are dropped first. If fewer
    than five complete rows remain, a message is printed and ``None`` is
    returned without plotting. Otherwise the heatmap is written to
    ``IMAGES / filename`` and the correlation DataFrame is returned.
    """
    complete = df[cols].dropna()
    if complete.shape[0] < 5:
        print("Not enough rows for correlation plot:", title)
        return None

    corr = complete.corr(method="spearman")
    positions = range(len(cols))

    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(corr, interpolation="nearest")
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(cols, rotation=45, ha="right")
    ax.set_yticks(positions)
    ax.set_yticklabels(cols)
    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    fig.savefig(IMAGES / filename)
    plt.close(fig)
    return corr

# Spearman heatmaps over whichever of the listed columns exist in each dataset.
# Each call returns the correlation matrix, or None when fewer than five
# complete rows are available.
wqd_corr = spearman_corr_plot(
    wqd, [c for c in ["temperature","pH","turbidity_proxy","do","ammonia"] if c in wqd.columns],
    title="WQD Spearman Correlation", filename="wqd_corr.png"
)

mon_corr = spearman_corr_plot(
    mon, [c for c in ["temperature","pH","turbidity","do"] if c in mon.columns],
    title="Monteria Spearman Correlation", filename="mon_corr.png"
)

print("WQD corr matrix:")
display(wqd_corr)
print("Monteria corr matrix:")
display(mon_corr)


WQD corr matrix:


Unnamed: 0,temperature,pH,turbidity_proxy,do,ammonia
temperature,1.0,0.0044,0.0195,-0.006,-0.0226
pH,0.0044,1.0,-0.0176,-0.0191,-0.0393
turbidity_proxy,0.0195,-0.0176,1.0,0.484,0.5086
do,-0.006,-0.0191,0.484,1.0,0.5115
ammonia,-0.0226,-0.0393,0.5086,0.5115,1.0


Monteria corr matrix:


Unnamed: 0,temperature,pH,turbidity,do
temperature,1.0,-0.0202,0.0073,0.0099
pH,-0.0202,1.0,0.0095,-0.0216
turbidity,0.0073,0.0095,1.0,0.0057
do,0.0099,-0.0216,0.0057,1.0


Cell 10 — Diurnal analysis (Monteria)

In [10]:
# Diurnal profile: mean and standard deviation of each Monteria sensor by hour of day.
if "timestamp" in mon.columns:
    mon_time = mon.dropna(subset=["timestamp"]).copy()
    mon_time["hour"] = mon_time["timestamp"].dt.hour

    # Resolve the available sensor columns once and use the same list for the
    # aggregation and the plots. Previously only the plotting loop was guarded,
    # so a missing column raised KeyError in the groupby selection.
    diurnal_cols = [c for c in ["temperature","pH","turbidity","do"] if c in mon_time.columns]
    by_hour = mon_time.groupby("hour")[diurnal_cols]
    diurnal_means = by_hour.mean()
    diurnal_stds = by_hour.std()

    # Save as CSV for reports
    diurnal_means.to_csv(REPORTS / "monteria_diurnal_means.csv")
    diurnal_stds.to_csv(REPORTS / "monteria_diurnal_stds.csv")

    # Quick plots: one line chart of the hourly mean per sensor.
    for c in diurnal_cols:
        plt.figure()
        diurnal_means[c].plot(marker="o")
        plt.title(f"Monteria Diurnal Mean by Hour: {c}")
        plt.xlabel("hour"); plt.ylabel(c)
        plt.tight_layout()
        plt.savefig(IMAGES / f"mon_diurnal_{c}.png")
        plt.close()

    display(diurnal_means.head(10))
else:
    print("No timestamp column in Monteria; skipping diurnal analysis.")


Unnamed: 0_level_0,temperature,pH,turbidity,do
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,27.2996,7.8213,3.3269,6.8309
1,27.2775,7.8325,3.3081,6.9273
2,27.2668,7.8349,3.3246,6.8786
3,27.2904,7.842,3.3409,6.8154
4,27.352,7.8195,3.339,6.919
5,27.3632,7.8346,3.3155,6.7934
6,27.3315,7.8286,3.3583,6.9755
7,27.268,7.8301,3.3497,6.8938
8,27.2983,7.8174,3.3055,6.8679
9,27.2766,7.8179,3.3614,6.8609


Cell 11 — Simple lag autocorrelation (Monteria)

In [11]:
def lag_autocorr(series: pd.Series, lag=1):
    """Pearson autocorrelation of ``series`` at ``lag`` positions.

    Missing values are left in place on purpose. ``Series.autocorr`` correlates
    the series with its shifted self and skips pairs where either side is
    missing. Dropping NaN first would close the gaps and pair up values that
    are more than ``lag`` steps apart.

    Returns NaN when the correlation cannot be computed.
    """
    try:
        return series.autocorr(lag=lag)
    except Exception:
        # Deliberate best-effort: one unusable column should not abort the EDA loop.
        return np.nan

# Lag-1 autocorrelation for each available Monteria sensor column.
acf = {}
for col in ["temperature","pH","turbidity","do"]:
    if col in mon.columns:
        acf[col] = lag_autocorr(mon[col], lag=1)

print("Monteria lag-1 autocorrelation (naive):")
# One-row frame transposed so every metric is shown on its own line.
display(pd.DataFrame({"metric":"lag1_autocorr", **acf}, index=[0]).T)


Monteria lag-1 autocorrelation (naive):


Unnamed: 0,0
metric,lag1_autocorr
temperature,-0.0133
pH,0.0021
turbidity,-0.0026
do,-0.0209


Cell 12 — Robust outlier analysis (MAD-based)

In [12]:
def outlier_table(df: pd.DataFrame, cols, z=3.5):
    """Share of robust-z outliers per column, based on median and MAD.

    For each name in ``cols`` that exists in ``df``, missing values are dropped
    and the robust z-score ``(x - median) / (1.4826 * MAD)`` is computed. The
    result has one row per column with its median, its MAD and the percentage
    of values whose absolute score exceeds ``z``. That percentage column is
    named ``pct_outlier_|z|>{z}``. A column with no non-missing values gets NaN
    in all three statistics. Names that are not in ``df`` are skipped.

    The reported MAD includes a 1e-9 offset that keeps the division defined
    when the true MAD is zero.
    """
    pct_key = f"pct_outlier_|z|>{z}"
    records = []
    for name in (c for c in cols if c in df.columns):
        values = df[name].dropna()
        if values.empty:
            records.append({"column": name, "median": np.nan, "mad": np.nan, pct_key: np.nan})
            continue

        center = values.median()
        deviations = values - center
        spread = deviations.abs().median() + 1e-9
        scores = deviations / (1.4826 * spread)
        share = scores.abs().gt(z).mean() * 100
        records.append({
            "column": name,
            "median": float(center),
            "mad": float(spread),
            pct_key: float(share),
        })
    return pd.DataFrame(records)

# Robust-z outlier shares at the default threshold (z=3.5) for both datasets.
wqd_out = outlier_table(wqd, ["temperature","pH","turbidity_proxy","do","ammonia"])
mon_out = outlier_table(mon, ["temperature","pH","turbidity","do"])

print("WQD robust outliers:")
display(wqd_out)
print("Monteria robust outliers:")
display(mon_out)


WQD robust outliers:


Unnamed: 0,column,median,mad,pct_outlier_|z|>3.5
0,temperature,25.0418,5.2471,2.3256
1,pH,7.7432,1.2976,0.6977
2,turbidity_proxy,0.0331,0.0141,1.2558
3,do,5.0008,1.23,1.2558
4,ammonia,0.0263,0.0131,4.4651


Monteria robust outliers:


Unnamed: 0,column,median,mad,pct_outlier_|z|>3.5
0,temperature,27.3076,0.3976,0.046
1,pH,7.8275,0.1365,0.046
2,turbidity,3.3246,0.2937,0.046
3,do,6.89,0.4199,0.046


Cell 13 — Interim clean datasets (CSV)

In [13]:
# Light-touch cleaning only: a row is dropped when one of the essential columns
# listed below is missing. The full cleaning rules are decided in the next
# notebook, based on this EDA.
wqd_essential = ["temperature","pH","turbidity_proxy","do","ammonia"]
mon_essential = ["timestamp","temperature","pH","turbidity"]

wqd_clean = wqd.dropna(subset=wqd_essential).copy()
mon_clean = mon.dropna(subset=mon_essential).copy()

wqd_csv = INTERIM / "wqd_clean.csv"
mon_csv = INTERIM / "mon_clean.csv"
wqd_clean.to_csv(wqd_csv, index=False)
mon_clean.to_csv(mon_csv, index=False)

print("Saved interim clean CSVs:")
print(" -", wqd_csv)
print(" -", mon_csv)


Saved interim clean CSVs:
 - C:\Users\PC\Documents\Machine_Learning\Capstone_Project\Poseidon\data\interim\wqd_clean.csv
 - C:\Users\PC\Documents\Machine_Learning\Capstone_Project\Poseidon\data\interim\mon_clean.csv


Cell 14 — EDA issues log & summary text

In [14]:
# Findings collected for the markdown summary; reset on every run of this cell.
issues = []

def range_summary_text(qcdf, name):
    """Append one sentence to ``issues`` per range-checked column with flagged rows.

    ``qcdf`` is a frame carrying ``<col>_out_of_range`` flag columns and ``name``
    is the dataset label used in the sentence. Columns without a flag column, or
    with no flagged rows, add nothing. Nothing is returned.
    """
    for col, (lo, hi) in ranges.items():
        flag_col = f"{col}_out_of_range"
        if flag_col not in qcdf.columns:
            continue
        pct = qcdf[flag_col].mean() * 100
        if pct <= 0:
            continue
        issues.append(f"{name}: {pct:.2f}% of '{col}' values outside [{lo},{hi}].")

# Range findings first (WQD, then Monteria).
range_summary_text(wqd_qc, "WQD")
range_summary_text(mon_qc, "Monteria")

# Outlier notes: one sentence per column with a non-zero robust-z outlier share,
# again WQD first and Monteria second.
for label, table in (("WQD", wqd_out), ("Monteria", mon_out)):
    for _, row in table.iterrows():
        if row["pct_outlier_|z|>3.5"] > 0:
            issues.append(f"{label}: {row['column']} has {row['pct_outlier_|z|>3.5']:.2f}% robust-z outliers (>3.5).")

# A correlation matrix is only a DataFrame when enough complete rows were available.
wqd_corr_note = "available" if isinstance(wqd_corr, pd.DataFrame) else "insufficient rows"
mon_corr_note = "available" if isinstance(mon_corr, pd.DataFrame) else "insufficient rows"

# All findings on a single bullet line, or a fallback sentence when there are none.
if issues:
    issues_line = "; ".join(issues)
else:
    issues_line = "No significant issues beyond broad domain envelopes."

summary_md = f"""
# EDA Summary (00_data_understanding_eda)

## Datasets
- **WQD.xlsx** raw: {wqd_raw.shape}, canonical: {wqd.shape}
- **Monteria_Aquaculture_Data.xlsx** raw: {mon_raw.shape}, canonical: {mon.shape}

## Canonicalization
- **WQD**: temperature, pH, **turbidity_cm** → **turbidity_proxy = 1/cm**, do, ammonia, water_quality
- **Monteria**: timestamp, temperature, pH, **turbidity (NTU)**, do

## Key Outputs
- Histograms & correlations in **/images/**
- Interim clean CSVs in **/data/interim/**:
  - wqd_clean.csv
  - mon_clean.csv

## Correlations (Spearman)
- WQD: {wqd_corr_note}
- Monteria: {mon_corr_note}

## Outliers / Range Notes
- {issues_line}

## Next Steps
1) Finalize **cleaning rules** (clip/winsorize, NA handling) per variable using this EDA.
2) Lock **feature space** for Virtual Sensors: inputs = [temperature, pH, turbidity_proxy].
3) Proceed to **01_soft_sensors_DO_NH3.ipynb** to train Virtual DO & NH3 with robust CV.
""".strip()

# Preview only the first 1000 characters; the full text is written to disk later.
print(summary_md[:1000] + "\n...\n")


# EDA Summary (00_data_understanding_eda)

## Datasets
- **WQD.xlsx** raw: (4300, 15), canonical: (4300, 7)
- **Monteria_Aquaculture_Data.xlsx** raw: (4345, 7), canonical: (4345, 7)

## Canonicalization
- **WQD**: temperature, pH, **turbidity_cm** → **turbidity_proxy = 1/cm**, do, ammonia, water_quality
- **Monteria**: timestamp, temperature, pH, **turbidity (NTU)**, do

## Key Outputs
- Histograms & correlations in **/images/**
- Interim clean CSVs in **/data/interim/**:
  - wqd_clean.csv
  - mon_clean.csv

## Correlations (Spearman)
- WQD: available
- Monteria: available

## Outliers / Range Notes
- WQD: 2.33% of 'temperature' values outside [0,40].; WQD: 4.42% of 'pH' values outside [5.0,9.5].; WQD: temperature has 2.33% robust-z outliers (>3.5).; WQD: pH has 0.70% robust-z outliers (>3.5).; WQD: turbidity_proxy has 1.26% robust-z outliers (>3.5).; WQD: do has 1.26% robust-z outliers (>3.5).; WQD: ammonia has 4.47% robust-z outliers (>3.5).; Monteria: temperature has 0.05% robust-z 

Cell 15 — Write EDA summary to /reports/eda_summary.md

In [15]:
# Write the report as UTF-8 explicitly. The summary contains non-ASCII text
# (the "→" arrow), and without an encoding argument Path.write_text uses the
# platform's locale encoding, which may not be able to represent it.
(REPORTS / "eda_summary.md").write_text(summary_md, encoding="utf-8")
print("Wrote:", REPORTS / "eda_summary.md")

Wrote: C:\Users\PC\Documents\Machine_Learning\Capstone_Project\Poseidon\reports\eda_summary.md
