In [None]:
# eda_temp_humidity.py
# Step-by-step EDA for iot_temp_humidity.csv
# Requires: pandas, numpy, matplotlib
# Run: python eda_temp_humidity.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Input file read by main(). This is a relative path, so it is resolved against
# the current working directory: run the script from the folder that holds the
# CSV, or change this value to point at the file.
CSV_PATH = "iot_temp_humidity.csv"  # put CSV in same folder or change path

def _show_line_plot(x_values, y_values, title, y_label):
    """Display one time-series line plot with point markers."""
    plt.figure(figsize=(10, 3))
    plt.plot(x_values, y_values, marker="o", linestyle="-")
    plt.title(title)
    plt.xlabel("time")
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()


def _show_histogram(values, title, x_label):
    """Display an 8-bin histogram of the non-missing values."""
    plt.figure(figsize=(6, 3))
    # Drop NaNs explicitly so histograms and boxplots treat gaps the same way.
    plt.hist(values.dropna(), bins=8)
    plt.title(title)
    plt.xlabel(x_label)
    plt.tight_layout()
    plt.show()


def _show_boxplot(values, title):
    """Display a horizontal boxplot of the non-missing values."""
    plt.figure(figsize=(6, 2.5))
    plt.boxplot(values.dropna(), vert=False)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def _rolling_zscore(series, window=6):
    """Return each value's z-score relative to the `window` readings before it.

    The baseline mean/std deliberately exclude the reading being scored. If the
    reading is part of its own window of n samples, |z| is bounded by
    (n - 1) / sqrt(n) (about 2.04 for n = 6), so a "|z| > 3" rule could never
    fire. Rows whose baseline has fewer than two readings, or zero spread, get
    NaN (z-score undefined) instead of an infinite or arbitrary value.
    """
    previous = series.shift(1)
    baseline = previous.rolling(window=window, min_periods=2)
    baseline_mean = baseline.mean()
    baseline_std = baseline.std()
    # A zero std would turn any deviation into +/-inf; treat it as undefined.
    baseline_std = baseline_std.where(baseline_std > 0)
    return (series - baseline_mean) / baseline_std


def main():
    """Run the step-by-step EDA on the CSV at CSV_PATH.

    Prints a preview, dtypes, summary statistics, missing-value and duplicate
    counts, a temperature/humidity correlation matrix, daily aggregates and a
    numeric summary; shows line plots, histograms, boxplots and a 3-reading
    moving-average plot; flags temperature anomalies (fixed thresholds or
    rolling z-score) and writes two files to the current working directory:
    ``temp_hum_anomalies.csv`` (only when anomalies exist) and
    ``temp_hum_eda_annotated.csv``.

    Raises:
        FileNotFoundError: if CSV_PATH does not exist.
        KeyError: if the CSV lacks a ``temperature`` or ``humidity`` column.
    """
    # Explicit raise rather than `assert`: asserts are stripped under `python -O`.
    if not os.path.exists(CSV_PATH):
        raise FileNotFoundError(
            f"{CSV_PATH} not found. Place CSV here or change CSV_PATH."
        )

    # Temperature bounds (same unit as the plots, °C) used by the summary
    # counts and by the threshold-based anomaly flag.
    temp_low = 22
    temp_high = 28

    # 1. Read
    df = pd.read_csv(CSV_PATH)
    print(">>> 1) Preview (top 5 rows):")
    print(df.head(), "\n")

    # 2. Basic info
    print(">>> 2) dtypes & non-null counts:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None").
    df.info()
    print()

    # 3. Summary statistics
    print(">>> 3) Summary statistics:")
    print(df.describe(), "\n")

    # 4. Missing values
    missing = df.isna().sum()
    missing_pct = (missing / len(df)) * 100
    print(">>> 4) Missing values (count and %):")
    print(pd.concat([missing.rename("missing_count"), missing_pct.rename("missing_pct")], axis=1), "\n")

    # 5. Duplicates
    print(">>> 5) Duplicate rows:", df.duplicated().sum(), "\n")

    # 6. Convert time to datetime and set index
    if "time" in df.columns:
        df["time"] = pd.to_datetime(df["time"])
        df = df.set_index("time").sort_index()
        print(">>> 6) Converted 'time' to datetime and set as index.\n")
    else:
        print(">>> 6) No 'time' column found — proceeding without time index.\n")

    # Everything below needs both measurement columns; fail with a clear
    # message here instead of a bare KeyError halfway through the plots.
    absent = [name for name in ("temperature", "humidity") if name not in df.columns]
    if absent:
        raise KeyError(f"{CSV_PATH} is missing required column(s): {absent}")

    temperature = df["temperature"]
    humidity = df["humidity"]

    # 7. Plots: time-series
    _show_line_plot(df.index, temperature, "Temperature over time", "°C")
    _show_line_plot(df.index, humidity, "Humidity over time", "%")

    # 8. Histograms
    _show_histogram(temperature, "Temperature distribution", "°C")
    _show_histogram(humidity, "Humidity distribution", "%")

    # 9. Boxplots
    _show_boxplot(temperature, "Temperature boxplot")
    _show_boxplot(humidity, "Humidity boxplot")

    # 10. Rolling average (3-reading)
    df["temp_ma_3"] = temperature.rolling(window=3, min_periods=1).mean()
    plt.figure(figsize=(10, 3))
    plt.plot(df.index, temperature, label="temperature")
    plt.plot(df.index, df["temp_ma_3"], label="3-reading MA")
    plt.title("Temperature and 3-reading moving average")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # 11. Correlation
    corr = df[["temperature", "humidity"]].corr()
    print(">>> 11) Correlation matrix (temperature vs humidity):")
    print(corr, "\n")

    # 12. Resample daily (mean/min/max)
    if not isinstance(df.index, pd.DatetimeIndex):
        print(">>> 12) Resample failed (no datetime index). Skipping.\n")
    else:
        try:
            # Aggregate numeric columns only; mean/min/max are not meaningful
            # for text columns and can raise on them.
            daily = df.select_dtypes(include="number").resample("D").agg(["mean", "min", "max"])
        except (TypeError, ValueError) as exc:
            # Best-effort step: report the actual reason and carry on.
            print(f">>> 12) Resample failed ({exc}). Skipping.\n")
        else:
            print(">>> 12) Daily aggregated stats:")
            print(daily, "\n")

    # 13. Quick numeric summary & basic anomaly flags
    too_hot = temperature > temp_high
    too_cold = temperature < temp_low
    summary = {
        "n_rows": len(df),
        "temp_mean": temperature.mean(),
        "temp_std": temperature.std(),
        "temp_min": temperature.min(),
        "temp_max": temperature.max(),
        "hum_mean": humidity.mean(),
        "hum_std": humidity.std(),
        "hum_min": humidity.min(),
        "hum_max": humidity.max(),
        "temp_above_28_count": int(too_hot.sum()),
        "temp_below_22_count": int(too_cold.sum()),
    }
    print(">>> 13) Quick numeric summary:")
    for key, value in summary.items():
        print(f"{key}: {value}")
    print()

    # 14. Simple anomaly detection (threshold and rolling z-score)
    # threshold-based
    df["anomaly_threshold"] = too_hot | too_cold

    # rolling z-score against the previous 6 readings; NaN scores (not enough
    # history, or a flat baseline) compare False and are therefore not flagged.
    df["temp_zscore"] = _rolling_zscore(temperature, window=6)
    df["anomaly_zscore"] = df["temp_zscore"].abs() > 3

    anomalies = df[df["anomaly_threshold"] | df["anomaly_zscore"]]
    print(">>> 14) Anomalies detected (threshold or zscore):", len(anomalies))
    if not anomalies.empty:
        print(anomalies[["temperature", "humidity", "anomaly_threshold", "temp_zscore", "anomaly_zscore"]])
        # Save anomalies
        anomalies.reset_index().to_csv("temp_hum_anomalies.csv", index=False)
        print("Saved anomalies to temp_hum_anomalies.csv\n")
    else:
        print("No anomalies found.\n")

    # 15. Save cleaned/annotated file
    df.reset_index().to_csv("temp_hum_eda_annotated.csv", index=False)
    print("Annotated EDA results saved to temp_hum_eda_annotated.csv")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
