# Traditional Time-Series Forecasting

This notebook implements a traditional time-series prediction baseline for Smart Home Air Monitoring.

**Goal:** Use the previous 30 seconds of multivariate sensor readings (R1–R8, Temp, Humidity) to predict the R4 sensor value 10 seconds ahead.

We frame forecasting as a supervised learning problem using sliding windows (lag features), then train and evaluate:
- Linear Regression (baseline)
- Ridge Regression (regularized baseline)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
def in_colab():
    """Return True when the notebook appears to be running in Google Colab.

    Two signals are checked: the ``COLAB_GPU`` environment variable and
    whether the ``google.colab`` module has already been imported.
    """
    if "COLAB_GPU" in os.environ:
        return True
    return "google.colab" in sys.modules

# Resolve the repository root and data directory, then make the repo root the
# working directory so that relative paths behave the same in every environment.
if in_colab():
    # Running in Google Colab: the repo is expected at a fixed location.
    repo_path = "/content/MSAAI_530_FinalProject"
    data_path = "/content/MSAAI_530_FinalProject/data"

else:
    # Running locally (e.g. VS Code).
    # This cell ends with os.chdir(repo_path), so on a re-run the CWD is already
    # the repo root; the same is true when the kernel was started from the root.
    # Only step up one level when the CWD does not itself contain data/,
    # otherwise every re-run would climb one directory further.
    cwd = os.getcwd()
    if os.path.isdir(os.path.join(cwd, "data")):
        repo_path = cwd
    else:
        repo_path = os.path.abspath(os.path.join(cwd, ".."))
    data_path = os.path.abspath(os.path.join(repo_path, "data"))

    # Add repo root to Python path so project modules can be imported
    if repo_path not in sys.path:
        sys.path.append(repo_path)

# Set working directory to the repo root (both environments)
os.chdir(repo_path)

print("Using repo path:", repo_path)
print("Using data path:", data_path)
print("CWD:", os.getcwd())

In [None]:
# Read the cleaned dataset and order it chronologically within each recording.
dataset_csv = os.path.join(data_path, "Cleaned_HT_Sensor_Dataset.csv")

df = pd.read_csv(dataset_csv, delimiter=",")
df = df.sort_values(["id", "time"])
df = df.reset_index(drop=True)  # fresh 0..n-1 index after sorting

# Columns used as model inputs, and the column being forecast.
SENSOR_COLS = ["R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "Temp", "Humidity"]
TARGET = "R4"

# Time-series setup. Both values are counted in rows; they equal seconds only
# if the data holds one row per second -- TODO confirm against the dataset.
WINDOW = 30   # length of the lag window fed to the model
HORIZON = 10  # how far ahead of the window the target lies

print("Setup complete.")

In [None]:
# Fill missing sensor values within each recording (never across recordings).
# The built-in GroupBy.ffill()/bfill() return frames aligned to df's own index,
# so no index manipulation is needed and no Python lambda runs per group.
# Forward fill first; back fill then covers gaps at the start of a recording.
df[SENSOR_COLS] = df.groupby("id")[SENSOR_COLS].ffill()
df[SENSOR_COLS] = df.groupby("id")[SENSOR_COLS].bfill()

# Confirm no missing values remain for modeled columns
print("Dataset shape:", df.shape)
print("Missing values per column:")
print(df[SENSOR_COLS].isna().sum())

## 1. Convert time series to supervised learning (sliding windows)

For each recording id, we create many training examples:
- Input (X): the sensor readings from the past `WINDOW` seconds, flattened into a single feature vector
- Target (y): the target sensor value `HORIZON` seconds into the future

This is the standard way to use traditional ML models for forecasting.


In [None]:
def make_windows(group: pd.DataFrame, feature_cols, target_col, window=30, horizon=10):
    """Create supervised learning samples from one recording (one id).

    Each sample pairs ``window`` consecutive rows of ``feature_cols`` with the
    value of ``target_col`` found ``horizon`` rows after the last input row.
    Rows are used in the order given; the caller is responsible for sorting
    the recording by time beforehand.

    Args:
        group: Rows of a single recording; must contain a ``"time"`` column.
        feature_cols: Column label(s) used as model inputs.
        target_col: Column label of the value to predict.
        window: Number of consecutive rows in each input sample (>= 1).
        horizon: Offset in rows from the last input row to the target (>= 1).

    Returns:
        Tuple ``(X, y, t)``:
            X: [num_samples, window * num_features], each window flattened
               row by row (oldest row first).
            y: [num_samples,] target values.
            t: [num_samples,] values of the ``"time"`` column at the target
               rows (for plotting).
        When the recording has fewer than ``window + horizon`` rows, the
        arrays are empty but keep these shapes (X is ``(0, window * num_features)``).

    Raises:
        ValueError: If ``window`` or ``horizon`` is smaller than 1. Such values
            would put the target inside (or before) its own input window.
    """
    if window < 1 or horizon < 1:
        raise ValueError(
            f"window and horizon must be >= 1 (got window={window}, horizon={horizon})"
        )

    Xdata = group[feature_cols].to_numpy()
    ydata = group[target_col].to_numpy()
    times = group["time"].to_numpy()

    # A single column label yields a 1-D array; treat it as one feature.
    n_features = Xdata.shape[1] if Xdata.ndim > 1 else 1
    # Clamp at zero so short recordings give empty, correctly shaped outputs.
    n_samples = max(len(group) - window - horizon + 1, 0)

    starts = np.arange(n_samples)
    # Sample i reads rows i .. i+window-1 and targets row i+window+horizon-1.
    target_idx = starts + (window + horizon - 1)
    row_idx = starts[:, None] + np.arange(window)  # [num_samples, window]

    # Integer-array indexing gathers every window at once; the reshape
    # flattens each window in row-major order (time first, then feature).
    X = Xdata[row_idx].reshape(n_samples, window * n_features)

    return X, ydata[target_idx], times[target_idx]

print("Window function ready.")

## 2. Train/Test split (time-aware)

To avoid data leakage, we split within each recording in time order:
- First 70% of windows -> training
- Last 30% of windows -> testing

This mimics real forecasting (training on earlier behavior, testing on later behavior).


In [None]:
# Split settings (previously inline literals)
MIN_WINDOWS = 50      # recordings that yield fewer windows than this are skipped
TRAIN_FRACTION = 0.7  # chronologically first share of each recording's windows

X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []
meta_test = []  # (id, time) of every test target, for plotting

for gid, g in df.groupby("id"):
    g = g.reset_index(drop=True)
    X, y, t = make_windows(g, SENSOR_COLS, TARGET, window=WINDOW, horizon=HORIZON)

    # Skip very short recordings
    if len(X) < MIN_WINDOWS:
        continue

    # Windows are in time order, so slicing at `split` keeps the earlier
    # windows for training and the later ones for testing.
    # NOTE(review): consecutive windows share input rows, so samples right at
    # the boundary are not independent; consider leaving a gap if that matters.
    split = int(len(X) * TRAIN_FRACTION)
    X_train_list.append(X[:split])
    y_train_list.append(y[:split])

    X_test_list.append(X[split:])
    y_test_list.append(y[split:])
    meta_test.extend((gid, ti) for ti in t[split:])

# np.vstack / np.concatenate fail with an unhelpful message on an empty list,
# so report the real cause when no recording was long enough.
if not X_train_list:
    raise ValueError(
        f"No recording produced at least {MIN_WINDOWS} windows "
        f"(WINDOW={WINDOW}, HORIZON={HORIZON}); nothing to train on."
    )

X_train = np.vstack(X_train_list)
y_train = np.concatenate(y_train_list)
X_test = np.vstack(X_test_list)
y_test = np.concatenate(y_test_list)

print("Train X:", X_train.shape, "Train y:", y_train.shape)
print("Test  X:", X_test.shape,  "Test  y:", y_test.shape)

## 3. Train traditional forecasting models

We train two lightweight regression baselines:
1. **Linear Regression**: simplest interpretable baseline
2. **Ridge Regression**: linear regression with L2 regularization (helps reduce overfitting)


In [None]:
def _scaled_pipeline(estimator):
    """Wrap *estimator* in a pipeline that standardizes the features first."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimator),
    ])


# Candidate models, keyed by the label used in the metrics table and plots.
models = {
    "LinearRegression": _scaled_pipeline(LinearRegression()),
    "Ridge(alpha=1.0)": _scaled_pipeline(Ridge(alpha=1.0)),
}

preds = {}    # model label -> predictions on the test set
metrics = []  # one row of error metrics per model

for name, model in models.items():
    # Fit on the training windows, then score on the held-out windows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    preds[name] = y_pred

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    metrics.append({"Model": name, "MAE": mae, "RMSE": rmse})

# Lowest RMSE first; the bare expression displays the table in the notebook.
metrics_df = pd.DataFrame(metrics).sort_values("RMSE")
metrics_df

## 4. Plot predicted vs actual (one recording)

We plot predictions on a single test recording to show how well the model tracks the sensor trend.


In [None]:
# Line up every test target with its recording id, timestamp and predictions.
meta = pd.DataFrame(meta_test, columns=["id", "time"]).assign(
    actual=y_test,
    pred_linreg=preds["LinearRegression"],
    pred_ridge=preds["Ridge(alpha=1.0)"],
)

# Plot the recording that contributes the most test samples (first 600 points).
chosen_id = meta["id"].value_counts().idxmax()
sub = meta.loc[meta["id"] == chosen_id].sort_values("time").head(600)

# (column in `sub`, legend label) for each curve, in drawing order
curves = (
    ("actual", "Actual R4"),
    ("pred_linreg", "Predicted (LinearRegression)"),
    ("pred_ridge", "Predicted (Ridge)"),
)

plt.figure()
for column, label in curves:
    plt.plot(sub["time"], sub[column], label=label)
# NOTE(review): the axis label says hours while the title counts seconds;
# confirm the unit of the "time" column.
plt.xlabel("Time (hours from induction start)")
plt.ylabel("R4 sensor resistance")
plt.title(f"{HORIZON}-second-ahead forecast for R4 (Recording id={chosen_id})")
plt.legend()
plt.tight_layout()
# To export the figure, enable the next line (before plt.show()):
# plt.savefig("r4_forecast_plot.png", dpi=200)
plt.show()

#print("Saved plot: r4_forecast_plot.png")

## 5. Real-time feasibility (IoT deployment note)

Because linear models are computationally lightweight, this approach is realistic for edge/gateway deployment:
- The device only keeps the most recent `WINDOW` seconds of readings in memory.
- Each prediction is a fast matrix operation (milliseconds on typical hardware).
- Results can be sent to the cloud/mobile app for visualization and alerting.
