In [8]:
import os
import math
import json
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score
from scipy.special import expit  # logistic transform
from statsmodels.tsa.statespace.sarimax import SARIMAX

# ---------- Config (user-editable) ----------
DATA_PATH = "/content/ABCB.csv"   # your uploaded file (use this exact path)
RESULTS_DIR = "./results_htm"
TARGET_COL = "Close"               # change if you prefer another column (e.g., "Adjusted Close")
FEATURES = ["Open", "High", "Low", "Close", "Volume"]  # at least 5 interdependent features; adjust if needed
TEST_FRACTION = 0.2
RANDOM_SEED = 42
EPOCHS = 5                        # number of epochs (full passes over training data) for HTM
SP_COLUMNS = 1024
SP_ACTIVE_PER_INH = 40
TM_CELLS_PER_COL = 16
ENCODER_N = 100
ENCODER_W = 21
SARIMAX_ORDER = (3,0,2)           # baseline ARIMA order
# --------------------------------------------

os.makedirs(RESULTS_DIR, exist_ok=True)
np.random.seed(RANDOM_SEED)

# --- Load data ----------------------------------------------------------------
df = pd.read_csv(DATA_PATH)
print("Loaded data shape:", df.shape)
print("Columns:", df.columns.tolist())

# Validate feature columns exist
for c in FEATURES:
    if c not in df.columns:
        raise ValueError(f"Feature '{c}' not found in dataset. Available cols: {df.columns.tolist()}")

if TARGET_COL not in FEATURES:
    FEATURES = [c for c in FEATURES if c != TARGET_COL] + [TARGET_COL]

# Keep only numeric features (drop Date)
df_numeric = df[FEATURES].copy().apply(pd.to_numeric, errors='coerce')
df_numeric = df_numeric.fillna(method='ffill').fillna(method='bfill').fillna(0.0)
n = len(df_numeric)
print("Using numeric dataframe shape:", df_numeric.shape)

# Train/test split (by index/time order)
split_idx = int((1 - TEST_FRACTION) * n)
train_df = df_numeric.iloc[:split_idx].reset_index(drop=True)
test_df = df_numeric.iloc[split_idx:].reset_index(drop=True)

# Scale features for encoder stability (we will keep original target for evaluation)
scaler = StandardScaler()
train_scaled = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
test_scaled = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)

# --- HTM imports with fallbacks ------------------------------------------------
HTM_AVAILABLE = True
try:
    from htm.bindings.sdr import SDR
    from htm.algorithms.anomaly_likelihood import AnomalyLikelihood
    # Try actual bindings for SP/TM and encoders (names differ by version)
    try:
        from htm.bindings.algorithms import SpatialPooler as SpatialPoolerBinding
        from htm.bindings.algorithms import TemporalMemory as TemporalMemoryBinding
    except Exception:
        # some versions
        from htm.bindings.algorithms import SP as SpatialPoolerBinding
        from htm.bindings.algorithms import TM as TemporalMemoryBinding
    try:
        from htm.bindings.encoders import ScalarEncoder as ScalarEncoderBinding
    except Exception:
        from htm.bindings.encoders import ScalarEncoder as ScalarEncoderBinding
    print("htm.core bindings found; using native HTM implementations.")
except Exception:
    HTM_AVAILABLE = False
    print("htm.core not found. Using lightweight HTM-like fallbacks (script will still run).")

    # Minimal SDR fallback
    class SDR:
        def __init__(self, size):
            self.size = size
            self.bits = np.zeros(size, dtype=np.int32)
        def setDense(self, arr):
            arr = np.asarray(arr)
            if arr.size != self.size:
                raise ValueError("SDR.setDense size mismatch")
            self.bits = (arr != 0).astype(np.int32)
        def dense(self):
            return self.bits.copy()

    # Minimal ScalarEncoder fallback
    class ScalarEncoderBinding:
        def __init__(self, n=ENCODER_N, w=ENCODER_W, minval=0.0, maxval=1.0):
            self.n = int(n)
            self.w = int(w)
            if self.n <= self.w:
                raise ValueError("Encoder n must be > w")
            self.minval = float(minval)
            self.maxval = float(maxval)
            self.buckets = np.linspace(self.minval, self.maxval, self.n - self.w + 1)
        def encode(self, value):
            v = float(value)
            pos = np.searchsorted(self.buckets, v)
            out = np.zeros(self.n, dtype=np.int32)
            start = max(0, min(self.n - self.w, pos))
            out[start:start + self.w] = 1
            return out
        def getWidth(self):
            return self.n

    # Very simple SpatialPooler fallback
    class SpatialPoolerBinding:
        def __init__(self, inputDimensions, columnDimensions, potentialPct=0.85,
                     globalInhibition=True, numActiveColumnsPerInhArea=40):
            self.inputDimensions = inputDimensions
            self.columnDimensions = columnDimensions
            self.numActiveColumnsPerInhArea = int(numActiveColumnsPerInhArea)
            self.columns = columnDimensions[0]
        def compute(self, input_sdr, learn, activeArray):
            # naive hashing-based selection of active columns
            arr = input_sdr.dense() if hasattr(input_sdr, "dense") else np.asarray(input_sdr)
            rng = np.random.RandomState(int(arr.sum()) + 1)
            idx = rng.choice(self.columns, self.numActiveColumnsPerInhArea, replace=False)
            activeArray[:] = 0
            activeArray[idx] = 1
        def getColumnDimensions(self):
            return (self.columns,)

    # Very simple TemporalMemory fallback
    class TemporalMemoryBinding:
        def __init__(self, columnDimensions, cellsPerColumn=TM_CELLS_PER_COL,
                     activationThreshold=12, initialPermanence=0.21, connectedPermanence=0.5):
            self.columns = columnDimensions[0]
            self.cellsPerColumn = cellsPerColumn
            self.active_cells = set()
        def compute(self, activeColumns, learn=True):
            cols = np.where(np.asarray(activeColumns) > 0)[0]
            self.active_cells = set([int(c)*self.cellsPerColumn for c in cols])
        def getActiveCells(self):
            return np.array(sorted(self.active_cells))
        def getMaxPermutationCount(self):
            return max(1, len(self.active_cells))
    # Minimal anomaly likelihood stub
    class AnomalyLikelihood:
        def __init__(self):
            pass
        def anomalyLikelihood(self, score, timestamp=None):
            # map to 0..1 via logistic normalization
            return float(expit(score / (1.0 + abs(score))))

# --- Build encoders ----------------------------------------------------------
encoders = {}
for col in train_scaled.columns:
    vmin = float(min(train_scaled[col].min(), test_scaled[col].min()))
    vmax = float(max(train_scaled[col].max(), test_scaled[col].max()))
    enc = ScalarEncoderBinding(n=ENCODER_N, w=ENCODER_W, minval=vmin, maxval=vmax)
    encoders[col] = enc

# compute input_dim
try:
    input_dim = sum(enc.getWidth() for enc in encoders.values())
except Exception:
    input_dim = sum(getattr(enc, "n", ENCODER_N) for enc in encoders.values())

print("Input SDR dimension:", input_dim)

# instantiate SDR and HTM components
sdr = SDR(input_dim)
sp = SpatialPoolerBinding(inputDimensions=(input_dim,), columnDimensions=(SP_COLUMNS,),
                          potentialPct=0.85, globalInhibition=True, numActiveColumnsPerInhArea=SP_ACTIVE_PER_INH)
tm = TemporalMemoryBinding(columnDimensions=(SP_COLUMNS,), cellsPerColumn=TM_CELLS_PER_COL)

# Anomaly likelihood helper (if binding available)
try:
    anomaly_lik = AnomalyLikelihood()
except Exception:
    anomaly_lik = AnomalyLikelihood()

# associative predictor: map active columns bitmask -> list(next_target_values)
assoc_predictor = {}

def encode_row_to_sdr(row_scaled):
    parts = []
    for name in train_scaled.columns:
        val = float(row_scaled[name])
        parts.append(np.asarray(encoders[name].encode(val), dtype=np.int32))
    arr = np.concatenate(parts).astype(np.int32)
    if arr.size != input_dim:
        if arr.size < input_dim:
            pad = np.zeros(input_dim - arr.size, dtype=np.int32)
            arr = np.concatenate([arr, pad])
        else:
            arr = arr[:input_dim]
    return arr

def active_cols_key(active_array):
    arr = np.asarray(active_array)
    # pack bits to bytes to make a hashable key
    packed = np.packbits((arr > 0).astype(np.uint8))
    return packed.tobytes()

# --- HTM training loop with epochs ------------------------------------------
train_len = len(train_scaled)
print(f"Training size: {train_len}, Test size: {len(test_scaled)}, epochs: {EPOCHS}")

htm_train_preds = None
start_time = time.time()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS} ...")
    epoch_preds = []
    for t in range(train_len - 1):
        row = train_scaled.iloc[t]
        next_true_orig = train_df[TARGET_COL].iloc[t + 1]  # original scale next-step
        enc_vec = encode_row_to_sdr(row)
        sdr.setDense(enc_vec)

        # compute SP active columns (output array)
        active_cols = np.zeros(SP_COLUMNS, dtype=np.int32)
        try:
            sp.compute(sdr, True, active_cols)
        except Exception:
            sp.compute(enc_vec, True, active_cols)

        # compute TM
        try:
            tm.compute(active_cols, learn=True)
        except Exception:
            tm.compute(active_cols)

        # predictor via associative map
        key = active_cols_key(active_cols)
        if key in assoc_predictor and len(assoc_predictor[key]) > 0:
            pred = float(np.mean(assoc_predictor[key]))
        else:
            # fallback: persistence on the target (previous value)
            pred = float(train_df[TARGET_COL].iloc[t])

        epoch_preds.append(pred)
        # update associative predictor with the observed next target (original scale)
        assoc_predictor.setdefault(key, []).append(float(next_true_orig))

    # compute epoch RMSE on training portion (aligned)
    train_truth_aligned = train_df[TARGET_COL].iloc[1:len(epoch_preds)+1].values
    epoch_rmse = math.sqrt(mean_squared_error(train_truth_aligned, epoch_preds))
    print(f"  epoch {epoch+1} RMSE (train online): {epoch_rmse:.6f}")

    # keep last epoch preds for reporting
    htm_train_preds = epoch_preds

train_time = time.time() - start_time
print(f"HTM epochs finished in {train_time:.1f}s")

# --- Produce HTM predictions on test set (online continutation) --------------
# Prime with last train row
last_train_row = train_scaled.iloc[-1]
sdr.setDense(encode_row_to_sdr(last_train_row))
_dummy_active = np.zeros(SP_COLUMNS, dtype=np.int32)
try:
    sp.compute(sdr, False, _dummy_active)
except Exception:
    pass

test_len = len(test_scaled)
htm_test_preds = []
htm_htm_anomaly_scores = []
htm_anomaly_likelihoods = []

for t in range(test_len - 1):
    row = test_scaled.iloc[t]
    true_next = float(test_df[TARGET_COL].iloc[t + 1])

    enc_vec = encode_row_to_sdr(row)
    sdr.setDense(enc_vec)

    active_cols = np.zeros(SP_COLUMNS, dtype=np.int32)
    try:
        sp.compute(sdr, False, active_cols)
    except Exception:
        sp.compute(enc_vec, False, active_cols)

    try:
        tm.compute(active_cols, learn=False)
    except Exception:
        tm.compute(active_cols)

    key = active_cols_key(active_cols)
    if key in assoc_predictor and len(assoc_predictor[key]) > 0:
        pred_next = float(np.mean(assoc_predictor[key]))
    else:
        pred_next = float(test_df[TARGET_COL].iloc[t])  # persistence fallback

    htm_test_preds.append(pred_next)

    # compute HTM intrinsic anomaly score (fallback if TM doesn't provide numeric score)
    try:
        active_cells = tm.getActiveCells()
        anomaly_score = 1.0 - (len(active_cells) / max(1.0, tm.getMaxPermutationCount()))
    except Exception:
        # fallback to error-based raw score
        anomaly_score = abs(true_next - pred_next)

    # anomaly likelihood
    try:
        a_like = anomaly_lik.anomalyLikelihood(anomaly_score)
    except Exception:
        try:
            a_like = anomaly_lik.anomalyLikelihood(anomaly_score, timestamp=None)
        except Exception:
            a_like = float(expit(-anomaly_score))  # fallback mapping

    htm_htm_anomaly_scores.append(anomaly_score)
    htm_anomaly_likelihoods.append(a_like)

# Align arrays for metric computation (we have predictions t->t+1 for indices 0..test_len-2)
test_truth_aligned = test_df[TARGET_COL].iloc[1:len(htm_test_preds)+1].values
htm_test_rmse = math.sqrt(mean_squared_error(test_truth_aligned, htm_test_preds))
htm_test_mae = np.mean(np.abs(test_truth_aligned - np.array(htm_test_preds)))
print(f"HTM test RMSE: {htm_test_rmse:.6f}, MAE: {htm_test_mae:.6f}")

# --- SARIMAX baseline (fit on train original-scale target series) -----------
train_series = train_df[TARGET_COL].reset_index(drop=True)
test_series = test_df[TARGET_COL].reset_index(drop=True)

sarima_model = SARIMAX(train_series, order=SARIMAX_ORDER, enforce_stationarity=False, enforce_invertibility=False)
sarima_res = sarima_model.fit(disp=False)
# Forecast for full test range
sarima_forecast_res = sarima_res.get_forecast(steps=len(test_series))
sarima_mean = sarima_forecast_res.predicted_mean.values  # length = len(test_series)
sarima_ci = sarima_forecast_res.conf_int(alpha=0.05)    # DataFrame with lower/upper columns

# Align SARIMAX one-step alignment as we used for HTM (drop last pred)
sarima_test_preds_aligned = sarima_mean[:-1] if len(sarima_mean) > 1 else sarima_mean
sarima_truth_aligned = test_series.iloc[1:].values if len(test_series) > 1 else test_series.values

try:
    sarima_rmse = math.sqrt(mean_squared_error(sarima_truth_aligned, sarima_test_preds_aligned))
    sarima_mae = np.mean(np.abs(sarima_truth_aligned - sarima_test_preds_aligned))
except Exception:
    sarima_rmse = float('nan')
    sarima_mae = float('nan')

print(f"SARIMAX test RMSE (aligned): {sarima_rmse:.6f}, MAE: {sarima_mae:.6f}")

# Convert SARIMAX CI width to confidence-probability (smaller CI -> higher confidence)
sarima_ci_width = (sarima_ci.iloc[:,1] - sarima_ci.iloc[:,0]).values
# Align sarima_ci_width to aligned preds (drop last)
if len(sarima_ci_width) > 1:
    sarima_ci_width_aligned = sarima_ci_width[:-1]
else:
    sarima_ci_width_aligned = sarima_ci_width
# map width -> [0,1] probability via negative logistic
sarima_conf_prob = expit(- (sarima_ci_width_aligned / (np.std(sarima_ci_width_aligned) + 1e-8)))

# --- Error-based anomaly probabilities (for HTM & SARIMAX) -------------------
htm_errors = np.abs(test_truth_aligned - np.array(htm_test_preds))
# logistic probability where larger error -> higher anomaly probability
htm_error_prob = expit((htm_errors - np.mean(htm_errors)) / (np.std(htm_errors) + 1e-8))

if len(sarima_test_preds_aligned) == len(sarima_truth_aligned):
    sarima_errors = np.abs(sarima_truth_aligned - sarima_test_preds_aligned)
    sarima_error_prob = expit((sarima_errors - np.mean(sarima_errors)) / (np.std(sarima_errors) + 1e-8))
else:
    sarima_error_prob = np.array([np.nan]*len(sarima_conf_prob))

# --- Anomaly detection metrics using a threshold on error_prob (example 0.9) ---
threshold = 0.9
htm_pred_anoms = (htm_error_prob > threshold).astype(int)
# define true anomalies as points where error > mean + 3*std (3-sigma rule)
true_anoms = (htm_errors > (np.mean(htm_errors) + 3 * np.std(htm_errors))).astype(int)

if true_anoms.sum() > 0:
    htm_prec = precision_score(true_anoms, htm_pred_anoms, zero_division=0)
    htm_rec = recall_score(true_anoms, htm_pred_anoms, zero_division=0)
else:
    htm_prec = float('nan')
    htm_rec = float('nan')

# For sarima, align shapes before computing
if len(sarima_error_prob) == len(true_anoms):
    sarima_pred_anoms = (sarima_error_prob > threshold).astype(int)
    if true_anoms.sum() > 0:
        sarima_prec = precision_score(true_anoms, sarima_pred_anoms, zero_division=0)
        sarima_rec = recall_score(true_anoms, sarima_pred_anoms, zero_division=0)
    else:
        sarima_prec = float('nan')
        sarima_rec = float('nan')
else:
    sarima_prec = sarima_rec = float('nan')

# --- Save results to CSV ----------------------------------------------------
results_df = pd.DataFrame({
    "actual_next": test_truth_aligned,
    "htm_pred": np.array(htm_test_preds),
    "htm_error_abs": htm_errors,
    "htm_error_prob": htm_error_prob,
    "htm_anomaly_likelihood": np.array(htm_anomaly_likelihoods),
})

# add sarima columns when available/aligned
if len(sarima_test_preds_aligned) == len(sarima_truth_aligned):
    results_df["sarima_pred"] = sarima_test_preds_aligned
    results_df["sarima_error_prob"] = sarima_error_prob
    results_df["sarima_conf_prob"] = sarima_conf_prob
else:
    # pad with NaNs if mismatch
    results_df["sarima_pred"] = np.nan
    results_df["sarima_error_prob"] = np.nan
    results_df["sarima_conf_prob"] = np.nan

csv_path = os.path.join(RESULTS_DIR, "probabilities_and_predictions.csv")
results_df.to_csv(csv_path, index=False)
print("Saved results CSV to:", csv_path)

# --- Save model snapshot (associative predictor + hyperparams) -------------
snapshot = {
    "assoc_predictor_sample_count": len(assoc_predictor),
    "assoc_predictor_sample_keys": [k.hex() for k in list(assoc_predictor.keys())[:10]], # Convert bytes to hex string
    "hyperparameters": {
        "SP_COLUMNS": SP_COLUMNS,
        "SP_ACTIVE_PER_INH": SP_ACTIVE_PER_INH,
        "TM_CELLS_PER_COL": TM_CELLS_PER_COL,
        "EPOCHS": EPOCHS,
        "ENCODER_N": ENCODER_N, "ENCODER_W": ENCODER_W
    },
    "train_time_seconds": train_time,
    "htm_train_rmse_last_epoch": epoch_rmse
}
with open(os.path.join(RESULTS_DIR, "htm_snapshot.pkl"), "wb") as f:
    pickle.dump(snapshot, f)
with open(os.path.join(RESULTS_DIR, "htm_snapshot_summary.json"), "w") as f:
    json.dump(snapshot, f, indent=2)

# --- Plots: forecast overlay and anomaly probability timeline ----------------
plt.figure(figsize=(12,5))
# actual full test series
plt.plot(range(len(test_series)), test_series.values, label="Actual (test full)")
# HTM preds shifted +1 because preds are for next step
plt.plot(range(1, 1 + len(htm_test_preds)), htm_test_preds, label="HTM one-step preds (shifted +1)")
# SARIMAX overlay if available
if len(sarima_test_preds_aligned) == len(sarima_truth_aligned):
    plt.plot(range(1, 1 + len(sarima_test_preds_aligned)), sarima_test_preds_aligned, label="SARIMAX one-step preds (shifted +1)")
plt.xlabel("Index in test partition")
plt.ylabel(TARGET_COL)
plt.title("One-step-ahead Forecasts (Actual vs HTM vs SARIMAX)")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(RESULTS_DIR, "forecast_overlay.png"), bbox_inches='tight')
plt.close()
print("Saved forecast overlay to:", os.path.join(RESULTS_DIR, "forecast_overlay.png"))

# Anomaly probability timeline
plt.figure(figsize=(12,4))
x = range(1, 1 + len(htm_error_prob))
plt.plot(x, htm_anomaly_likelihoods, label="HTM anomaly likelihood")
plt.plot(x, htm_error_prob, label="HTM error-based prob")
if len(sarima_conf_prob) == len(x):
    plt.plot(x, sarima_conf_prob, label="SARIMAX CI->prob (aligned)")
if len(sarima_error_prob) == len(x):
    plt.plot(x, sarima_error_prob, label="SARIMAX error-based prob (aligned)")
plt.xlabel("Index in test partition (prediction -> actual next)")
plt.ylabel("Probability")
plt.title("Anomaly / Confidence Probabilities over Test Window")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(RESULTS_DIR, "anomaly_probability_timeline.png"), bbox_inches='tight')
plt.close()
print("Saved anomaly probability timeline to:", os.path.join(RESULTS_DIR, "anomaly_probability_timeline.png"))

# --- Summary outputs to stdout ------------------------------------------------
summary = {
    "htm_test_rmse": htm_test_rmse,
    "htm_test_mae": htm_test_mae,
    "sarima_rmse": sarima_rmse,
    "sarima_mae": sarima_mae,
    "htm_anomaly_precision_at_0.9": htm_prec,
    "htm_anomaly_recall_at_0.9": htm_rec,
    "sarima_anomaly_precision_at_0.9": sarima_prec,
    "sarima_anomaly_recall_at_0.9": sarima_rec,
    "results_csv": csv_path,
    "plots_dir": os.path.abspath(RESULTS_DIR)
}

print("\nSUMMARY:")
print(json.dumps(summary, indent=2))

# Save summary json
with open(os.path.join(RESULTS_DIR, "summary_metrics.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("All done. Check results in:", os.path.abspath(RESULTS_DIR))

Loaded data shape: (7194, 7)
Columns: ['Date', 'Low', 'Open', 'Volume', 'High', 'Close', 'Adjusted Close']
Using numeric dataframe shape: (7194, 5)
htm.core not found. Using lightweight HTM-like fallbacks (script will still run).
Input SDR dimension: 500
Training size: 5755, Test size: 1439, epochs: 5
Epoch 1/5 ...
  epoch 1 RMSE (train online): 7.960944
Epoch 2/5 ...
  epoch 2 RMSE (train online): 7.960367
Epoch 3/5 ...
  epoch 3 RMSE (train online): 7.960212
Epoch 4/5 ...
  epoch 4 RMSE (train online): 7.960143
Epoch 5/5 ...
  epoch 5 RMSE (train online): 7.960103
HTM epochs finished in 44.7s
HTM test RMSE: 30.340663, MAE: 28.913234
SARIMAX test RMSE (aligned): 25.493543, MAE: 21.318417
Saved results CSV to: ./results_htm/probabilities_and_predictions.csv
Saved forecast overlay to: ./results_htm/forecast_overlay.png
Saved anomaly probability timeline to: ./results_htm/anomaly_probability_timeline.png

SUMMARY:
{
  "htm_test_rmse": 30.340662818691715,
  "htm_test_mae": 28.913234318391