# ADIA Refactored – Getting Started

This notebook gives a gentle intro to working with the `adia_refactored` package from a notebook environment. It covers:

- Ensuring your environment can import the package
- A quick single-series demo
- A small batch demo using Parquet files
- (Optional) Running the included example script
- A batch run on real data, followed by merging the predictors with the labels

If you see import errors for `numpy`/`pandas`/`pyarrow`, run the optional install cell below and then restart the kernel.


In [1]:
# Optional: install project dependencies into the active kernel
# If imports fail later, run this, then restart the kernel and re-run
import sys
import subprocess

def ensure_deps(packages=("numpy", "pandas", "pyarrow"), requirements="requirements.txt"):
    """Install the project requirements when a needed package cannot be imported.

    Each name in ``packages`` is imported in turn. When every import succeeds
    nothing is installed. On the first failure the package name and the reason
    are printed, and ``pip install -r <requirements>`` is run with the
    interpreter of the active kernel (``sys.executable``).

    Args:
        packages: Importable module names to probe.
        requirements: Path of the requirements file handed to pip. A relative
            path is resolved by pip against the current working directory.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    import importlib

    for name in packages:
        try:
            importlib.import_module(name)
        except Exception as exc:
            # Broad on purpose: a damaged install can fail with more than
            # ImportError. Report the cause instead of hiding it.
            print(f"Cannot import {name!r} ({type(exc).__name__}: {exc}).")
            break
    else:
        # The loop finished without a break, so every probe succeeded.
        print("Dependencies appear to be installed. Skipping install.")
        return

    print("Installing requirements...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", str(requirements)])  # nosec
    print("Install finished. Restart the kernel before importing the packages.")

ensure_deps()


Dependencies appear to be installed. Skipping install.


In [2]:
# Setup: make package importable and validate config
import sys, pathlib

# Put the parent of the working directory at the front of sys.path.
# Path(".") is the kernel's working directory, so this depends on where the
# notebook is launched from (presumably the project folder itself, so that
# `adia_refactored` is importable as a package; verify for your layout).
project_root = pathlib.Path(".").resolve()
parent_dir = project_root.parent
if sys.path.count(str(parent_dir)) == 0:
    sys.path.insert(0, str(parent_dir))

# Entry points used by the cells below.
from adia_refactored import (
    compute_predictors_for_values,
    run_batch,
    validate_config,
)

# Run the package's own configuration check before doing any work.
validate_config()
print("Imports OK and config validated.")


Imports OK and config validated.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Single-series demo
import numpy as np

# Simulated series of n points with a change after the first bp points:
# N(0, 1.0) before the change, N(1.5, 1.5) after it. The global seed makes the
# draws repeatable; the two segments are drawn in this order on purpose.
np.random.seed(42)
n, bp = 120, 60
pre_break = np.random.normal(0, 1.0, bp)
post_break = np.random.normal(1.5, 1.5, n - bp)
values = np.concatenate([pre_break, post_break])

# Period indicator aligned with `values`: 0.0 for the first bp points, 1.0 afterwards.
periods = np.repeat([0.0, 1.0], [bp, n - bp])

preds, meta = compute_predictors_for_values(
    values,
    periods,
    B_boot=20,
    energy_enable=False,
)
preds, meta


({'p_mu_lag1': 1.0,
  'p_sigma_lag1': 0.19047619047619047,
  'overlap_frac_lag1': 0.475,
  'p_mu_vol': 1.0,
  'p_sigma_vol': 0.14285714285714285,
  'overlap_frac_vol': 0.475,
  'p_mu_resid_lag1': 1.0,
  'p_sigma_resid_lag1': 0.42857142857142855,
  'overlap_frac_resid_lag1': 0.4,
  'p_mean': 2.553060822633691e-14,
  'p_var': 0.003419625023376895,
  'p_MWU': 4.7077819611781626e-11,
  'p_energy': nan,
  'acf_absdiff_l1': 0.32185753393519023},
 {'n_total_lag1': 119,
  'n_p0_lag1': 59,
  'n_p1_lag1': 60,
  'n_total_vol': 118,
  'n_p0_vol': 58,
  'n_p1_vol': 60,
  'n_period0': 59,
  'n_period1': 60,
  'n_total': 119})

In [4]:
# Batch demo (writes and reads Parquet files under _tmp_notebook)
from pathlib import Path
import numpy as np
import pandas as pd

# Build four synthetic series, store them in one Parquet file, and run the batch extractor on it.
tmp_dir = Path("_tmp_notebook")
tmp_dir.mkdir(exist_ok=True)

frames = []
for i in range(4):
    np.random.seed(100 + i)  # one seed per series keeps each series repeatable
    n, bp = 80, 40
    if i % 2:
        # Odd series: same mean, standard deviation goes from 0.8 to 1.8.
        head = np.random.normal(0, 0.8, bp)
        tail = np.random.normal(0, 1.8, n - bp)
    else:
        # Even series: same standard deviation, mean goes from 0 to 1.2.
        head = np.random.normal(0, 1.0, bp)
        tail = np.random.normal(1.2, 1.0, n - bp)
    vals = np.concatenate([head, tail])
    periods = np.repeat([0.0, 1.0], [bp, n - bp])
    # Rows are keyed by (id, time): one id per series, time running 0..n-1.
    df = pd.DataFrame(
        {"value": vals, "period": periods},
        index=pd.MultiIndex.from_product([[f"series_{i}"], range(n)], names=["id", "time"]),
    )
    frames.append(df)

all_df = pd.concat(frames).sort_index()

# Input and output files all live in the notebook's temp folder.
in_path, out_pred, out_meta = (
    tmp_dir / file_name
    for file_name in (
        "sample_batch.parquet",
        "sample_predictors.parquet",
        "sample_metadata.parquet",
    )
)

all_df.to_parquet(in_path)

batch_options = dict(B_boot=20, energy_enable=False, n_jobs=1, verbose=False)
pred_df, meta_df = run_batch(
    input_parquet=str(in_path),
    out_pred_parquet=str(out_pred),
    out_meta_parquet=str(out_meta),
    **batch_options,
)

print(f"Processed series: {len(pred_df)}")
print(f"Predictor columns: {len(pred_df.columns)}")
pred_df.head(), meta_df.head()


INFO:adia_refactored.batch_processor:Loading _tmp_notebook\sample_batch.parquet
INFO:adia_refactored.batch_processor:Total series: 4
INFO:adia_refactored.batch_processor:Saved predictors to: _tmp_notebook\sample_predictors.parquet
INFO:adia_refactored.batch_processor:Saved metadata to: _tmp_notebook\sample_metadata.parquet


Processed series: 4
Predictor columns: 14


(          p_mu_lag1  p_sigma_lag1  overlap_frac_lag1  p_mu_vol  p_sigma_vol  \
 id                                                                            
 series_0        1.0      0.380952              0.525  0.238095     0.047619   
 series_1        1.0      0.047619              0.325  1.000000     0.190476   
 series_2        1.0      0.238095              0.500  1.000000     0.952381   
 series_3        1.0      0.857143              0.350  0.142857     0.476190   
 
           overlap_frac_vol  p_mu_resid_lag1  p_sigma_resid_lag1  \
 id                                                                
 series_0             0.700              1.0            0.095238   
 series_1             0.400              1.0            0.285714   
 series_2             0.675              1.0            0.238095   
 series_3             0.625              1.0            0.761905   
 
           overlap_frac_resid_lag1    p_mean     p_var     p_MWU  p_energy  \
 id                           

In [None]:
# Optional: run the included script and (optionally) clean up temp files
print("Running example_usage.py ...")
%run example_usage.py

# Optional cleanup for this notebook's temp files
from pathlib import Path
for p in Path("_tmp_notebook").glob("*.parquet"):
    try:
        p.unlink()
    except Exception:
        pass


In [5]:
# Real-data batch run
from pathlib import Path
import os

import pandas as pd

# Data folder: set the ADIA_DATA_DIR environment variable to point at your own
# copy of the data. The fallback is the folder this notebook was written against.
DATA_DIR = Path(os.environ.get(
    "ADIA_DATA_DIR",
    r"C:\Users\yehud\OneDrive\Curriculum Vitae\ADIA\StructualBreak",
))
input_parquet = str(DATA_DIR / "X_train.parquet")
out_pred_parquet = str(DATA_DIR / "X_train_predictors.parquet")
out_meta_parquet = str(DATA_DIR / "X_train_metadata.parquet")

# The full extraction is slow (about 2 s per series over 10001 series in the
# recorded run), so results that were already saved are reused by default.
# Set FORCE_RECOMPUTE = True after changing the input or the run_batch options.
FORCE_RECOMPUTE = False

outputs_exist = Path(out_pred_parquet).exists() and Path(out_meta_parquet).exists()
if outputs_exist and not FORCE_RECOMPUTE:
    # NOTE(review): assumes the saved Parquet files round-trip the frames that
    # run_batch returns (including the id index) - confirm against run_batch.
    print("Reusing saved predictors:", out_pred_parquet)
    print("Reusing saved metadata:", out_meta_parquet)
    pred_df = pd.read_parquet(out_pred_parquet)
    meta_df = pd.read_parquet(out_meta_parquet)
else:
    print("Reading:", input_parquet)
    pred_df, meta_df = run_batch(
        input_parquet=input_parquet,
        out_pred_parquet=out_pred_parquet,
        out_meta_parquet=out_meta_parquet,
        B_boot=80,
        energy_enable=False,
        n_jobs=1,
        verbose=True,
    )
    print("Saved predictors to:", out_pred_parquet)
    print("Saved metadata to:", out_meta_parquet)

print("Predictors shape:", pred_df.shape)
print("Metadata shape:", meta_df.shape)
pred_df.head()


INFO:adia_refactored.batch_processor:Loading C:\Users\yehud\OneDrive\Curriculum Vitae\ADIA\StructualBreak\X_train.parquet


Reading: C:\Users\yehud\OneDrive\Curriculum Vitae\ADIA\StructualBreak\X_train.parquet


INFO:adia_refactored.batch_processor:Total series: 10001
Extracting predictors:   6%|▌         | 591/10001 [1:29:20<5:39:07,  2.16s/it]   

KeyboardInterrupt: 

In [None]:
# Merge predictors with labels for inspection
import os
import warnings
from pathlib import Path

import pandas as pd

# Labels folder: the ADIA_DATA_DIR environment variable overrides the location.
# The fallback is the folder this notebook was written against.
labels_csv = str(
    Path(os.environ.get(
        "ADIA_DATA_DIR",
        r"C:\Users\yehud\OneDrive\Curriculum Vitae\ADIA\StructualBreak",
    )) / "y_train.csv"
)
print("Reading labels:", labels_csv)
labels = pd.read_csv(labels_csv)

# Try common label column names; adjust if needed. The first candidate that is
# present in the file wins.
possible_label_cols = [
    'structural', 'structural_', 'is_structural', 'label', 'target'
]
label_col = next((c for c in possible_label_cols if c in labels.columns), None)
if label_col is None:
    print("Available columns in labels:", list(labels.columns))
    raise ValueError("Could not find a label column. Set label_col to the correct name.")

# Ensure id is index to align with pred_df which is indexed by id
if 'id' in labels.columns:
    labels = labels.set_index('id')
else:
    # Without an id column the join below matches on the CSV row numbers.
    warnings.warn(
        "Labels have no 'id' column; joining on the row index, which may not "
        "correspond to the predictor ids."
    )

# pred_df must come from the real-data batch run; an inner join keeps only ids
# present on both sides.
merged = pred_df.join(labels[[label_col]], how='inner')
if merged.empty:
    warnings.warn(
        "Merged frame is empty: no predictor id matched a label id. Check that "
        "pred_df holds the real-data predictors and not an earlier demo result."
    )
print("Merged shape:", merged.shape)
merged.head()
