# PAMAP2 ➜ TS2Vec ➜ Fractal‑SSL  
An interactive, step‑by‑step Jupyter workflow.

**Sections**
1. Inspect & preprocess PAMAP2
2. Pre‑train a TS2Vec backbone
3. Enhance with Fractal‑SSL


## 0  Setup

In [1]:
# ⚠️ Run once per environment
%pip install -q torch==2.3.0 ts2vec pandas scikit-learn tqdm hydra-core


Note: you may need to restart the kernel to use updated packages.


## 1  Imports & paths

In [1]:
import os, glob, json, math, random, itertools, pickle
from pathlib import Path
import numpy as np, pandas as pd, torch
from tqdm import tqdm

from ts2vec import TS2Vec

RAW_ROOT   = Path('/Users/ai421/UK Dementia Research Institute Dropbox/Anastasia Ilina/FractalSSL/data/pamap2+physical+activity+monitoring/PAMAP2_Dataset').expanduser()   # <- edit if needed
CACHE_ROOT = Path('./cache')
CACHE_ROOT.mkdir(exist_ok=True)


In [2]:

RAW_ROOT 

PosixPath('/Users/ai421/UK Dementia Research Institute Dropbox/Anastasia Ilina/FractalSSL/data/pamap2+physical+activity+monitoring/PAMAP2_Dataset')

### Helper to read a single PAMAP2 `.dat` file

In [3]:
COLS = [
    "timestamp", "activity_id", "heart_rate",
    # IMU columns...
]  # shortened for brevity – full list in spec

def load_dat(path: Path, cols=COLS, downsample_hr=True):
    """Returns a DataFrame (100 Hz) with NaNs handled."""
    df = pd.read_csv(path, sep=' ', header=None)
    df.columns = cols + [f'col_{i}' for i in range(len(df.columns)-len(cols))]

    # Optional: down‑sample HR (original 9 Hz) – simple forward‑fill
    if downsample_hr:
        df['heart_rate'].replace(-1, np.nan, inplace=True)
        df['heart_rate'].interpolate(method='linear', limit_direction='both', inplace=True)

    # Normalise sensor channels (z‑score per column)
    sensor_cols = df.columns.drop(['timestamp', 'activity_id'])
    df[sensor_cols] = (df[sensor_cols] - df[sensor_cols].mean()) / df[sensor_cols].std()
    return df


### Load & peek at Subject 101

In [4]:
print('First 5 .dat files I see:')
for p in list(RAW_ROOT.rglob('*.dat'))[:5]:
    print(' •', p.relative_to(RAW_ROOT))

First 5 .dat files I see:
 • Protocol/subject108.dat
 • Protocol/subject109.dat
 • Protocol/subject107.dat
 • Protocol/subject106.dat
 • Protocol/subject104.dat


In [5]:
sample_path = next(RAW_ROOT.glob('*Protocol/subject101.dat'))
df101 = load_dat(sample_path)
df101.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['heart_rate'].replace(-1, np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['heart_rate'].interpolate(method='linear', limit_direction='both', inplace=True)


Unnamed: 0,timestamp,activity_id,heart_rate,col_0,col_1,col_2,col_3,col_4,col_5,col_6,...,col_41,col_42,col_43,col_44,col_45,col_46,col_47,col_48,col_49,col_50
0,8.38,0,-0.839332,-1.965332,0.898811,0.446885,0.030369,0.895952,0.471171,-0.064588,...,0.008444,0.014729,-0.01761,-0.401004,-1.138542,-2.322759,,,,
1,8.39,0,-0.839332,-1.965332,0.869927,0.440123,0.075092,0.888978,0.430758,0.020714,...,-0.006492,-0.008282,-0.007561,-0.348879,-1.136321,-2.335681,,,,
2,8.4,0,-0.839332,-1.965332,0.899021,0.446949,0.041749,0.874937,0.428003,0.047642,...,0.003137,-0.000352,0.004828,-0.383407,-1.153368,-2.331433,,,,
3,8.41,0,-0.839332,-1.965332,0.852074,0.432989,0.074625,0.87965,0.427961,0.047633,...,0.003298,-0.034234,-0.001454,-0.370695,-1.132127,-2.331288,,,,
4,8.42,0,-0.839332,-1.965332,0.876366,0.491239,0.086386,0.863274,0.43969,0.05659,...,0.012859,-0.024295,-0.009347,-0.413875,-1.152522,-2.327013,,,,


### Cache all subjects (optional, ~1 GB)

In [6]:
for path in tqdm(list(RAW_ROOT.rglob('*.dat')), desc='Caching'):
    subj = path.stem.split('.')[0]
    out = CACHE_ROOT / f'{subj}.pkl'
    if out.exists(): 
        continue
    load_dat(path).to_pickle(out)
print('Done ✅')


Caching: 100%|██████████| 14/14 [00:00<00:00, 11607.09it/s]

Done ✅





## 2  TS2Vec pre‑training

In [31]:
%pip uninstall -y ts2vec
%pip install git+https://github.com/yuezhihan/ts2vec.git@main


Found existing installation: ts2vec 0.1
Uninstalling ts2vec-0.1:
  Successfully uninstalled ts2vec-0.1
Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/yuezhihan/ts2vec.git@main
  Cloning https://github.com/yuezhihan/ts2vec.git (to revision main) to /private/var/folders/nb/mdcnqz1d7673k0zpfm275hrm0000gp/T/pip-req-build-nmw08e6r
  Running command git clone --filter=blob:none --quiet https://github.com/yuezhihan/ts2vec.git /private/var/folders/nb/mdcnqz1d7673k0zpfm275hrm0000gp/T/pip-req-build-nmw08e6r
  Resolved https://github.com/yuezhihan/ts2vec.git to commit b0088e14a99706c05451316dc6db8d3da9351163
[31mERROR: git+https://github.com/yuezhihan/ts2vec.git@main does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [32]:
%pip install ts2vec

Collecting ts2vec
  Using cached ts2vec-0.1-py3-none-any.whl.metadata (53 bytes)
Using cached ts2vec-0.1-py3-none-any.whl (18 kB)
Installing collected packages: ts2vec
Successfully installed ts2vec-0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
def window_iter(data, win_sec=5, stride_sec=2, fs=100):
    win = win_sec * fs
    stride = stride_sec * fs
    for start in range(0, len(data) - win, stride):
        yield data[start:start+win].values


In [8]:
class PAMAP2Windows(torch.utils.data.Dataset):
    def __init__(self, cache_root=CACHE_ROOT):
        self.paths = list(cache_root.glob('subject*.pkl'))
        self.windows = []
        for p in self.paths:
            # DON’T call .to_numpy() here
            df = pd.read_pickle(p).drop(columns=['timestamp', 'activity_id'])
            self.windows += list(window_iter(df))
    def __len__(self): return len(self.windows)
    def __getitem__(self, idx): 
        return torch.from_numpy(self.windows[idx]).float()


dataset = PAMAP2Windows()
print('Total windows:', len(dataset))


Total windows: 14344


In [9]:
import numpy as np, gc, torch

# ── Build a single 3-D array (may take ~1-2 GB of RAM)
all_windows = np.stack(dataset.windows, axis=0).astype(np.float32)
print('Stacked:', all_windows.shape, f'≈ {all_windows.nbytes/1e9:.2f} GB')

# free the Python list; we only need the big array from here on
del dataset
gc.collect()

Stacked: (14344, 500, 52) ≈ 1.49 GB


0

In [10]:
import torch
torch.backends.mps.is_available()
# → True  ✅
device = 'mps' if torch.backends.mps.is_available() else 'cpu'


In [11]:
device 

'mps'

In [None]:
enc = TS2Vec(
    input_dims=52,
    output_dims=320,
    device=device,
    batch_size=256,   # hyper-params live in the constructor
    lr=1e-3
)

enc.fit(all_windows, n_epochs=20, verbose=True)
enc.save('ts2vec_backbone.pt')


Epoch #0: loss=743.1087387800217
Epoch #1: loss=193.11450375829423
Epoch #2: loss=33.75749032838004
Epoch #3: loss=32981.79550596646
Epoch #4: loss=15748.238873038974


In [None]:
# Project a random batch and inspect shape
batch = torch.from_numpy(all_windows[np.random.choice(len(all_windows), 32)]).float().to(device)
embeddings = enc(batch)
print('Batch shape:', batch.shape, '→ Embeddings shape:', embeddings.shape)



In [None]:
from torch.utils.data import Dataset, DataLoader
import numpy as np, pandas as pd, torch, random
from pathlib import Path

class PAMAP2LabeledWindows(Dataset):
    def __init__(self, raw_root, win_sec=5, stride_sec=2, fs=100):
        self.X, self.y = [], []
        win, stride = win_sec*fs, stride_sec*fs
        for f in Path(raw_root).rglob('subject*.dat'):
            df = load_dat(f)                         # reuse helper
            data = df.drop(columns=['timestamp']).to_numpy(dtype=np.float32)
            acts = df['activity_id'].to_numpy()
            for start in range(0, len(df)-win, stride):
                end = start+win
                self.X.append(data[start:end, 1:])   # sensor channels
                # majority label in the window:
                label = np.bincount(acts[start:end]).argmax()
                self.y.append(label)
        self.X, self.y = np.stack(self.X), np.array(self.y, dtype=np.int64)

    def __len__(self):  return len(self.y)
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), self.y[idx]

labeled_ds = PAMAP2LabeledWindows(RAW_ROOT)
print('Windows:', len(labeled_ds), 'Shape:', labeled_ds[0][0].shape)


In [None]:
enc.eval()                                     # freeze gradients
with torch.no_grad():
    feats = enc.encode(
        torch.from_numpy(labeled_ds.X).to(device)   # (N, L, C)
    )
# TS2Vec returns (N, L, D); take the *last* timestep:
feats = feats[:, -1, :].cpu().numpy()              # (N, D)
labels = labeled_ds.y


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

Xtr, Xte, ytr, yte = train_test_split(
    feats, labels, test_size=0.2, stratify=labels, random_state=42)

clf = LogisticRegression(
    max_iter=500, multi_class='multinomial', solver='lbfgs')
clf.fit(Xtr, ytr)

pred = clf.predict(Xte)
print(classification_report(yte, pred, digits=3))


In [None]:
import seaborn as sns, matplotlib.pyplot as plt
cm = confusion_matrix(yte, pred)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted'); plt.ylabel('True'); plt.title('PAMAP2 – TS2Vec linear probe')
plt.show()


## 3  Fractal‑SSL fine‑tuning

In [None]:
class FractalViewGenerator:
    """Produces recursively smaller crops of the same window."""
    def __init__(self, levels=4):
        self.levels = levels
    def __call__(self, ts):
        L = ts.shape[0]
        views = []
        for i in range(self.levels):
            frac = 1 / (2 ** i)
            win = int(L * frac)
            start = random.randint(0, L - win)
            views.append(ts[start:start+win])
        return views
gen = FractalViewGenerator()
print([v.shape[0] for v in gen(batch[0])])


In [None]:
class FractalSSL(torch.nn.Module):
    def __init__(self, backbone, proj_dim=128, levels=4, temperature=0.1):
        super().__init__()
        self.backbone     = backbone          # frozen or fine-tuned
        self.levels       = levels
        self.temperature  = temperature

        self.proj = torch.nn.Sequential(
            torch.nn.Linear(backbone.repr_dims, proj_dim),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(proj_dim, proj_dim)
        )

    def forward(self, ts_batch):              # ts_batch: (B, L, C) on device
        # --------------------------------- generate multi-scale views
        view_lists = [FractalViewGenerator(self.levels)(ts) for ts in ts_batch]
        flat_views = [v.float() for sub in view_lists for v in sub]   # all on same device
        lengths    = [v.shape[0] for v in flat_views]

        # pad to (N, max_len, C)  ────────────────────────────────────
        padded = torch.nn.utils.rnn.pad_sequence(flat_views, batch_first=True)
        mask   = torch.arange(padded.size(1), device=padded.device)[None, :] < torch.tensor(lengths, device=padded.device)[:, None]

        # encode → (N, T, D)  ----------------------------------------
        emb = self.backbone.encode(padded)                       # TS2Vec handles mask internally if you pass it; else ignore masked timesteps
        emb = emb.mean(dim=1)                                    # simple temporal pooling

        proj = self.proj(emb)                                    # (N, proj_dim)
        proj = torch.nn.functional.normalize(proj, dim=-1)       # cosine similarity works better

        return proj, view_lists                                  # keep view grouping for the loss


model = FractalSSL(enc).to(device)               # backbone already on device
opt    = torch.optim.AdamW(model.proj.parameters(), lr=1e-3, weight_decay=1e-4)

BATCH  = 64
for step in range(100):
    idx = random.sample(range(len(all_windows)), BATCH)
    ts  = torch.stack([torch.from_numpy(all_windows[i]) for i in idx]).to(device)  # (B, L, C)

    proj, views = model(ts)                    # proj: (B*levels, D)

    # -------- build InfoNCE targets --------------------------------
    n_views  = len(views[0])                   # levels
    targets  = torch.arange(BATCH, device=device).repeat_interleave(n_views)
    # targets[i] = original sample id

    logits   = torch.matmul(proj, proj.T) / model.temperature          # (N,N)
    loss     = torch.nn.CrossEntropyLoss()(logits, targets)

    opt.zero_grad()
    loss.backward()
    opt.step()

    if step % 10 == 0:
        print(f'{step:>3} | loss = {loss.item():.4f}')



In [None]:
torch.save(model.state_dict(), 'fractal_ts2vec.pt')
print('Saved to fractal_ts2vec.pt')


In [None]:


# 1) embed
model.eval()
with torch.no_grad():
    emb = model.backbone.encode(torch.from_numpy(all_windows).to(device))
    emb = emb[:, -1, :].cpu().numpy()

# 2) train/test split + linear probe
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X_tr, X_te, y_tr, y_te = train_test_split(emb, labels, stratify=labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=500, solver='lbfgs', multi_class='multinomial').fit(X_tr, y_tr)

print(classification_report(y_te, clf.predict(X_te), digits=3))
