In [17]:
from pathlib import Path
import pickle

import numpy as np

from ruben import PersistenceDiagram
from main import OUTDIR, DATA_PATH

In [39]:
from gtda.diagrams import PersistenceEntropy, ComplexPolynomial

def features_per_dim(b, d, q):
	if q[0] == 0:
		d[-1] = 1
	with np.errstate(invalid='ignore', divide='ignore'):
		return np.hstack([
			mean_bd := np.c_[b, d].mean(axis=0),
			mean_bd**2,
			np.nan_to_num(1/mean_bd + np.log(mean_bd)), # fix divide by zero
			np.c_[b, d].std(axis=0),
			np.mean(b - d),
			np.mean(b - d)**2,
			np.mean((b + d) / 2),
			np.mean((b + d) / 2)**2,
			PersistenceEntropy().fit_transform([np.c_[b, d, q]])[0],
			ComplexPolynomial().fit_transform([np.c_[b, d, q]])[0],
			# should i include the other stuff
			np.pad(np.sort(d - b)[:-11:-1], (0, max(0, 10 - len(b))))
		])

def features(pd, max_dim):
	return np.hstack([features_per_dim(*pd[pd[:,2]==q].T) for q in range(max_dim + 1)])

In [46]:
paths = [*OUTDIR.glob('./task1/pds/importance/*.bin')]

In [47]:
from tqdm import tqdm

def extract_features(path: Path):
	with open(path, 'rb') as f:
		obj: PersistenceDiagram = pickle.load(f)
	pd = np.array([[p.birth, p.death, p.dim] for p in obj.points])
	return features(pd, 1)

X = np.array(list(map(extract_features, tqdm(paths))))

100%|██████████| 96/96 [00:29<00:00,  3.31it/s]


In [48]:
import json

model_names = [p.stem.partition('_')[-1] for p in paths]
with open(DATA_PATH / '../../reference_data/task1_v4/model_configs.json') as f:
	config = json.load(f)

y = [config[name]['metrics']['train_acc'] - config[name]['metrics']['test_acc'] for name in model_names]

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(
	X, y, train_size=0.7, random_state=20)

Ridge(alpha=0.5).fit(X_train, y_train).score(X_test, y_test)



0.023784005473514114