Notebook that fits the out-of-distribution (OOD) detector on the embeddings of the in-distribution classes and saves it to disk.

In [1]:
import os
import pandas as pd
import pickle
from cleanlab.outlier import OutOfDistribution
import cleanlab
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
import pandas as pd
from torchvision.models import vgg16_bn
from torch.utils.data import DataLoader
import utils.data_utils as data_utils
import utils.train_utils as train_utils
from utils.model_selector import model_selector, free_gpu_memory


Hyperparameters


In [2]:
# Folder the trained weights are read from and the fitted OOD detector is written to.
current_path = "../RESULTS"
# Number of samples per batch when running the model over the data.
batch_size = 4
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("using", device)

using cuda:0


Training data can be downloaded from [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15461242.svg)](https://doi.org/10.5281/zenodo.15461242)

In [3]:
# Show the entries of the data directory (one sub-folder per event class).
os.listdir('../data')

['AV', 'LP', 'NO', 'TC', 'TR', 'VT']

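Build a table listing every data file (class label, path and file name). The custom Dataset is later created from this table, so files are loaded on demand instead of all at once.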

In [4]:

# Numeric class id -> name of the folder that holds the files of that class.
class_labels = {0.0: "VT", 1.0: "LP", 2.0: "TR", 3.0: "NO", 4.0: "AV", 5.0: "TC"}
# Reverse lookup (folder name -> class id) so every class shares one code path.
folder_to_class = {folder_name: class_id for class_id, folder_name in class_labels.items()}

data_dir = []
data_dir_path = "../data"
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the resulting table reproducible across machines and file systems.
for folder in sorted(os.listdir(data_dir_path)):
    print(folder)
    class_ = folder_to_class.get(folder)
    if class_ is None:
        # Not one of the known class folders: reported above, but not indexed.
        continue
    for filename in sorted(os.listdir(f"{data_dir_path}/{folder}")):
        path_ = f"{data_dir_path}/{folder}/{filename}"
        data_dir.append([class_, path_, filename])

# One row per file: numeric class, relative path and bare file name.
data_dir_df = pd.DataFrame(data=data_dir, columns=["event_class", "path", "event_name"])

AV
LP
NO
TC
TR
VT


In [5]:
# Number of files per class, most frequent class first.
class_counts = data_dir_df.value_counts(subset='event_class')
print(class_counts)
# First ten rows of the file index as a quick sanity check.
first_rows = data_dir_df.head(n=10)
print(first_rows)

event_class
1.0    1828
0.0    1502
4.0    1236
2.0    1203
3.0     985
5.0     208
Name: count, dtype: int64
   event_class                     path    event_name
0          4.0     ../data/AV/AV__0.npy     AV__0.npy
1          4.0     ../data/AV/AV__1.npy     AV__1.npy
2          4.0    ../data/AV/AV__10.npy    AV__10.npy
3          4.0   ../data/AV/AV__100.npy   AV__100.npy
4          4.0  ../data/AV/AV__1000.npy  AV__1000.npy
5          4.0  ../data/AV/AV__1001.npy  AV__1001.npy
6          4.0  ../data/AV/AV__1002.npy  AV__1002.npy
7          4.0  ../data/AV/AV__1003.npy  AV__1003.npy
8          4.0  ../data/AV/AV__1004.npy  AV__1004.npy
9          4.0  ../data/AV/AV__1005.npy  AV__1005.npy


Trained models are needed to generate the embeddings. Train them first with the model_training notebook, or download the weights from [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.15461242.svg)](https://doi.org/10.5281/zenodo.15461242). The notebook expects the weights to be located in "../RESULTS", one file per representation named `rep{N}_best_model.pt` (for example `rep2_best_model.pt`).

In [6]:
# Width of one row of ID_feats_tensor for each representation: the flattened
# feature vector produced by the model plus one leading column for the true label.
feat_size = {1: 8193, 2: 14337, 3: 14337, 4: 25089}

# Take only ID data (VT, LP, TR). The selection does not depend on the
# representation, so it is computed once instead of once per loop iteration.
ID_data = data_dir_df.query("event_class<3.0").reset_index()

for rep in [2]:  #[1, 2, 3, 4]:
    weights_path = f"{current_path}/rep{rep}_best_model.pt"
    print(weights_path)
    ID_dataset = data_utils.CustomImageDataset(ID_data,rep_type=rep)
    # No shuffling: this is pure feature extraction, and a fixed order keeps the
    # rows of ID_feats_tensor aligned with the rows of ID_data on every run.
    ID_dataloader = DataLoader(ID_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    # Load Model
    model_torch = model_selector(f"rep{rep}",with_OOD=False,weights_path=weights_path,OOD_path=None)
    _ = model_torch.eval()

    # Row 0 is a zero-filled placeholder so that ID_feats_tensor keeps the layout
    # it always had (row 0 unused, column 0 = true label, remaining columns =
    # features). Batches are collected in a list and concatenated once at the
    # end, which avoids re-copying the whole tensor on every iteration.
    feat_blocks = [torch.zeros(1, feat_size[rep])]
    # Inference only: disabling autograd avoids building a graph for every batch.
    with torch.no_grad():
        for idx, out in enumerate(ID_dataloader):
            X, y, name = out
            feats = model_torch.features(X.to(device))
            flatten_feats = feats.view(feats.shape[0], -1).cpu()
            feats_and_true_lbl = torch.concat([y.view(-1, 1), flatten_feats], axis=1)
            if feats_and_true_lbl.shape[1] != feat_size[rep]:
                # Fail on the first batch rather than after the whole pass.
                raise ValueError(
                    f"rep{rep}: expected rows of width {feat_size[rep]} "
                    f"(label + features), got {feats_and_true_lbl.shape[1]}"
                )
            feat_blocks.append(feats_and_true_lbl)
            if idx%50==0:
                print(idx, "/", len(ID_dataloader))
    ID_feats_tensor = torch.concat(feat_blocks, axis=0)

    ood_KNN = OutOfDistribution()
    # Drop the placeholder row and the label column; hand cleanlab a NumPy array,
    # which is the input type its documentation specifies for `features`.
    # NOTE(review): `fit` appears to only fit the estimator, so this variable may
    # be None; `fit_score` looks like the call that returns the training scores
    # - confirm against the cleanlab documentation before relying on the value.
    train_ood_features_scores = ood_KNN.fit(features=ID_feats_tensor[1:, 1:].numpy())
    # Persist the fitted detector next to the model weights.
    with open(f'{current_path}/OOD_detector_rep{rep}.pkl', 'wb') as f:
        pickle.dump(ood_KNN, f)
    free_gpu_memory()

../RESULTS/rep2_best_model.pt
Loading weights and OOD detector...
Model weigths successfully loaded...
0 / 1134
50 / 1134
100 / 1134
150 / 1134
200 / 1134
250 / 1134
300 / 1134
350 / 1134
400 / 1134
450 / 1134
500 / 1134
550 / 1134
600 / 1134
650 / 1134
700 / 1134
750 / 1134
800 / 1134
850 / 1134
900 / 1134
950 / 1134
1000 / 1134
1050 / 1134
1100 / 1134
Fitting OOD estimator based on provided features ...
