# Import Packages & Modules

In [14]:
import numpy as np
import pandas as pd
from GPS import GPSclassifier
from sklearn.model_selection import train_test_split, ShuffleSplit

# Read Raw Datasets

In [17]:
# Load the three raw CSV files as plain NumPy arrays, in this order:
#   trcaData - normal data from the source distribution P
#   testSet  - normal data from the target distribution Q
#   outliSet - OOD data from the target distribution Q
trcaData, testSet, outliSet = (
    pd.read_csv(csvName).values
    for csvName in ("trca1.csv", "test1.csv", "outli1.csv")
)

In [18]:
# Column 0 holds the class label; the remaining columns are the features
# (later cells slice the features out with trcaData[:, 1:]).
trcaData

array([[ 1.00000000e+00, -3.58632564e+00,  3.82818878e-01, ...,
         1.98771852e-01,  4.68971965e-02,  3.49203273e-02],
       [ 1.00000000e+00, -2.24117001e+00, -9.61517544e-01, ...,
         1.08096230e-02, -2.71389503e-03,  9.98701395e-02],
       [ 1.00000000e+00, -3.30252856e+00,  7.73251744e-01, ...,
        -1.96465995e-02,  3.55094842e-02, -1.96924847e-02],
       ...,
       [ 4.00000000e+00,  5.15957951e-01,  4.28542542e+00, ...,
        -6.91182355e-02,  8.22825906e-02, -1.68219368e-02],
       [ 4.00000000e+00,  7.29750839e+00,  6.25966579e+00, ...,
         9.55305636e-02, -1.50968325e-01,  1.24967783e-01],
       [ 4.00000000e+00,  9.71262479e-01,  3.61486814e+00, ...,
         2.30029906e-02,  2.87023927e-03,  7.18026818e-02]])

# Split Datasets

In [19]:
# Seed NumPy's global RNG: both scikit-learn splitters below are created
# without an explicit random_state, so they draw from this global state.
np.random.seed(1234)

# Number of known classes, read off the label column of the source data
K = np.unique(trcaData[:, 0]).size

# Combine normal and OOD data to form the target distribution Q.
# testSet carries its label in column 0; outliSet is stacked as-is and every
# one of its rows receives label 0, which marks OOD data here.
unlabSetAll = np.concatenate([testSet[:, 1:], outliSet], axis=0)
unLAll = np.concatenate([testSet[:, 0], np.zeros(len(outliSet), dtype=int)])

# Randomly carve Q into
#   1) unlabeled data used for model training + calibration, and
#   2) hold-out test data used only for model evaluation.
# The first part is capped at half of Q and at the average labeled class size.
inOutrs = ShuffleSplit(
    n_splits=1,
    train_size=min(len(unlabSetAll) // 2, len(trcaData) // K),
)
trcaOutIdx, holdoutIdx = next(inOutrs.split(unLAll))

# Pool the labeled normal features with the selected unlabeled points
newtrcaData = np.concatenate([trcaData[:, 1:], unlabSetAll[trcaOutIdx]], axis=0)

# !!! During training + calibration, label 0 is (ab)used to mean "unlabeled",
# regardless of whether the point is truly normal or OOD.
# Another marker such as -1 would avoid clashing with the OOD label 0 used
# above, but the GPS code must then be revised accordingly
# (e.g. trainSet[trL == 0] becomes trainSet[trL == -1] in GPS.py line 38).
newtrcaL = np.hstack((trcaData[:, 0], np.zeros(len(trcaOutIdx), dtype=int)))

# Halve the pooled data into a training set and a calibration set, keeping the
# label proportions (including the "unlabeled" group) equal in both halves.
trainSet, calibSet, trL, caL = train_test_split(
    newtrcaData,
    newtrcaL,
    test_size=0.5,
    stratify=newtrcaL,
)

# Calibration data in the same layout as the raw files: label first, then features
calibLabelSet = np.column_stack((caL, calibSet))

# Train GPS

In [28]:
# Grids for the two tuning parameters of GPS.
# Candidate values of C in eq(5):
candC1 = [0.1, 1, 10]
# Candidate percentiles of the pairwise distances, used to estimate sigma
# in the Gaussian kernel (25th, 50th and 75th):
candSigqtl = list(range(25, 100, 25))

In [29]:
# Build the GPS classifier for the K known classes. The calibration data is
# passed with its label in column 0, followed by the features.
# NOTE(review): alpha=0.05 is presumably the target error level - confirm in GPS.py.
myGPS = GPSclassifier(
    alpha=0.05,
    K=K,
    calibSet=calibLabelSet,
    candC1=candC1,
    candSigqtl=candSigqtl,
)

In [30]:
# Fit GPS on the training half. trL holds the class label of each labeled row
# and 0 for the unlabeled rows drawn from Q. One progress line is printed per class.
myGPS.train(trainSet, trL)

start to train class 1
start to train class 2
start to train class 3
start to train class 4


# GPS Evaluation

In [31]:
# Hold-out part of Q, never seen during training or calibration:
# features for prediction and true labels (0 = OOD) for scoring.
unlabelSet_hd = unlabSetAll[holdoutIdx]
lb_hd = unLAll[holdoutIdx]

In [32]:
# Prediction sets for the hold-out test points, as a boolean mask: an entry is
# True where the value returned by myGPS.test is non-negative.
# Later cells index it as pred_set[i, k - 1], i.e. one row per test point and
# one column per class k = 1..K.
# NOTE(review): the meaning of the raw values returned by myGPS.test is defined in GPS.py - confirm.
pred_set = myGPS.test(unlabelSet_hd) >= 0

In [33]:
# Size of each test point's prediction set: the number of True entries per row.
pred_set_size = pred_set.sum(axis=1)
pred_set_size

array([2, 1, 2, ..., 1, 1, 2])

In [34]:
# Class-specific accuracy: for each class 1..K, the fraction of hold-out points
# of that class whose prediction set contains it (column cls - 1 of pred_set).
class_acc = [
    np.mean(pred_set[lb_hd == cls, cls - 1])
    for cls in range(1, K + 1)
]
class_acc

[0.9724770642201835, 1.0, 0.986784140969163, 0.981042654028436]

In [35]:
# OOD detection rate: fraction of true OOD hold-out points (label 0) whose
# prediction set is empty, i.e. no class is flagged in their row.
ood_rows = pred_set[lb_hd == 0]
det_rate = (~ood_rows.any(axis=1)).mean()
det_rate


0.9819494584837545