# Standalone Example Notebook

This notebook is the example notebook but with code copied directly rather than imported.

In this standalone notebook we demonstrate CompFS on the Syn1 experiment from the paper. This can be used on custom data if it is written as numpy arrays.

This notebook is standalone because it contains a copy of all the code needed to run CompFS. In some cases the code has been slightly adapted, but these changes do not affect the model; for example, a function that saved results to disk now simply prints them.

In [None]:
from functools import reduce

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Pick the compute device (first CUDA GPU when PyTorch reports one, else the
# CPU) and show which one was chosen.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Code Copied Over

This code has been copied over and slightly adapted to work in a standalone notebook. The underlying method is exactly the same.

## Metrics

Accuracy, MSE, GSim (group similarity), and TPR/FDR (true positive rate / false discovery rate) metrics.

In [None]:
def accuracy(x, y):
    """Return the classification accuracy as a percentage.

    Args:
        x: tensor of per-class scores; the predicted class is the argmax over
            the last dimension.
        y: tensor of target class indices, compared element-wise with the
            predictions.

    Returns:
        The percentage (0-100) of predictions equal to ``y``, as a Python
        number.
    """
    predicted = torch.argmax(x, dim=-1)
    n_correct = (predicted == y).sum()
    return (100 * n_correct / len(y)).item()


def mse(x, y):
    """Return half the mean squared error between ``x`` and ``y``.

    Args:
        x: tensor of predictions.
        y: tensor of targets; ``x - y`` follows the usual broadcasting rules.

    Returns:
        ``0.5 * mean((x - y) ** 2)`` as a Python float.
    """
    squared_error = (x - y).pow(2)
    return 0.5 * squared_error.mean().item()

In [None]:
def gsim(true_groups, predicted_groups):
    """Compute the group similarity between true and predicted groups.

    Every true group is matched with the predicted group that gives the
    highest Jaccard index (intersection size over union size). The best
    scores are summed and divided by the larger of the two group counts.

    Args:
        true_groups: sequence of arrays of feature indices (ground truth).
            An empty sequence means the ground truth is unknown.
        predicted_groups: sequence of arrays of feature indices that were
            discovered.

    Returns:
        A tuple ``(gsim, n_true, n_predicted)``. ``gsim`` is -1 when the
        ground truth is unknown and 0 when nothing was predicted.
    """
    n_true = len(true_groups)
    n_predicted = len(predicted_groups)

    if n_true == 0:  # Ground truth unknown, so the score is undefined.
        return -1, n_true, n_predicted
    if n_predicted == 0:  # Nothing was discovered.
        return 0, n_true, n_predicted

    def jaccard(a, b):
        return np.intersect1d(a, b).size / np.union1d(a, b).size

    total = sum(
        max(jaccard(truth, candidate) for candidate in predicted_groups)
        for truth in true_groups
    )
    return total / max(n_true, n_predicted), n_true, n_predicted


def tpr_fdr(true_groups, predicted_groups):
    """Compute the true positive rate and false discovery rate of features.

    Groups are flattened into sets of distinct features, so group structure
    is ignored here.

    Args:
        true_groups: sequence of arrays of relevant feature indices. An empty
            sequence means the ground truth is unknown.
        predicted_groups: sequence of arrays of selected feature indices.

    Returns:
        A tuple ``(tpr, fdr)`` in percent. ``(-1, -1)`` when the ground truth
        is unknown and ``(0.0, 0.0)`` when nothing was predicted.
    """
    if len(true_groups) == 0:  # Ground truth not known.
        return -1, -1
    if len(predicted_groups) == 0:  # No predictions to score.
        return 0.0, 0.0

    # Flatten every group into one array and keep the distinct features.
    selected = np.unique(np.concatenate(predicted_groups, axis=None))
    relevant = np.unique(np.concatenate(true_groups, axis=None))

    n_hits = np.intersect1d(selected, relevant).size
    tpr = 100 * n_hits / relevant.size
    fdr = 100 * (selected.size - n_hits) / selected.size
    return tpr, fdr

## Thresholding Functions

Different thresholding functions for determining feature relevance from feature scores provided by CompFS.

In [None]:
def make_lambda_threshold(l):
    """Build a thresholding function with a fixed cut-off.

    Args:
        l: the cut-off value (lambda); converted with ``float``.

    Returns:
        A function mapping a score tensor ``p`` to a boolean tensor of the
        same shape that is True wherever ``p`` is at least the cut-off.
    """
    cutoff = float(l)

    def above_cutoff(p):
        # The cut-off is materialised with p's dtype and device before comparing.
        return torch.ge(p, torch.full_like(p, cutoff))

    return above_cutoff


def make_std_threshold(nsigma):
    """Build a thresholding function relative to the spread of the scores.

    Args:
        nsigma: number of standard deviations above the mean a score must
            reach; converted with ``float``.

    Returns:
        A function mapping a score tensor ``p`` to a boolean tensor that is
        True wherever ``p >= mean(p) + nsigma * std(p)``.
    """
    n_std = float(nsigma)

    def above_mean_plus_std(p):
        cutoff = (p.mean() + n_std * p.std()).item()
        return torch.ge(p, torch.full_like(p, cutoff))

    return above_mean_plus_std


def make_top_k_threshold(k):
    """Build a thresholding function that keeps the ``k`` highest scores.

    Args:
        k: how many features to keep; converted with ``int``.

    Returns:
        A function mapping a score tensor ``p`` to an int32 tensor of the same
        shape holding 1 at the positions returned by ``torch.topk`` and 0
        elsewhere.
    """
    n_keep = int(k)

    def top_k(p):
        _, top_ids = torch.topk(p, n_keep)
        mask = torch.zeros_like(p, dtype=torch.int32)
        mask[top_ids] = 1
        return mask

    return top_k

## CompFS Model

The CompFS torch module and a shell for handling the training.

In [None]:
def is_array_in_list(arr, arr_list):
    """Return True if any array in ``arr_list`` equals ``arr``.

    Equality is decided by ``np.array_equal``, i.e. same shape and same
    elements.
    """
    return any(np.array_equal(candidate, arr) for candidate in arr_list)


class FullyConnected(nn.Module):
    """
    MLP with two ReLU hidden layers that outputs the hidden representation
    only (there is no output head here).

    Args:
        in_dim: the number of input features
        h_dim: width of both hidden layers
    """

    def __init__(self, in_dim, h_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, h_dim)
        self.fc2 = nn.Linear(h_dim, h_dim)

    def forward(self, x):
        # Linear -> ReLU, twice.
        first_hidden = torch.relu(self.fc1(x))
        return torch.relu(self.fc2(first_hidden))


class Gate(nn.Module):
    """
    Gate used in the CompFS individual feature selectors.

    Has weights w, and applies a sigmoid to them to get probabilities p.
    When training, a mask m is sampled from a relaxed Bernoulli with
    parameters p (reparameterisation trick with logistic noise).
    When testing, m is obtained by applying the thresholding function of
    choice to p.
    During training and testing the output of the gate is given by:
    gate(x) = m*x + (1-m)*x_bar,
    where x_bar is the feature-wise mean of the input.

    Args:
        in_dim: number of features
        threshold_func: function which turns p into m
        temp: the "temperature"/sharpness of the reparameterised Bernoulli sampling
    """

    def __init__(self, in_dim, threshold_func, temp):
        super().__init__()
        self.w = nn.Parameter(torch.normal(torch.zeros(in_dim), torch.ones(in_dim)))
        self.threshold_func = threshold_func
        self.temp = temp

    def forward(self, x, x_bar=0, test=False):
        """
        Mix each feature of x with x_bar according to the gate mask.

        Args:
            x: batch of inputs, one row per sample and one column per feature
            x_bar: value used for gated-off features (broadcast against x)
            test: if True use the deterministic thresholded mask, otherwise
                sample a relaxed Bernoulli mask

        Returns:
            m*x + (1-m)*x_bar
        """
        if test:
            # Repeat to make the mask the same size as the batch.
            m = self.make_m().repeat(len(x), 1).float()
        else:
            # log(p) - log(1 - p) with p = sigmoid(w) is exactly w. Using w
            # directly avoids log(0) = -inf (and the NaN gradients it causes)
            # once sigmoid(w) saturates to 0 or 1 in floating point.
            logits = self.w.repeat(len(x), 1)
            u = torch.rand(logits.shape).to(logits.device)
            # Logistic noise for the Bernoulli reparameterisation trick.
            noise = torch.log(u) - torch.log(1 - u)
            m = torch.sigmoid((logits + noise) / self.temp)
        return m * x + (1 - m) * x_bar

    def make_m(self):
        """Return the deterministic mask: threshold_func applied to sigmoid(w)."""
        return self.threshold_func(torch.sigmoid(self.w))


class SingleFeatureSelector(nn.Module):
    """
    A feature selector based on stochastic gates: a Bernoulli gate followed by
    an MLP, with two linear output heads.
    https://arxiv.org/abs/1810.04247

    Args:
        in_dim: number of features
        h_dim: hidden width of the learner
        out_dim: the dimension of the output
        threshold_func: thresholding function used by the gate
        temp: temperature of the Bernoulli reparameterisation
    """

    def __init__(self, in_dim, h_dim, out_dim, threshold_func, temp):
        super().__init__()
        self.to_hidden = FullyConnected(in_dim, h_dim)
        self.gate = Gate(in_dim, threshold_func, temp)
        # Two heads on the same hidden representation: one for this learner's
        # own prediction, one for its contribution to an ensemble.
        self.fc_individual = nn.Linear(h_dim, out_dim)
        self.fc_aggregate = nn.Linear(h_dim, out_dim)

    def forward(self, x, x_bar=0, test=False):
        # Gate the input, then map it to the hidden representation.
        gated = self.gate(x, x_bar, test)
        return self.to_hidden(gated)

    def predict(self, x, x_bar):
        # Prediction of this learner alone, using the test-time gate.
        hidden = self.forward(x, x_bar, test=True)
        return self.fc_individual(hidden)

    def count_features(self):
        # Number of features the thresholded gate lets through.
        mask = self.gate.make_m()
        return mask.sum().item()

    def get_group(self):
        # Indices of the features the thresholded gate lets through.
        selected = torch.where(self.gate.make_m())
        return selected[0]

    def get_importance(self):
        # Frobenius norm of the aggregate head's weight matrix, so learners
        # can be compared with each other.
        weight = self.fc_aggregate.weight
        return weight.pow(2).sum().sqrt().item()


class CompFS(nn.Module):
    """
    The CompFS model: an ensemble of SingleFeatureSelector learners.

    Each learner has a gate with probabilities p = sigmoid(w). On top of the
    prediction losses, get_loss penalises every learner for using many
    features, via mean(p_i)**2 weighted by beta_s (small groups), and every
    pair of learners for overlapping, via mean(p_i * p_j) weighted by beta_d
    (different groups). Both weights are multiplied by their decay factors
    after each epoch in update_after_epoch.

    Note that x_bar is initialised to 0 and must be replaced by a tensor
    before forward/predict/print_evaluation_info are called, because they call
    x_bar.repeat(...).

    Args (in a config_dict):
        nlearners: how many groups we want
        in_dim: dimension of problem
        h_dim: hidden width of mlps
        out_dim: dimension of output
        threshold_func: function to determine a feature is included
        temp: temperature of the Bernoulli reparameterisation
        loss_func: loss applied to the ensemble output and to each learner's output
        beta_s: weight of the group-size penalty
        beta_s_decay: factor beta_s is multiplied by after each epoch
        beta_d: weight of the group-overlap penalty
        beta_d_decay: factor beta_d is multiplied by after each epoch
    """

    def __init__(self, config_dict):
        super(CompFS, self).__init__()
        self.beta_s = config_dict["beta_s"]
        self.beta_s_decay = config_dict["beta_s_decay"]
        self.beta_d = config_dict["beta_d"]
        self.beta_d_decay = config_dict["beta_d_decay"]
        self.loss_func = config_dict["loss_func"]
        # Placeholder; expected to be overwritten with a tensor before use.
        self.x_bar = 0
        self.nfeatures = config_dict["in_dim"]
        self.nlearners = config_dict["nlearners"]
        h_dim = config_dict["h_dim"]
        out_dim = config_dict["out_dim"]
        threshold_func = config_dict["threshold_func"]
        temp = config_dict["temp"]
        self.learners = nn.ModuleList(
            [
                SingleFeatureSelector(
                    self.nfeatures, h_dim, out_dim, threshold_func, temp
                )
                for _ in range(self.nlearners)
            ]
        )

    def forward(self, x):
        """
        Training-time forward pass.

        Returns a tensor stacked along dim 0: entry 0 is the ensemble output
        (sum of every learner's fc_aggregate head), entries 1..nlearners are
        the individual learners' fc_individual outputs.
        """
        x_b = self.x_bar.repeat(len(x), 1).to(x.device)
        total = 0
        individuals = torch.tensor([]).to(x.device)
        for l in self.learners:
            hidden = l(x, x_b).unsqueeze(0)
            total += l.fc_aggregate(hidden)
            # The hidden representation is detached here, so the individual
            # heads do not send gradients back into the gate or the MLP.
            individuals = torch.cat(
                [individuals, l.fc_individual(hidden.detach())], dim=0
            )
        out = torch.cat(
            [total, individuals], dim=0
        )  # We want to train the ensemble, and the individual learners together.
        return out

    def predict(self, x):
        """Ensemble prediction using the thresholded (test-time) gates."""
        x_b = self.x_bar.repeat(len(x), 1).to(x.device)
        out = 0
        for l in self.learners:
            out += l.fc_aggregate(l(x, x_b, test=True))
        return out

    def preprocess(self, data):
        """Return the data unchanged."""
        return data

    def get_loss(self, x, y):
        """
        Total training loss for a batch: the ensemble loss, plus every
        learner's individual loss, plus the group-size and group-overlap
        penalties on the gate probabilities.
        """
        output = self.forward(x)
        # output[0] is the ensemble, output[i + 1] is learner i.
        loss = self.loss_func(output[0], y)
        for i in range(self.nlearners):
            loss += self.loss_func(output[i + 1], y)
            pi_i = torch.sigmoid(self.learners[i].gate.w)
            # Multiply by square root of number of features. So we punish more features, but not as quickly as linearly.
            loss += (
                self.beta_s
                * (torch.mean(pi_i) ** 2)
                * (self.nfeatures**0.5)
                / (self.nlearners)
            )
            # Overlap penalty, visiting each unordered pair (i, j) once.
            for j in range(i + 1, self.nlearners):
                pi_j = torch.sigmoid(self.learners[j].gate.w)
                loss += (
                    2
                    * self.beta_d
                    * torch.mean(pi_i * pi_j)
                    * (self.nfeatures**0.5)
                    / (self.nlearners * (self.nlearners - 1))
                )
        return loss

    def update_after_epoch(self):
        """Decay both penalty weights by their decay factors."""
        self.beta_d *= self.beta_d_decay
        self.beta_s *= self.beta_s_decay

    def count_features(self):
        """Return a list with the number of selected features in each group."""
        out = []
        for l in self.learners:
            out.append(l.count_features())
        return out

    def get_overlap(self):
        """
        Count how many features are selected by more than one learner.

        Returns (noverlap, ids), where ids is the torch.where result giving
        the positions of the overlapping features.
        """
        overlap = 0
        for l in self.learners:
            overlap += l.gate.make_m()
        overlap = overlap > 1
        noverlap = torch.sum(overlap).item()
        ids = torch.where(overlap)
        return noverlap, ids

    def get_groups(self):
        """Return the non-empty, unique groups as a list of numpy arrays."""
        groups = []
        for l in self.learners:
            g = l.get_group().detach().cpu().numpy()
            if (len(g) != 0) and (not is_array_in_list(g, groups)):
                groups.append(g)
        return groups

    def set_threshold_func(self, new_func):
        """
        Give every learner's gate a new thresholding function, so the way the
        scores are thresholded can be changed after training.
        """
        for l in self.learners:
            l.gate.threshold_func = new_func

    def print_evaluation_info(self, x, y, val_metric):
        """
        Print val_metric for the full ensemble and for each learner on (x, y),
        followed by each learner's importance.
        """
        output = self.predict(x)
        full_model_performance = val_metric(output, y)
        print(
            "\n\nPerformance:\nFull Model Test Metric: {:.3f}".format(
                full_model_performance
            )
        )

        # Metric of each learner on its own, using its individual head.
        for i in range(self.nlearners):
            output = self.learners[i].predict(
                x, self.x_bar.repeat(len(x), 1).to(x.device)
            )
            individual_performance = val_metric(output, y)
            print(
                "Group: {}, Test Metric: {:.3f}".format(i + 1, individual_performance)
            )

        # Importance of each learner (norm of its aggregate head's weights).
        print("\n\nImportances:")
        for i in range(self.nlearners):
            individual_importance = self.learners[i].get_importance()
            print("Group: {}, Importance: {:.3f}".format(i + 1, individual_importance))

In [None]:
class CompFSShell:
    """
    Shell that builds a CompFS model and handles its training and evaluation.

    Args:
        model_config: dict holding the CompFS model settings plus "lr",
            "lr_decay", "num_epochs", "batchsize" (0 means one batch with the
            whole training set) and "val_metric" (called as
            val_metric(output, y)).
    """

    def __init__(self, model_config):
        super().__init__()
        # Uses the module-level device selected at the top of the notebook.
        self.device = device
        self.model = CompFS(model_config).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=model_config["lr"])
        self.num_epochs = model_config["num_epochs"]
        self.lr_decay = model_config["lr_decay"]
        self.batchsize = model_config["batchsize"]
        self.val_metric = model_config["val_metric"]

    def _prepare_batch(self, x, y):
        # Flatten each sample to a vector and move the batch to the device.
        x = x.view(x.shape[0], -1)
        return x.to(self.device), y.to(self.device)

    def train(self, train_data, val_data):
        """
        Train the model, printing loss, validation metric, feature counts and
        overlap after every epoch. The learning rate is multiplied by lr_decay
        after each epoch.
        """
        self.model.x_bar = train_data.get_x_bar()
        train_data = self.model.preprocess(train_data)
        val_data = self.model.preprocess(val_data)
        batch_size = len(train_data) if self.batchsize == 0 else self.batchsize
        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=len(val_data), shuffle=False)
        print("\n\nTraining for {} Epochs:\n".format(self.num_epochs))

        for epoch in range(1, self.num_epochs + 1):
            # Train an epoch.
            epoch_loss = self.train_epoch(train_loader)

            # Evaluate the model.
            val = self.calculate_val_metric(val_loader)
            nfeatures = self.model.count_features()
            overlap = self.model.get_overlap()[0]

            # Print information.
            print(
                "Epoch: {}, Average Loss: {:.3f}, Val Metric: {:.1f}, nfeatures: {}, Overlap: {}".format(
                    epoch, epoch_loss, val, nfeatures, overlap
                )
            )

            # Update learning rate.
            for g in self.optimizer.param_groups:
                g["lr"] *= self.lr_decay

    def train_epoch(self, train_loader):
        """Run one optimisation pass over train_loader; return the mean batch loss."""
        avg_loss = 0
        for x, y in train_loader:
            x, y = self._prepare_batch(x, y)
            self.optimizer.zero_grad()
            loss = self.model.get_loss(x, y)
            loss.backward()
            self.optimizer.step()
            avg_loss += loss.item()
        self.model.update_after_epoch()
        return avg_loss / len(train_loader)

    def calculate_val_metric(self, val_loader):
        """Return val_metric averaged over the batches of val_loader."""
        metric = 0
        # Evaluation never backpropagates, so skip building autograd graphs.
        with torch.no_grad():
            for x, y in val_loader:
                x, y = self._prepare_batch(x, y)
                out = self.model.predict(x)
                metric += self.val_metric(out, y)
        return metric / len(val_loader)

    def get_groups(self):
        """Return the groups of features found by the model."""
        return self.model.get_groups()

    def print_evaluation_info(self, val_data):
        """Print the model's evaluation summary on the whole of val_data."""
        # The batch size is the full dataset, so the loader yields one batch.
        val_loader = DataLoader(val_data, batch_size=len(val_data), shuffle=False)
        x, y = self._prepare_batch(*next(iter(val_loader)))
        with torch.no_grad():
            self.model.print_evaluation_info(x, y, self.val_metric)

## Dataset Class

This class takes NumPy X and y data and converts it into a dataset that works with CompFS. This can be used on your own data.

In [None]:
class NumpyDataSet(Dataset):
    """
    Dataset built from NumPy arrays, in the form CompFS expects.

    Args:
        X_data: 2-D NumPy array with one row per sample
        y_data: labels/targets, one entry per row of X_data
        classification: if True the targets are stored as long tensors (class
            indices), otherwise as float tensors (regression targets)

    Raises:
        ValueError: if X_data and y_data do not have the same number of samples
    """

    def __init__(self, X_data, y_data, classification=True):
        # zip() below would silently drop unmatched samples, leaving len(self)
        # inconsistent with the stored pairs, so fail early instead.
        if len(X_data) != len(y_data):
            raise ValueError(
                "X_data and y_data must have the same number of samples, "
                "got {} and {}".format(len(X_data), len(y_data))
            )

        # Feature-wise mean of the inputs.
        self.x_bar = torch.tensor(np.mean(X_data, axis=0)).float()
        self.num_data = X_data.shape[0]
        self.data = []
        for x_sample, y_sample in zip(X_data, y_data):
            x = torch.from_numpy(x_sample).float()
            y = torch.tensor(y_sample)
            y = y.long() if classification else y.float()
            self.data.append((x, y))

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.num_data

    def get_x_bar(self):
        """Return the feature-wise mean of the inputs, computing it if missing."""
        try:
            return self.x_bar
        except AttributeError:
            # Fallback for instances whose x_bar attribute has not been set.
            x_bar = 0
            for sample in self.data:
                x_bar += sample[0]
            self.x_bar = x_bar / self.num_data
            return self.x_bar

# Example

Here we demonstrate CompFS on Syn1. The two cells below can be edited to run CompFS on your own data.

In [None]:
# Toy data: 500 standard-normal features, where the label is True when
# feature 0 or feature 1 exceeds 0.55. Replace these arrays to run your own
# data.

X_train = np.random.normal(size=(20000, 500))
y_train = (X_train[:, 0] > 0.55) | (X_train[:, 1] > 0.55)
X_val = np.random.normal(size=(200, 500))
y_val = (X_val[:, 0] > 0.55) | (X_val[:, 1] > 0.55)

classification = True

# Features 0 and 1 each determine the label on their own, so each is a group.
ground_truth_groups = [np.array([0]), np.array([1])]

In [None]:
# This config should be changed to use your own data, and find specific
# hyperparameters for the problem.

compfs_config = {
    # Optimisation: Adam learning rate, and the factor it is multiplied by
    # after every epoch.
    "lr": 0.003,
    "lr_decay": 0.99,
    # Training batch size; 0 means the whole training set in one batch.
    "batchsize": 50,
    "num_epochs": 10,
    # Loss applied to the ensemble output and to each learner's output.
    "loss_func": nn.CrossEntropyLoss(),
    # Metric reported on the validation data, called as val_metric(output, y).
    "val_metric": accuracy,
    # Model sizes: number of input features, hidden width, output dimension.
    "in_dim": 500,
    "h_dim": 20,
    "out_dim": 2,
    # Number of feature selectors, i.e. the maximum number of groups.
    "nlearners": 5,
    # Turns gate probabilities into selected features at test time.
    "threshold_func": make_lambda_threshold(0.7),
    # Temperature of the relaxed Bernoulli gate sampling.
    "temp": 0.1,
    # Weight of the group-size penalty, and its per-epoch decay factor.
    "beta_s": 4.5,
    "beta_s_decay": 0.99,
    # Weight of the group-overlap penalty, and its per-epoch decay factor.
    "beta_d": 1.2,
    "beta_d_decay": 0.99,
}

In [None]:
# Wrap the NumPy arrays in datasets that CompFS can train and validate on.
train_data = NumpyDataSet(X_train, y_train, classification=classification)
val_data = NumpyDataSet(X_val, y_val, classification=classification)

In [None]:
# Build the CompFS model and its optimizer from the config.
model = CompFSShell(compfs_config)

In [None]:
# Train, printing the loss, validation metric, feature counts and overlap each epoch.
model.train(train_data, val_data)

In [None]:
# Print the validation metric of the full model and of each group, plus group importances.
model.print_evaluation_info(val_data)

In [None]:
# Compare the discovered groups with the ground truth.
learnt_groups = model.get_groups()
tpr, fdr = tpr_fdr(ground_truth_groups, learnt_groups)
group_sim, ntrue, npredicted = gsim(ground_truth_groups, learnt_groups)

print("\n\nGroup Structure:")
print(
    "Group Similarity: {:.3f}, True Positive Rate: {:.3f}%, False Discovery Rate: {:.3f}%".format(
        group_sim, tpr, fdr
    )
)
print(
    "Number of True Groups: {}, Number of Predicted Groups: {}".format(
        ntrue, npredicted
    )
)

# List the features selected in each discovered group.
print("\n\nSelected Features:")
for group_number, features in enumerate(learnt_groups, start=1):
    print("Group: {}, Features: {}".format(group_number, features))

We see that the model finds the features, usually separating features 0 and 1, occasionally grouping them together.