# Group Lasso Notebook

To use this notebook, put the appropriate Chemistry Data in a folder in Google Drive. After the drive is mounted, it is crucial to point the notebook at that folder, which is done in the first code cell of this notebook:

foldername = 'gdrive/MyDrive/'

Edit this to have the correct foldername.

Also install the group-lasso package by running the second code cell of this notebook.

After that it is fairly easy to run. Hyperparameters can be changed, and in the Run Experiment part of the notebook one can run a synthetic or chemistry experiment easily with:

run_syn(rule=4, experiment_no=1)

or

run_chem(rule=2, experiment_no=1)

In [None]:
# Google Drive folder, relative to the 'gdrive' mount point, that holds the chemistry .npy files.
foldername = 'gdrive/MyDrive/'

In [67]:
!pip install group-lasso

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [88]:
from scipy.special import comb
import numpy as np
import itertools
from functools import reduce

import os
import os.path as osp

from group_lasso import LogisticGroupLasso
# Class-wide switch, set before any estimator is created; presumably makes fits
# record their loss values — confirm against the group-lasso documentation.
LogisticGroupLasso.LOG_LOSSES = True

In [79]:
# Colab only: mount Google Drive at the relative path 'gdrive'; foldername points below this mount.
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


# Change the hyperparameters here:

In [132]:
max_group_size = 2        # Max group size, to prevent the problem getting exponentially large.
n_iterations_syn = 1000   # Max number of iterations on synthetic data before Group Lasso stops.
n_iterations_chem = 100   # Max number of iterations on chemistry data before Group Lasso stops.
eps = 0.005               # A group is selected if its weight norm ||w||_2 > eps.

# Functions for Running

In [119]:
def gsim(true_groups, predicted_groups):
    """Group similarity between the true and the predicted feature groups.

    Each true group is matched with the predicted group that has the highest
    Jaccard index; the best scores are summed and divided by the larger of
    the two group counts.

    Returns a tuple (similarity, number of true groups, number of predicted
    groups). The similarity is -1 when there are no true groups (ground truth
    unknown) and 0 when nothing was predicted.
    """
    n_true, n_predicted = len(true_groups), len(predicted_groups)

    if n_true == 0:  # Ground truth not known.
        return -1, n_true, n_predicted
    if n_predicted == 0:  # Nothing was discovered.
        return 0, n_true, n_predicted

    def _jaccard(left, right):
        return np.intersect1d(left, right).size / np.union1d(left, right).size

    total = 0
    for truth in true_groups:
        best = 0
        for candidate in predicted_groups:
            score = _jaccard(truth, candidate)
            if score == 1:
                # Exact match: no other candidate can do better.
                best = 1
                break
            if score > best:
                best = score
        total += best

    return total / max(n_true, n_predicted), n_true, n_predicted


def tpr_fdr(true_groups, predicted_groups):
    """Feature-level true positive rate and false discovery rate, in percent.

    Groups are flattened to the set of features they mention before the two
    sets are compared. Returns (-1, -1) when there are no true groups (ground
    truth unknown) and (0.0, 0.0) when nothing was predicted.
    """
    if not len(true_groups):  # Ground truth not known.
        return -1, -1
    if not len(predicted_groups):
        return 0.0, 0.0

    def _distinct_features(groups):
        return np.unique(reduce(np.union1d, groups))

    found = _distinct_features(predicted_groups)
    expected = _distinct_features(true_groups)

    n_hits = np.intersect1d(found, expected).size
    n_found = len(found)

    true_positive_rate = 100 * n_hits / len(expected)
    false_discovery_rate = 100 * (n_found - n_hits) / n_found
    return true_positive_rate, false_discovery_rate

In [120]:
# Logic Rules

def rule1(x_sample):
    """Positive when feature 0 or feature 1 is above 0.55."""
    threshold = 0.55
    first_is_high = x_sample[0] > threshold
    return first_is_high or (x_sample[1] > threshold)

def rule2(x_sample):
    """Positive when the product of features 0 and 1, or of features 2 and 3, is above 0.30."""
    cutoff = 0.30
    first_pair_fires = x_sample[0] * x_sample[1] > cutoff
    return first_pair_fires or (x_sample[2] * x_sample[3] > cutoff)

def rule3(x_sample):
    """Positive when the product of features 0 and 1, or of features 0 and 2, is above 0.30."""
    cutoff = 0.30
    first_pair_fires = x_sample[0] * x_sample[1] > cutoff
    return first_pair_fires or (x_sample[0] * x_sample[2] > cutoff)

def rule4(x_sample):
    """Positive when the product of features 0 and 3, or of features 6 and 9, is above 0.30."""
    cutoff = 0.30
    first_pair_fires = x_sample[0] * x_sample[3] > cutoff
    return first_pair_fires or (x_sample[6] * x_sample[9] > cutoff)

In [121]:
# Sampling rules

def normal_sample(nsamples, nfeatures):
    """Draw an (nsamples, nfeatures) matrix of independent standard normal values from NumPy's global RNG."""
    shape = (nsamples, nfeatures)
    return np.random.normal(loc=0.0, scale=1.0, size=shape)

def correlated_sample(nsamples, nfeatures):
    """Draw samples whose first 12 features form four strongly correlated triples.

    Features 0-2, 3-5, 6-8 and 9-11 are each drawn from a zero-mean
    3-dimensional normal with unit variances and pairwise covariance 0.99.
    The remaining nfeatures - 12 columns are independent standard normals.
    All draws use NumPy's global RNG.
    """
    n_blocks, block_width = 4, 3

    # Unit variance on the diagonal, 0.99 everywhere else.
    block_cov = np.full((block_width, block_width), 0.99)
    np.fill_diagonal(block_cov, 1)
    block_mean = np.zeros(block_width)

    # Correlated blocks are drawn first, then the independent remainder.
    columns = [
        np.random.multivariate_normal(block_mean, block_cov, size=nsamples)
        for _ in range(n_blocks)
    ]
    n_independent = nfeatures - n_blocks * block_width
    columns.append(np.random.normal(size=(nsamples, n_independent)))

    return np.concatenate(columns, axis=1)

In [122]:
def run_base_experiment(X_train, y_train, X_test, y_test, true_groups, reg_param, n_iterations):
    """Fit a logistic group lasso over all small feature subsets and report the selected ones.

    Every combination of 1..max_group_size feature indices is one candidate
    group. A feature's column is copied once for every candidate group that
    contains it, so each column of the expanded matrix belongs to exactly one
    group.

    Parameters
    ----------
    X_train, X_test : 2-D arrays indexed as [sample, feature].
    y_train, y_test : label arrays compared element-wise with the predictions.
    true_groups : list of index arrays with the ground-truth groups; an empty
        list means the ground truth is unknown.
    reg_param : value passed to the estimator as ``group_reg``.
    n_iterations : value passed to the estimator as ``n_iter``.

    Reads the module-level ``max_group_size`` and ``eps``. Prints all results
    and returns None.
    """
    # Construct groups - duplicate features.
    print('Constructing Groups')
    n_features = X_train.shape[1]
    groups_tuple = [
        subset
        for size in range(1, max_group_size + 1)
        for subset in itertools.combinations(range(n_features), size)
    ]
    # groups[j] is the group id of expanded column j;
    # source_columns[j] is the original feature that column copies.
    groups = [g_num for g_num, subset in enumerate(groups_tuple) for _ in subset]
    source_columns = np.array([idx for subset in groups_tuple for idx in subset], dtype=int)

    # One indexing pass per matrix; re-concatenating the growing matrix for
    # every group copies it once per group.
    X_group_lasso_train = np.ascontiguousarray(X_train[:, source_columns])
    X_group_lasso_test = np.ascontiguousarray(X_test[:, source_columns])
    print('Constructing Groups complete')

    print('X_Group_Train shape: {}'.format(X_group_lasso_train.shape))
    print('y_Train shape: {}'.format(y_train.shape))
    print('X_Group_Test shape: {}'.format(X_group_lasso_test.shape))
    print('y_Test shape: {}'.format(y_test.shape))
    print('Number of possible groups: {}\n'.format(len(groups_tuple)))

    gl = LogisticGroupLasso(
        groups=groups,
        n_iter=n_iterations,
        group_reg=reg_param,
        l1_reg=0.0,
        scale_reg='group_size',
        supress_warning=True,
    )

    print('Training started')
    gl.fit(X_group_lasso_train, y_train)
    print('Training complete\n')

    # Extract info from estimator
    pred = gl.predict(X_group_lasso_test)

    # Compute performance metrics
    accuracy = (pred == y_test).mean()

    # Collapse the two coefficient columns into one weight per expanded column.
    w_hat = gl.coef_
    w_hat = w_hat[:, 1] - w_hat[:, 0]

    # L2 norm of the weights inside each group.
    group_norms = np.bincount(groups, weights=w_hat**2, minlength=groups[-1] + 1) ** 0.5

    chosen_groups = np.where(group_norms > eps)[0]
    # Training accuracy (in percent) of the linear score without an intercept.
    check_acc = 100*(((X_group_lasso_train @ w_hat)>0)==y_train).mean()

    # Print results.
    print('Results:\n')
    print(f"Test Accuracy: {accuracy}")
    print(f"Check Test Accuracy is Close to Train Accuracy: {check_acc}")
    print('Chosen Groups:')
    for g in chosen_groups:
        print('     Group: {}, {}'.format(g, groups_tuple[g]))

    discovered_groups = [np.array(list(groups_tuple[g])) for g in chosen_groups]

    tpr, fdr = tpr_fdr(true_groups, discovered_groups)
    group_sim, ntrue, npredicted = gsim(true_groups, discovered_groups)

    print('\nGroup Similarity: {:.3f}\nTrue Positive Rate: {:.3f}%\nFalse Discovery Rate: {:.3f}%'.format(group_sim, tpr, fdr))
    print('Number of True Groups: {}\nNumber of Predicted Groups: {}'.format(ntrue, npredicted))

In [123]:
def run_syn(rule, experiment_no):
    """Run one synthetic experiment.

    Parameters
    ----------
    rule : int in 1..4; selects the labelling rule, the sampler, the
        ground-truth groups and the regularisation strength.
    experiment_no : int used to seed NumPy's global RNG before sampling.

    Draws 1000 training and 100 test samples with 60 features, labels them
    with the chosen rule and passes everything to run_base_experiment
    together with the module-level n_iterations_syn.
    """
    print('Running Syn{}'.format(rule))
    n_iterations = n_iterations_syn
    gauss_gt_groups = {
        1: [np.array([0]), np.array([1])],
        2: [np.array([0, 1]), np.array([2, 3])],
        3: [np.array([0, 1]), np.array([0, 2])],
        4: [np.array([0, 3]), np.array([6, 9])]
    }

    group_regs = {
        1: 0.1,
        2: 0.04,
        3: 0.04,
        4: 0.04
    }

    sampling_rules = {
        1: normal_sample,
        2: normal_sample,
        3: normal_sample,
        4: correlated_sample
    }

    logic_rules = {
        1: rule1,
        2: rule2,
        3: rule3,
        4: rule4
    }

    np.random.seed(experiment_no)
    X_train = sampling_rules[rule](nsamples=1000, nfeatures=60)
    y_train = np.array([logic_rules[rule](x) for x in X_train])
    X_test = sampling_rules[rule](nsamples=100, nfeatures=60)
    y_test = np.array([logic_rules[rule](x) for x in X_test])

    true_groups = gauss_gt_groups[rule]
    reg_param = group_regs[rule]
    # n_iterations is a required argument of run_base_experiment; leaving it
    # out raises a TypeError.
    run_base_experiment(X_train, y_train, X_test, y_test, true_groups, reg_param, n_iterations)

In [124]:
def run_chem(rule, experiment_no):
    """Run one chemistry experiment.

    Parameters
    ----------
    rule : int in 1..3; mapped to the logic id (4, 10 or 13) used in the data
        file names.
    experiment_no : int used to seed NumPy's global RNG.

    Loads the logic_<id>_{X,Y}_{train,test}.npy files from the module-level
    foldername and passes them to run_base_experiment together with the
    module-level n_iterations_chem.
    """
    print('Running Chem{}'.format(rule))
    n_iterations = n_iterations_chem
    np.random.seed(experiment_no)

    # Experiment number used in the notebook -> logic id used in the file names.
    logic_id = {1: 4, 2: 10, 3: 13}[rule]

    chem_data_groups = {
        # logic_4 = ether OR NOT alkyne
        4: [np.array([40]), np.array([1])],
        # logic_10 = (primary amine AND NOT ether) OR (NOT benzene AND NOT ether)
        10: [np.array([56, 18]), np.array([40])],
        # logic_13 = (benzene AND NOT carbonyl) OR (alkyne AND NOT ether)
        13: [np.array([18, 29]), np.array([1, 40])],
    }

    group_regs = {4: 0.04, 10: 0.04, 13: 0.04}

    def _load(suffix):
        return np.load(osp.join(foldername, 'logic_' + str(logic_id) + suffix))

    X_train = _load('_X_train.npy')
    y_train = _load('_Y_train.npy')

    X_test = _load('_X_test.npy')
    y_test = _load('_Y_test.npy')

    run_base_experiment(
        X_train, y_train, X_test, y_test,
        chem_data_groups[logic_id], group_regs[logic_id], n_iterations,
    )


# Run Experiment

In [133]:
# Synthetic experiment for rule 1; experiment_no seeds NumPy's global RNG.
run_syn(rule=1, experiment_no=1)

Running Syn1
Constructing Groups
Constructing Groups complete
X_Group_Train shape: (1000, 3600)
y_Train shape: (1000,)
X_Group_Test shape: (100, 3600)
y_Test shape: (100,)
Number of possible groups: 1830

Training started
Training complete

Results:

Test Accuracy: 0.79
Check Test Accuracy is Close to Train Accuracy: 80.9
Chosen Groups:
     Group: 0, (0,)
     Group: 1, (1,)
     Group: 60, (0, 1)

Group Similarity: 0.667
True Positive Rate: 100.000%
False Discovery Rate: 0.000%
Number of True Groups: 2
Number of Predicted Groups: 3


You used subsampling then this is expected, otherwise, try increasing the number of iterations or decreasing the tolerance.


In [134]:
# Chemistry experiment 1 (loads the logic_4 data files); experiment_no seeds NumPy's global RNG.
run_chem(rule=1, experiment_no=1)

Running Chem1
Constructing Groups
Constructing Groups complete
X_Group_Train shape: (3861, 7056)
y_Train shape: (3861,)
X_Group_Test shape: (466, 7056)
y_Test shape: (466,)
Number of possible groups: 3570

Training started
Training complete

Results:

Test Accuracy: 1.0
Check Test Accuracy is Close to Train Accuracy: 75.26547526547527
Chosen Groups:
     Group: 1, (1,)
     Group: 40, (40,)
     Group: 205, (1, 40)

Group Similarity: 0.667
True Positive Rate: 100.000%
False Discovery Rate: 0.000%
Number of True Groups: 2
Number of Predicted Groups: 3


You used subsampling then this is expected, otherwise, try increasing the number of iterations or decreasing the tolerance.
