# CAE Notebook

To use this notebook, put the appropriate chemistry data in a folder in Google Drive. After mounting the drive, it is crucial to point the notebook at the correct folder, which is set in the first code cell of this notebook:

foldername = 'gdrive/MyDrive/'

Edit this to have the correct foldername.

We also need to install STG with `pip install` and then restart the runtime.

After that, the notebook is fairly easy to run. Hyperparameters can be changed, and in the Run Experiment part of the notebook one can switch between a synthetic and a chemistry experiment by commenting out the experiment that should not be run.

In [1]:
# Folder (relative to the working directory) that holds the chemistry .npy files;
# it is joined with the file names when the chemistry data is loaded.
foldername = 'gdrive/MyDrive/'

In [2]:
!pip install --user stg 
# If you are running this notebook on Google Colab, please reset the current python environment via 'Runtime -> Restart runtime' after installation.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stg
  Downloading stg-0.1.2.tar.gz (14 kB)
Collecting lifelines
  Downloading lifelines-0.27.3-py3-none-any.whl (349 kB)
[K     |████████████████████████████████| 349 kB 7.3 MB/s 
Collecting autograd-gamma>=0.3
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
Collecting formulaic>=0.2.2
  Downloading formulaic-0.5.2-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.3 MB/s 
Collecting graphlib-backport>=1.0.0
  Downloading graphlib_backport-1.0.3-py3-none-any.whl (5.1 kB)
Collecting typing-extensions>=4.2.0
  Downloading typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Collecting interface-meta>=1.2.0
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: stg, autograd-gamma
  Building wheel for stg (setup.py) ... [?25l[?25hdone
  Created wheel for stg: filename=stg-0.1.2-py3-none-any.whl size=15522 sha2

In [38]:
from stg import STG
import numpy as np
import scipy.stats # for creating a simple dataset 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import torch
from functools import reduce

import os
import os.path as osp

In [4]:
def set_seed(x):
    """Seed NumPy and PyTorch (CPU and CUDA) from an experiment number.

    The value actually used as the seed is ``x * 10000``. cuDNN is also put
    into deterministic, non-benchmarking mode.
    """
    seed = x * 10000
    np.random.seed(seed)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Syn Data Specific

In [5]:
# Logic Rules.

def rule1(x_sample):
    """Label is true when feature 0 or feature 1 exceeds 0.55."""
    threshold = 0.55
    return (x_sample[0] > threshold) or (x_sample[1] > threshold)

def rule2(x_sample):
    """Label is true when x0*x1 or x2*x3 exceeds 0.30."""
    threshold = 0.30
    return (x_sample[0] * x_sample[1] > threshold) or (x_sample[2] * x_sample[3] > threshold)

def rule3(x_sample):
    """Label is true when x0*x1 or x0*x2 exceeds 0.30 (feature 0 is shared by both terms)."""
    threshold = 0.30
    return (x_sample[0] * x_sample[1] > threshold) or (x_sample[0] * x_sample[2] > threshold)

def rule4(x_sample):
    """Label is true when x0*x3 or x6*x9 exceeds 0.30."""
    threshold = 0.30
    return (x_sample[0] * x_sample[3] > threshold) or (x_sample[6] * x_sample[9] > threshold)

# Sampling rules

def normal_sample(nsamples, nfeatures):
    """Return an ``(nsamples, nfeatures)`` array of independent standard-normal draws."""
    shape = (nsamples, nfeatures)
    return np.random.normal(size=shape)

def correlated_sample(nsamples, nfeatures):
    """Draw samples whose first twelve features form four highly correlated triplets.

    Columns 0-2, 3-5, 6-8 and 9-11 are each drawn from a zero-mean trivariate
    normal with unit variances and pairwise covariance 0.99. The remaining
    ``nfeatures - 12`` columns are independent standard normals.

    Args:
        nsamples: Number of rows to draw.
        nfeatures: Total number of columns; must be at least 12.

    Returns:
        Array of shape ``(nsamples, nfeatures)``.

    Raises:
        ValueError: If ``nfeatures`` is smaller than 12.
    """
    n_blocks, block_size = 4, 3
    n_correlated = n_blocks * block_size
    # Fail before touching the RNG, with a message that names the real constraint
    # instead of numpy's "negative dimensions are not allowed".
    if nfeatures < n_correlated:
        raise ValueError(
            'correlated_sample needs nfeatures >= {} (four correlated triplets), got {}'.format(
                n_correlated, nfeatures))

    mean = np.zeros(block_size)
    cov = np.array([[1, 0.99, 0.99],
                    [0.99, 1, 0.99],
                    [0.99, 0.99, 1]])

    # Draw the triplets one after another so the random stream is consumed in a fixed order.
    blocks = [np.random.multivariate_normal(mean, cov, size=nsamples)
              for _ in range(n_blocks)]
    xrest = np.random.normal(size=(nsamples, nfeatures - n_correlated))
    return np.concatenate(blocks + [xrest], axis=1)

# Rule id -> function that draws the feature matrix for that rule.
# Rules 1-3 use independent normal features; rule 4 uses the correlated sampler.
sampling_rules = {rule_id: normal_sample for rule_id in (1, 2, 3)}
sampling_rules[4] = correlated_sample

# Rule id -> function that labels a single sample row.
logic_rules = dict(enumerate((rule1, rule2, rule3, rule4), start=1))

# Rule id -> ground-truth feature groups, one index array per OR-ed term of the rule.
gauss_groups = {
    1: [np.array([0]), np.array([1])],
    2: [np.array([0, 1]), np.array([2, 3])],
    3: [np.array([0, 1]), np.array([0, 2])],
    4: [np.array([0, 3]), np.array([6, 9])],
}

# Rule id -> every feature index that the rule reads, in ascending order.
gauss_oracle_features = {
    1: np.array([0, 1]),
    2: np.array([0, 1, 2, 3]),
    3: np.array([0, 1, 2]),
    4: np.array([0, 3, 6, 9]),
}



def make_syn_data(rule, nsamples, nfeatures, train):
    """Generate a synthetic dataset for one logic rule.

    Features come from the sampler registered for ``rule`` in ``sampling_rules``;
    each row is then labelled by the matching function in ``logic_rules``.
    ``train`` is accepted but not used by this function.

    Returns:
        Tuple ``(x_data, y_data)`` of the feature matrix and the per-row labels.
    """
    sampler = sampling_rules[rule]
    x_data = sampler(nsamples, nfeatures)
    labels = list(map(lambda row: logic_rules[rule](row), x_data))
    y_data = np.array(labels)
    return x_data, y_data

# Chem Data Specific

In [6]:
# Mount Google Drive at the relative path 'gdrive' using the Colab helper module.
# `foldername` is expected to point inside this mount.
from google.colab import drive
drive.mount('gdrive')


# Chemistry logic id -> ground-truth feature groups (arrays of feature indices).
# NOTE(review): the groups listed for logic_10 do not line up one-to-one with the
# two clauses in its description — confirm against the dataset definition.
chem_data_groups = {
    # logic_4 = ether OR NOT alkyne
    4: [np.array([40]), np.array([1])],
    # logic_10 = (primary amine AND NOT ether) OR (NOT benzene AND NOT ether)
    10: [np.array([56, 18]), np.array([40])],
    # logic_13 = (benzene AND NOT carbonyl) OR (alkyne AND NOT ether)
    13: [np.array([18, 29]), np.array([1, 40])],
}

# Chemistry logic id -> every feature index used by that logic, in ascending order.
chem_oracle_features = {
    4: np.array([1, 40]),
    10: np.array([18, 40, 56]),
    13: np.array([1, 18, 29, 40]),
}


def make_chem_data(rule, train=True):
    """Load the stored chemistry features and labels for one logic id.

    Reads ``logic_<rule>_X_<split>.npy`` and ``logic_<rule>_Y_<split>.npy`` from
    the module-level ``foldername``, where ``<split>`` is ``train`` when
    ``train`` is truthy and ``test`` otherwise.

    Returns:
        Tuple ``(x_data, y_data)`` of the two loaded arrays.
    """
    split = 'train' if train else 'test'

    def load(kind):
        filename = 'logic_{}_{}_{}.npy'.format(rule, kind, split)
        return np.load(osp.join(foldername, filename))

    x_data = load('X')
    y_data = load('Y')
    return x_data, y_data




Mounted at gdrive


# Metrics

In [7]:
def gsim(true_groups, predicted_groups):
    """Group similarity between ground-truth and discovered feature groups.

    For every true group the best Jaccard index against any predicted group is
    taken; these best scores are summed and divided by the larger of the two
    group counts.

    Returns:
        Tuple ``(gsim, number of true groups, number of predicted groups)``.
        The score is ``-1`` when no ground truth is given and ``0`` when
        nothing was discovered.
    """
    n_true, n_pred = len(true_groups), len(predicted_groups)

    if n_true == 0:  # i.e. we don't know the ground truth.
        return -1, n_true, n_pred
    if n_pred == 0:  # We didn't find anything.
        return 0, n_true, n_pred

    def jaccard(group, other):
        return np.intersect1d(group, other).size / np.union1d(group, other).size

    best_matches = (
        max(jaccard(g, g_hat) for g_hat in predicted_groups)
        for g in true_groups
    )
    score = sum(best_matches) / max(n_true, n_pred)
    return score, n_true, n_pred


def tpr_fdr(true_groups, predicted_groups):
    """True positive rate and false discovery rate of the selected features, in percent.

    Both group lists are flattened into sets of unique feature indices before
    comparison, so group structure is ignored here.

    Args:
        true_groups: List of index arrays with the ground-truth features.
        predicted_groups: List of index arrays with the discovered features.

    Returns:
        Tuple ``(tpr, fdr)``:
        * ``(-1, -1)`` when the ground truth is unknown (no true groups, or
          true groups that contain no features);
        * ``(0.0, 0.0)`` when nothing was discovered (no predicted groups, or
          predicted groups that contain no features);
        * otherwise ``tpr = 100 * |pred ∩ true| / |true|`` and
          ``fdr = 100 * |pred \\ true| / |pred|``.
    """
    if len(true_groups) == 0:  # Ground truth not known.
        return -1, -1

    if len(predicted_groups) == 0:
        return 0.0, 0.0

    predicted_features = np.unique(reduce(np.union1d, predicted_groups))
    true_features = np.unique(reduce(np.union1d, true_groups))

    # A non-empty list can still hold only empty arrays (e.g. a model that
    # selected no feature at all); guard both denominators against zero.
    if true_features.size == 0:
        return -1, -1
    if predicted_features.size == 0:
        return 0.0, 0.0

    overlap = np.intersect1d(predicted_features, true_features).size
    tpr = 100 * overlap / true_features.size
    fdr = 100 * (predicted_features.size - overlap) / predicted_features.size
    return tpr, fdr

# Setup Data

Edit the first cell below to choose which data to use.

In [77]:
# Experiment configuration — edit these lines. `choice` selects the experiment: 'syn' or 'chem'.

choice = 'syn'     # Synthetic experiment; keep exactly one of the two `choice` lines uncommented
#choice = 'chem'     # Chemistry experiment; keep exactly one of the two `choice` lines uncommented
rule = 2  # Rule index; in chem mode the setup cell maps 1/2/3 to a chemistry logic id
experiment_no = 1  # Passed to set_seed() by the setup cell

In [78]:
# Build the train/test splits and hyperparameters for the configured experiment.
set_seed(experiment_no)

if choice == 'syn':
    train_size = 20000
    test_size = 200
    nfeatures = 500
    batchsize = 500
    lr = 0.001
    nepochs = 400
    lam = 0.1

    X_train, y_train = make_syn_data(rule, train_size, nfeatures, train=True)
    X_test, y_test = make_syn_data(rule, test_size, nfeatures, train=False)
    true_groups = gauss_groups[rule]

elif choice == 'chem':
    batchsize = 200
    lr = 0.001
    nepochs = 400
    lam = 0.1

    # Configured rule index -> chemistry logic id used in the data file names.
    rules = {
        1: 4,
        2: 10,
        3: 13
    }
    # `rule` is overwritten with the logic id, so make the mapping idempotent:
    # re-running this cell must not fail once `rule` already holds a logic id.
    if rule in rules:
        rule = rules[rule]
    elif rule not in chem_data_groups:
        raise ValueError(
            "rule must be one of {} (or a chemistry logic id in {}), got {!r}".format(
                sorted(rules), sorted(chem_data_groups), rule))

    X_train, y_train = make_chem_data(rule, train=True)
    X_test, y_test = make_chem_data(rule, train=False)
    nfeatures = X_train.shape[-1]
    true_groups = chem_data_groups[rule]

else:
    # Without this branch a typo in `choice` silently reuses stale data or
    # fails later with an unrelated NameError.
    raise ValueError("choice must be 'syn' or 'chem', got {!r}".format(choice))

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(20000, 500)
(20000,)
(200, 500)
(200,)


# Train STG

In [79]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
args_cuda = torch.cuda.is_available()
device = torch.device("cuda" if args_cuda else "cpu")
feature_selection = True

# STG classifier with two hidden layers of 20 units; `lr`, `batchsize` and `lam`
# come from the data setup cell.
model = STG(
    task_type='classification',
    input_dim=X_train.shape[1],
    output_dim=2,
    hidden_dims=[20, 20],
    activation='relu',
    optimizer='Adam',
    learning_rate=lr,
    batch_size=batchsize,
    feature_selection=feature_selection,
    sigma=0.5,
    lam=lam,
    device=device,
)


In [80]:
# Train for `nepochs` epochs, printing progress every 25 epochs.
# NOTE(review): X_test/y_test are passed as the validation set and are also used
# for the final accuracy below, so the validation loss is not an independent check.
model.fit(X_train, y_train, nr_epochs=nepochs, valid_X=X_test, valid_y=y_test, print_interval=25)

Epoch: 25: loss=0.390269 valid_loss=0.227261
Epoch: 50: loss=0.259313 valid_loss=0.163725
Epoch: 75: loss=0.196950 valid_loss=0.133095
Epoch: 100: loss=0.145382 valid_loss=0.117209
Epoch: 125: loss=0.132095 valid_loss=0.090211
Epoch: 150: loss=0.133133 valid_loss=0.096924
Epoch: 175: loss=0.122266 valid_loss=0.081826
Epoch: 200: loss=0.114965 valid_loss=0.080297
Epoch: 225: loss=0.115604 valid_loss=0.068075
Epoch: 250: loss=0.128641 valid_loss=0.083237
Epoch: 275: loss=0.112951 valid_loss=0.072917
Epoch: 300: loss=0.115203 valid_loss=0.065719
Epoch: 325: loss=0.110870 valid_loss=0.074739
Epoch: 350: loss=0.109901 valid_loss=0.068278
Epoch: 375: loss=0.153731 valid_loss=0.064726
Epoch: 400: loss=0.109299 valid_loss=0.060470


## Testing STG

In [81]:
# Features whose gate value is non-zero after integer truncation count as selected.
# NOTE(review): astype(int) truncates toward zero, so every gate value strictly
# between -1 and 1 becomes 0 — presumably the gate probabilities lie in [0, 1],
# which would make this a strict "gate fully open" cut-off; confirm this is intended.
gate_mask = model.get_gates(mode='prob').astype(int)
selected = np.where(gate_mask)[0]

y_pred = model.predict(X_test)
acc = 100 * np.mean(y_pred == y_test)

# The selected features are scored as one single discovered group.
discovered_groups = [selected]
tpr, fdr = tpr_fdr(true_groups, discovered_groups)
group_similarity, num_true_groups, num_discovered_groups = gsim(true_groups, discovered_groups)

report_lines = [
    'Stg Performance:',
    'Accuracy: {:.3f}%'.format(acc),
    'Selected features: {}'.format(selected),
    'TPR: {:.3f}%'.format(tpr),
    'FDR: {:.3f}%'.format(fdr),
    'Gsim: {:.3f}'.format(group_similarity),
    'Num True Groups: {}'.format(num_true_groups),
    'Num Discovered Groups: {}'.format(num_discovered_groups),
]
print('\n'.join(report_lines))

Stg Performance:
Accuracy: 97.500%
Selected features: [0 1 2 3]
TPR: 100.000%
FDR: 0.000%
Gsim: 0.500
Num True Groups: 2
Num Discovered Groups: 1
