## Notes 
### Required installing Oracle JAVA 8 to get javabridge installed
### Then, I was able to install py-causal from https://bd2kccd.github.io/docs/py-causal/
### GFCI is slower than RFCI but more accurate (Spirtes). GFCI and RFCI account for unobserved variables; FGES assumes there are no unobserved variables.

Structure Learning Performance Guarantees: If the assumptions in the previous section hold, then in the large sample limit, the CBN structure output by GFCI will contain an edge of one of four kinds between X and Y if and only if X and Y are not independent conditional on any subset of the other measured variables of less than or equal to a specified size. In addition, there is (1) an arc X -> Y if and only if X directly or indirectly causes Y, and Y does not directly or indirectly cause X; (2) an edge X <--> Y if and only if X is not a direct or indirect cause of Y and Y is not a direct or indirect cause of X (which can only occur if there are latent confounders of X and some other variable or Y and some other variable); (3) an edge X o-> Y only if Y is not a direct or indirect cause of X, but X may or may not be an indirect cause of Y; (4) an edge X o–o Y indicates that X and Y are dependent no matter what subset of observed variables is conditioned on, but contains no orientation information (X may be a direct or indirect cause of Y, and Y may be an indirect cause of X, or there may be a latent common cause of X and Y).

# Trying some various ML models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import roc_auc_score, average_precision_score, mean_squared_error, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
from pycausal import search as s
import configparser
import random
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, average_precision_score
from keras.models import load_model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, Callback
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.models import load_model, Model
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, BatchNormalization, \
                        Input, Dense, GlobalAveragePooling2D, Dropout
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.utils import to_categorical
from collections import Counter
import keras.optimizers
from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
import glob, os
import tensorflow as tf
import pandas as pd
from random import shuffle

# Select the GPU to expose to CUDA here.
# Comment the assignment out to leave every GPU visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Stretch the notebook container to the full browser width.
from IPython.core.display import display, HTML
_FULL_WIDTH_CSS = "<style>.container { width:100% !important; }</style>"
display(HTML(_FULL_WIDTH_CSS))


def get_model(dense, dropouts, inputs, target_len):
    """Build an uncompiled dense softmax classifier with dropout after every hidden layer.

    dense      -- ordered list of hidden-layer widths, e.g. [1024, 2048, 1024]
    dropouts   -- ordered list of dropout rates, e.g. [0.2, 0.3, 0.4]; it is
                  zipped against ``dense``, so surplus entries are ignored
    inputs     -- number of input features
    target_len -- number of softmax output units

    Returns the assembled ``keras.Model``.
    """
    net_input = keras.Input(shape=(inputs,))

    # The first hidden layer is mandatory (indexing raises on empty lists);
    # the remaining ones are paired up by zip.
    layer_specs = [(dense[0], dropouts[0])]
    layer_specs.extend(zip(dense[1:], dropouts[1:]))

    hidden = net_input
    for width, rate in layer_specs:
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        # Dropout is called with training=True, i.e. it is applied on every call.
        hidden = keras.layers.Dropout(rate)(hidden, training=True)

    net_output = keras.layers.Dense(target_len, activation='softmax')(hidden)
    return keras.Model(net_input, net_output)


def discrete_gauss(low, high, samples, std = 20):
    """Draw ``samples`` integers from [low, high) with Gaussian-shaped weights.

    Each integer k in ``np.arange(low, high)`` is weighted by the probability
    mass that a zero-mean normal distribution with standard deviation ``std``
    assigns to the interval [k - 0.5, k + 0.5]. The weights are renormalised
    so they sum to 1 over the requested range.

    low, high -- bounds of the integer range (``high`` is exclusive)
    samples   -- number of values to draw (passed to ``np.random.choice``)
    std       -- standard deviation of the underlying normal distribution

    Returns a NumPy array of the drawn integers.
    """
    # The file never imports scipy.stats under the name `ss`, so the original
    # body raised NameError; import it locally to keep the block self-contained.
    from scipy import stats as ss

    x = np.arange(low, high)
    upper, lower = x + 0.5, x - 0.5
    prob = ss.norm.cdf(upper, scale=std) - ss.norm.cdf(lower, scale=std)
    prob = prob / prob.sum()  # normalize the probabilities so their sum is 1
    return np.random.choice(x, size=samples, p=prob)



def bar_plot(x_ax, val1, val1std, val2, val2std):
    """Draw two bars per entry of ``x_ax``, save the figure and show it.

    x_ax    -- tick labels, one per bar group
    val1    -- heights of the gray bars (legend entry 'Accuracy')
    val1std -- error-bar sizes drawn on the gray bars
    val2    -- heights of the blue bars (legend entry '% Violations')
    val2std -- accepted for symmetry but not used: the blue bars are drawn
               without error bars

    The figure is written to "violations.pdf" in the working directory.
    """
    bar_width = 0.35                       # the width of the bars
    positions = np.arange(len(x_ax))       # the x locations for the groups

    fig = plt.figure()
    ax = fig.add_subplot(111)
    fig.set_size_inches(18.5, 10.5)

    ## the bars
    first_bars = ax.bar(positions, val1, bar_width,
                        color='gray',
                        yerr=val1std,
                        error_kw={'elinewidth': 2, 'ecolor': 'blue'})
    second_bars = ax.bar(positions + bar_width, val2, bar_width,
                         color='blue',
                         error_kw={'elinewidth': 2, 'ecolor': 'gray'})

    ## axes and labels
    ax.set_xlim(-bar_width, len(positions) + bar_width)
    ax.set_ylabel('Percentage')
    ax.set_title('')
    plt.xticks(positions + bar_width / 2, x_ax, rotation=75, size=14)

    ## legend, then persist and display
    ax.legend((first_bars[0], second_bars[0]), ('Accuracy', '% Violations'))
    fig.savefig("violations.pdf", bbox_inches='tight')
    plt.show()

import pandas as pd
import numpy as np

def normalize(x):
    """Min-max scale every column of ``x`` to the range [0, 1].

    x -- 2-D array-like; statistics are taken along axis 0 (per column)

    Returns an array of the same shape. A column whose values are all equal
    has a zero range, so the division yields nan/inf for that column (this
    matches the previous behaviour).
    """
    x = np.asarray(x)
    # np.ptp() replaces the ndarray.ptp() method, which NumPy 2.0 removed.
    return (x - x.min(axis=0)) / np.ptp(x, axis=0)

def gen_data(mean = 0, var = 1, SIZE = 20000):
    """Sample a synthetic dataset of SIZE rows from a fixed generative chain.

    bmi -> estrogen -> {insomnia, density}, age -> genes -> density, and
    ``cancer`` is a Bernoulli draw whose rate depends on whether ``density``
    lies above its sample mean (0.08) or not (0.01).

    mean -- centre of the additive noise terms
    var  -- passed to ``np.random.normal`` as the noise scale
    SIZE -- number of rows

    Returns a DataFrame with columns bmi, density, age, cancer, estrogen,
    genes and insomnia.
    """
    def noise(scale):
        # Additive disturbance shared by all derived variables.
        return np.random.normal(mean, scale, SIZE)

    # bmi uses a fixed mean/standard deviation (25, 5).
    bmi = np.random.normal(25, 5, SIZE)
    estrogen = np.random.normal(bmi, 10) + noise(var)

    age = np.random.normal(55, 10, SIZE)
    genes = np.random.normal(age, 10, SIZE) + noise(var)

    insomnia = np.random.normal(estrogen, 8, SIZE) + noise(var)
    density = (np.random.normal(estrogen, 4, SIZE)
               + np.random.normal(genes, 12, SIZE)
               + noise(var + 10))

    threshold = np.mean(density)
    above = density > threshold
    not_above = density <= threshold

    cancer = np.zeros_like(density)
    cancer[above] = np.random.binomial(n=1, p=0.08, size=int(np.count_nonzero(above)))
    cancer[not_above] = np.random.binomial(n=1, p=0.01, size=int(np.count_nonzero(not_above)))

    columns = {
        'bmi': bmi,
        'density': density,
        'age': age,
        'cancer': cancer,
        'estrogen': estrogen,
        'genes': genes,
        'insomnia': insomnia,
    }
    return pd.DataFrame(columns)

def gen_data_perturbed(mean = 0, var = 1, SIZE = 20000):
    """Sample SIZE rows from an alternative chain with an ``income`` column.

    NOTE: the file defines another ``gen_data_perturbed`` directly after this
    one; that later definition rebinds the name, so this version is not the
    one that code further down actually calls.

    mean -- centre of the additive noise on ``density``
    var  -- passed to ``np.random.normal`` as a scale
    SIZE -- number of rows

    Prints the sample mean of ``density`` and returns a DataFrame with
    columns bmi, density, age, cancer and income.
    """
    bmi = np.random.normal(30, 3, SIZE)

    age_base = np.random.normal(60, 14, SIZE)
    age = age_base + np.random.normal(-bmi, var, SIZE)

    income_base = np.random.normal(age, var, SIZE)
    income = income_base + np.random.normal(10, 12, SIZE)

    density = (np.random.normal(-bmi, var, SIZE)
               + np.random.normal(-age, var, SIZE)
               + np.random.normal(mean, var, SIZE))

    threshold = np.mean(density)
    print(threshold)

    above = density > threshold
    not_above = density <= threshold
    cancer = np.zeros_like(density)
    cancer[above] = np.random.binomial(n=1, p=0.08, size=int(np.count_nonzero(above)))
    cancer[not_above] = np.random.binomial(n=1, p=0.01, size=int(np.count_nonzero(not_above)))

    columns = {
        'bmi': bmi,
        'density': density,
        'age': age,
        'cancer': cancer,
        'income': income,
    }
    return pd.DataFrame(columns)
def gen_data_perturbed(mean = 2, var = 5, SIZE = 20000):
    """Sample SIZE rows from the bmi/estrogen/age/genes chain with shifted noise.

    The structure is bmi -> estrogen -> {insomnia, density} and
    age -> genes -> density; only the noise defaults (mean = 2, var = 5)
    set this generator apart. ``cancer`` is drawn with rate 0.08 where
    ``density`` exceeds its sample mean and 0.01 elsewhere.

    mean -- centre of the additive noise terms
    var  -- passed to ``np.random.normal`` as the noise scale
    SIZE -- number of rows

    Returns a DataFrame with columns bmi, density, age, cancer, estrogen,
    genes and insomnia.
    """
    # bmi uses a fixed mean/standard deviation (25, 5).
    bmi = np.random.normal(25, 5, SIZE)
    estrogen_signal = np.random.normal(bmi, 10)
    estrogen = estrogen_signal + np.random.normal(mean, var, SIZE)

    age = np.random.normal(55, 10, SIZE)
    genes_signal = np.random.normal(age, 10, SIZE)
    genes = genes_signal + np.random.normal(mean, var, SIZE)

    insomnia_signal = np.random.normal(estrogen, 8, SIZE)
    insomnia = insomnia_signal + np.random.normal(mean, var, SIZE)

    density_parts = [
        np.random.normal(estrogen, 4, SIZE),
        np.random.normal(genes, 12, SIZE),
        np.random.normal(mean, var + 10, SIZE),
    ]
    density = density_parts[0] + density_parts[1] + density_parts[2]

    cutoff = np.mean(density)
    is_dense = density > cutoff
    is_sparse = density <= cutoff

    cancer = np.zeros_like(density)
    cancer[is_dense] = np.random.binomial(n=1, p=0.08, size=int(is_dense.sum()))
    cancer[is_sparse] = np.random.binomial(n=1, p=0.01, size=int(is_sparse.sum()))

    return pd.DataFrame({
        'bmi': bmi,
        'density': density,
        'age': age,
        'cancer': cancer,
        'estrogen': estrogen,
        'genes': genes,
        'insomnia': insomnia,
    })

def get_CG(df, tetrad):
    """Run the 'gfci' search on ``df`` and return the resulting Tetrad graph.

    df     -- data handed to the runner as ``dfs``
    tetrad -- runner object exposing ``run(**options)`` and ``getTetradGraph()``

    The search uses 'sem-bic' as both test and score on continuous data.
    (An 'fges-mb' variant used to be kept here as commented-out code.)
    """
    search_options = {
        'algoId': 'gfci',
        'dfs': df,
        'testId': 'sem-bic',
        'scoreId': 'sem-bic',
        'dataType': 'continuous',
        'structurePrior': 1.0,
        'samplePrior': 1.0,
        'maxDegree': -1,
        'maxPathLength': -1,
        'completeRuleSetUsed': False,
        'faithfulnessAssumed': True,
        'verbose': True,
    }
    tetrad.run(**search_options)
    return tetrad.getTetradGraph()



from pycausal.pycausal import pycausal as pc
from collections import defaultdict
# Instantiate py-causal (the instance rebinds the imported class name `pc`),
# start its VM with a 5000M max heap and create the Tetrad runner that the
# search helpers in this file use as a global.
pc = pc()
pc.start_vm(java_max_heap_size='5000M')
tetrad = s.tetradrunner()

# Verbosity level passed to the Keras callbacks and fit() calls.
verbosity = 1

# Every model uses the same hidden-layer specification; all entries of
# `models` refer to the one `model_layers` list object.
num_models = 100
model_layers = [1024, 512]
models = [model_layers] * num_models
# Checkpoint path for each model: temp/a0, temp/a1, ...
model_names = ['temp/a' + str(i) for i in range(num_models)]

print(models, model_names)

from pycausal import prior as p
def get_bic(df, prior):
    """Run 'gfci' on ``df`` under ``prior`` and return the BIC parsed from the graph.

    df    -- data handed to the module-level ``tetrad`` runner as ``dfs``
    prior -- prior-knowledge object passed as ``priorKnowledge``

    The value is extracted from the string form of the graph's attributes:
    the text after the last '=' up to the next '}' is converted to float.
    """
    search_options = {
        'algoId': 'gfci',
        'dfs': df,
        'scoreId': 'sem-bic-deterministic',
        'dataType': 'continuous',
        'structurePrior': 1.0,
        'samplePrior': 1,
        'maxDegree': -1,
        'maxPathLength': -1,
        'priorKnowledge': prior,
        'completeRuleSetUsed': False,
        'faithfulnessAssumed': True,
        'verbose': True,
        'penaltyDiscount': 2,
    }
    tetrad.run(**search_options)

    attributes_text = tetrad.getTetradGraph().getAllAttributes().toString()
    after_last_equals = attributes_text.split('=')[-1]
    value_text = after_last_equals.split('}')[0]
    return float(value_text)  # not divided by len(df)
import itertools
def get_pairs(lst):
    """Return the set of all ordered 2-tuples drawn from distinct positions of ``lst``."""
    return set(itertools.permutations(lst, 2))

# Feature and target columns used by every model in this notebook.
inputs = ['bmi', 'density', 'age', 'genes', 'insomnia', 'estrogen']
target = ['cancer']

# Prior knowledge for the causal search: the edges below are required,
# every other ordered pair of variables is forbidden.
full_conx = get_pairs(inputs + target)
forced_conx = {('age', 'genes'), ('bmi', 'estrogen'), ('estrogen', 'genes'),
               ('estrogen', 'insomnia'), ('estrogen', 'density'),
               ('genes', 'density'), ('density', 'cancer')}
restricted_conx = full_conx.difference(forced_conx)

prior = p.knowledge(requiredirect=list(map(list, forced_conx)),
                    forbiddirect=list(map(list, restricted_conx)))

# Training set: keep every positive row and an equal number of negatives
# (the first len(cancer_df) negative rows) so the classes are balanced.
df = gen_data(SIZE = 200000)
cancer_df = df[df['cancer'] == 1]
ben_df = df[df['cancer'] == 0][:len(cancer_df)]

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# and also works on older pandas versions.
df = pd.concat([cancer_df, ben_df], ignore_index=True)
print(len(cancer_df), len(ben_df), len(df))

X = df[inputs].values
X = normalize(X)
y = df[target].values
y = to_categorical(y)

# Validation set: a fresh, unbalanced sample.
# NOTE(review): it is min-max scaled with its own statistics rather than the
# training set's — confirm this is intended.
val_df = gen_data(SIZE = 2000)

x_val = val_df[inputs].values
x_val = normalize(x_val)
y_val = val_df[target].values
y_val = to_categorical(y_val)

# Last expression of the cell: the notebook displays the BIC of the training frame.
get_bic(df,prior)

  from numpy.core.umath_tests import inner1d
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[[1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512], [1024, 512]

-507936.57097312226

In [2]:
# Train one model per entry of `model_names`. A list entry in `models` is a
# hidden-layer specification for a Keras network; anything else is treated as
# an estimator object with its own fit().
for idx, model_name in enumerate(model_names):
    print(model_name)

    if isinstance(models[idx], list):
        # clear session so graphs from earlier iterations do not accumulate
        keras.backend.clear_session()

        # One dropout rate per hidden layer. (The list used to be sized by
        # len(models), i.e. the number of models; get_model() zips the two
        # lists, so the surplus was silently dropped.)
        layer_sizes = models[idx]
        model = get_model(layer_sizes, [0.2] * len(layer_sizes), len(inputs), 2)

        # Make sure the checkpoint directory exists before Keras writes to it.
        checkpoint_dir = os.path.dirname(model_name)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

        # Keep only the best weights by validation loss and stop after two
        # epochs without an improvement of at least 0.0001.
        callbacks = [ModelCheckpoint(model_name, verbose= verbosity, monitor='val_loss',save_best_only=True),
                     EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose= verbosity, mode='auto')]
        model.compile(optimizer = optimizers.SGD(lr = 0.01, momentum = 0.9, ), loss='categorical_crossentropy', metrics = ['accuracy'])
        model.fit(X, y, epochs = 20, validation_data = (x_val, y_val), callbacks = callbacks, batch_size = 32, verbose = verbosity)
    else:
        models[idx].fit(X,y)


temp/a0
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.54322, saving model to temp/a0
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.54322
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.54322
Epoch 00003: early stopping
temp/a1
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.70069, saving model to temp/a1
Epoch 2/20

Epoch 00002: val_loss improved from 0.70069 to 0.51265, saving model to temp/a1
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.51265
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.51265
Epoch 00004: early stopping
temp/a2
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.66040, saving model to temp/a2
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.66040
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.66040
Epoch 00003: early stopping
temp/a3
Tra


Epoch 00006: val_loss did not improve from 0.49711
Epoch 00006: early stopping
temp/a9
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.70904, saving model to temp/a9
Epoch 2/20

Epoch 00002: val_loss improved from 0.70904 to 0.52904, saving model to temp/a9
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.52904
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.52904
Epoch 00004: early stopping
temp/a10
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.60594, saving model to temp/a10
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.60594
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.60594
Epoch 00003: early stopping
temp/a11
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.54022, saving model to temp/a11
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.54022
Epoch 3/20

Epoch 


Epoch 00001: val_loss improved from inf to 0.73676, saving model to temp/a17
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.73676
Epoch 3/20

Epoch 00003: val_loss improved from 0.73676 to 0.61127, saving model to temp/a17
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.61127
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.61127
Epoch 00005: early stopping
temp/a18
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.65141, saving model to temp/a18
Epoch 2/20

Epoch 00002: val_loss improved from 0.65141 to 0.61031, saving model to temp/a18
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.61031
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.61031
Epoch 00004: early stopping
temp/a19
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.67608, saving model to temp/a19
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.67608
Epoch 3/


Epoch 00004: val_loss improved from 0.62844 to 0.62050, saving model to temp/a25
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.62050
Epoch 6/20

Epoch 00006: val_loss improved from 0.62050 to 0.55086, saving model to temp/a25
Epoch 7/20

Epoch 00007: val_loss improved from 0.55086 to 0.48074, saving model to temp/a25
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.48074
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.48074
Epoch 00009: early stopping
temp/a26
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.72138, saving model to temp/a26
Epoch 2/20

Epoch 00002: val_loss improved from 0.72138 to 0.62808, saving model to temp/a26
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.62808
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.62808
Epoch 00004: early stopping
temp/a27
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.60378,


Epoch 00004: val_loss did not improve from 0.54235
Epoch 00004: early stopping
temp/a33
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.85128, saving model to temp/a33
Epoch 2/20

Epoch 00002: val_loss improved from 0.85128 to 0.56545, saving model to temp/a33
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.56545
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.56545
Epoch 00004: early stopping
temp/a34
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.48696, saving model to temp/a34
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.48696
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.48696
Epoch 00003: early stopping
temp/a35
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.69111, saving model to temp/a35
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.69111
Epoch 3/20

Epo


Epoch 00005: val_loss did not improve from 0.50503
Epoch 00005: early stopping
temp/a41
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.68766, saving model to temp/a41
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.68766
Epoch 3/20

Epoch 00003: val_loss improved from 0.68766 to 0.64930, saving model to temp/a41
Epoch 4/20

Epoch 00004: val_loss improved from 0.64930 to 0.55805, saving model to temp/a41
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.55805
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.55805
Epoch 00006: early stopping
temp/a42
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.63975, saving model to temp/a42
Epoch 2/20

Epoch 00002: val_loss improved from 0.63975 to 0.61610, saving model to temp/a42
Epoch 3/20

Epoch 00003: val_loss improved from 0.61610 to 0.59412, saving model to temp/a42
Epoch 4/20

Epoch 00004: val_loss imp


Epoch 00006: val_loss did not improve from 0.59041
Epoch 7/20

Epoch 00007: val_loss improved from 0.59041 to 0.56177, saving model to temp/a46
Epoch 8/20

Epoch 00008: val_loss improved from 0.56177 to 0.55236, saving model to temp/a46
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.55236
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.55236
Epoch 00010: early stopping
temp/a47
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.84045, saving model to temp/a47
Epoch 2/20

Epoch 00002: val_loss improved from 0.84045 to 0.56789, saving model to temp/a47
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.56789
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.56789
Epoch 00004: early stopping
temp/a48
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.64298, saving model to temp/a48
Epoch 2/20

Epoch 00002: val_loss improved from 0.64298 to 0.51768


Epoch 00005: val_loss did not improve from 0.48445
Epoch 00005: early stopping
temp/a54
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.68542, saving model to temp/a54
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.68542
Epoch 3/20

Epoch 00003: val_loss improved from 0.68542 to 0.53111, saving model to temp/a54
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.53111
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.53111
Epoch 00005: early stopping
temp/a55
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.73233, saving model to temp/a55
Epoch 2/20

Epoch 00002: val_loss improved from 0.73233 to 0.70411, saving model to temp/a55
Epoch 3/20

Epoch 00003: val_loss improved from 0.70411 to 0.64177, saving model to temp/a55
Epoch 4/20

Epoch 00004: val_loss improved from 0.64177 to 0.58502, saving model to temp/a55
Epoch 5/20

Epoch 00005: val_loss imp


Epoch 00001: val_loss improved from inf to 0.53170, saving model to temp/a60
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.53170
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.53170
Epoch 00003: early stopping
temp/a61
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.66974, saving model to temp/a61
Epoch 2/20

Epoch 00002: val_loss improved from 0.66974 to 0.62501, saving model to temp/a61
Epoch 3/20

Epoch 00003: val_loss improved from 0.62501 to 0.54668, saving model to temp/a61
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.54668
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.54668
Epoch 00005: early stopping
temp/a62
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.65285, saving model to temp/a62
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.65285
Epoch 3/20

Epoch 00003: val_loss improved from 0.65285 to 0.56658, sav


Epoch 00001: val_loss improved from inf to 0.68524, saving model to temp/a68
Epoch 2/20

Epoch 00002: val_loss improved from 0.68524 to 0.59099, saving model to temp/a68
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.59099
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.59099
Epoch 00004: early stopping
temp/a69
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.72130, saving model to temp/a69
Epoch 2/20

Epoch 00002: val_loss improved from 0.72130 to 0.44042, saving model to temp/a69
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.44042
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.44042
Epoch 00004: early stopping
temp/a70
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.73138, saving model to temp/a70
Epoch 2/20

Epoch 00002: val_loss improved from 0.73138 to 0.71788, saving model to temp/a70
Epoch 3/20

Epoch 00003: val_loss improve


Epoch 00004: val_loss improved from 0.59586 to 0.53237, saving model to temp/a76
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.53237
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.53237
Epoch 00006: early stopping
temp/a77
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.65049, saving model to temp/a77
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.65049
Epoch 3/20

Epoch 00003: val_loss improved from 0.65049 to 0.51580, saving model to temp/a77
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.51580
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.51580
Epoch 00005: early stopping
temp/a78
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.70888, saving model to temp/a78
Epoch 2/20

Epoch 00002: val_loss improved from 0.70888 to 0.60499, saving model to temp/a78
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.60499
Epoc


Epoch 00002: val_loss did not improve from 0.59844
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.59844
Epoch 00003: early stopping
temp/a84
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.66999, saving model to temp/a84
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.66999
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.66999
Epoch 00003: early stopping
temp/a85
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.65646, saving model to temp/a85
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.65646
Epoch 3/20

Epoch 00003: val_loss improved from 0.65646 to 0.59986, saving model to temp/a85
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.59986
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.59986
Epoch 00005: early stopping
temp/a86
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improve

Epoch 3/20

Epoch 00003: val_loss improved from 0.69328 to 0.60230, saving model to temp/a91
Epoch 4/20

Epoch 00004: val_loss improved from 0.60230 to 0.58932, saving model to temp/a91
Epoch 5/20

Epoch 00005: val_loss improved from 0.58932 to 0.58776, saving model to temp/a91
Epoch 6/20

Epoch 00006: val_loss improved from 0.58776 to 0.57918, saving model to temp/a91
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.57918
Epoch 8/20

Epoch 00008: val_loss improved from 0.57918 to 0.56784, saving model to temp/a91
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.56784
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.56784
Epoch 00010: early stopping
temp/a92
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.61289, saving model to temp/a92
Epoch 2/20

Epoch 00002: val_loss improved from 0.61289 to 0.59722, saving model to temp/a92
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.59722
Epoch 4/20

Ep


Epoch 00001: val_loss improved from inf to 0.73176, saving model to temp/a98
Epoch 2/20

Epoch 00002: val_loss improved from 0.73176 to 0.65690, saving model to temp/a98
Epoch 3/20

Epoch 00003: val_loss improved from 0.65690 to 0.51139, saving model to temp/a98
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.51139
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.51139
Epoch 00005: early stopping
temp/a99
Train on 18084 samples, validate on 2000 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.66897, saving model to temp/a99
Epoch 2/20

Epoch 00002: val_loss improved from 0.66897 to 0.57649, saving model to temp/a99
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.57649
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.57649
Epoch 00004: early stopping


In [3]:


from sklearn.metrics import roc_auc_score

# Out-of-sample evaluation: score every saved model on data drawn from the
# perturbed generator and collect one ROC AUC per model in `metrics_dicts`.
nb_test = 2000
metrics_dicts = []

perturbed_df = gen_data_perturbed(SIZE = nb_test)
# roc_auc_score expects 1-D labels, so flatten the single target column
# instead of passing a one-column DataFrame.
y_test2 = perturbed_df[target].values.ravel()
x_test2 = normalize(perturbed_df[inputs].values)
for idx, model_name in enumerate(model_names):

    if isinstance(models[idx], list):
        keras.backend.clear_session()
        model = load_model(model_name)
    else:
        model = models[idx]

    # Column 1 of the prediction is used as the score for the positive class.
    # NOTE(review): the recorded output of this cell is an IndexError about
    # axis 1 having size 1 — presumably predict() returned a single column for
    # the model that was loaded; confirm the checkpoints come from the
    # two-output network built by get_model(..., 2).
    y_pred2 = model.predict(x_test2)[:,1]

    # Compute the score once and reuse it for both the log line and the list.
    auc = roc_auc_score(y_test2, y_pred2)
    print(auc)
    metrics_dicts.append(auc)



IndexError: index 1 is out of bounds for axis 1 with size 1

In [None]:
# the number of times to sample a fresh test set (its size is `nb_test`)
times = 3

# Per-model accumulators; the second axis indexes the repetition.
violations = np.zeros(len(models))               # never updated below
violation_mean = np.zeros((len(models), times))   # BIC of features + predictions
violation_mean2 = np.zeros((len(models), times))  # same values as violation_mean
mean = np.zeros((len(models), times))             # in-distribution ROC AUC

fold = 0

for t in range(times):
    print("Times = ", t)
    df_test = gen_data(SIZE = nb_test)
    x_test = df_test[inputs].values
    x_test_norm = normalize(df_test[inputs].values)
    # 1-D labels for roc_auc_score
    y_test = df_test[target].values.ravel()

    for idx, model_name in enumerate(model_names):
        if isinstance(models[idx], list):
            keras.backend.clear_session()
            model = load_model(model_name)
        else:
            model = models[idx]

        # Column 1 of the prediction is used as the positive-class score.
        predicted = model.predict(x_test_norm)[:,1]
        mean[idx, t] = roc_auc_score(y_test, predicted)

        # Pair the raw (un-normalised) features with the model's prediction
        # so the causal search sees the predicted target in place of the label.
        test_df = pd.DataFrame(x_test, columns = inputs)
        test_targets = pd.DataFrame(predicted, columns = target)
        test_df = test_df.join(test_targets)

        # Threshold only the predicted column. The previous
        # `test_df[mask] = 1` / `test_df[mask] = 0` form assigns to whole
        # rows, which overwrote every feature column with 1 or 0 as well.
        test_df['cancer'] = (test_df['cancer'] > 0.5).astype(float)

        bic_pred = get_bic(test_df, prior)

        print(tetrad.getEdges())
        print(bic_pred)
        violation_mean[idx, t] = bic_pred
        violation_mean2[idx, t] = bic_pred
# Per-model summary over the repetitions: mean/std of the in-distribution
# ROC AUC (`mean`) and of the BIC values (`violation_mean`).
metric = []
metric_err = []
viol = []
viol_err = []

# The BIC values are plotted as-is; no rescaling is applied.
for i, (name, n_violations) in enumerate(zip(model_names, violations)):
    auc_mean, auc_std = np.mean(mean[i]), np.std(mean[i])
    bic_mean, bic_std = np.mean(violation_mean[i]), np.std(violation_mean[i])

    print("Model_name = ", name, "Violations = ", n_violations)
    print("Average_violations = ", bic_mean, bic_std)
    # `mean` is filled with roc_auc_score values, so label it as AUC
    # (the line used to print "MSE = ").
    print("AUC = ", auc_mean, auc_std)

    metric.append(auc_mean)
    metric_err.append(auc_std)
    viol.append(bic_mean)
    viol_err.append(bic_std)

print(np.array(metric),
      np.array(metric_err),
      np.array(viol),
      np.array(viol_err))

bar_plot(model_names,
         np.array(metric),
         np.array(metric_err),
         np.array(viol),
         np.array(viol_err))


    
# Flatten the per-model results into plain lists, one entry per model:
#   MSE  -- mean of `mean[i]` over the repetitions
#   VIO  -- mean of `violation_mean[i]`
#   VIO2 -- mean of `violation_mean2[i]`
#   AUS  -- the model's entry in `metrics_dicts`
_model_indices = range(len(models))
MSE = [np.mean(mean[i]) for i in _model_indices]
VIO = [np.mean(violation_mean[i]) for i in _model_indices]
VIO2 = [np.mean(violation_mean2[i]) for i in _model_indices]
AUS = [metrics_dicts[i] for i in _model_indices]


In [None]:
from numpy.polynomial.polynomial import polyfit  
from scipy.stats import pearsonr
from pylab import text

def norm(a):
    """Min-max scale ``a`` to [0, 1] using the min and range of all its elements.

    a -- array-like of numbers

    Returns an array of the same shape; a constant input has zero range and
    therefore yields nan/inf, as before.
    """
    a = np.asarray(a)
    # np.ptp() replaces the ndarray.ptp() method, which NumPy 2.0 removed.
    return (a - np.min(a)) / np.ptp(a)
# Combined score: min-max scaled negated VIO plus MSE, element-wise per model.
METRIC = norm(-np.array(VIO)) + np.array(MSE)
n_low = int(num_models * 0.2)


def _aus_ranked_by(keys):
    """Return the AUS values ordered by ascending ``keys``."""
    return [aus for _, aus in sorted(zip(keys, AUS))]


def _scatter_with_fit(xs, x_label, pdf_name):
    """Scatter AUS against ``xs`` with a degree-1 fit, annotate Pearson r, save and show."""
    fig, ax = plt.subplots()
    intercept, slope = polyfit(xs, AUS, 1)
    ax.plot(xs, AUS, '.')
    text(0.05, 0.9, 'Pearson coeff:' + str(pearsonr(xs, AUS)[0])[0:6],
         ha='left', va='center', transform=ax.transAxes)
    plt.plot(xs, intercept + slope * np.array(xs), '-')
    ax.set_xlabel(x_label)
    ax.set_ylabel("OoS AUCROC")
    fig.savefig(pdf_name, bbox_inches='tight')
    plt.show()


# Mean AUS of the n_low models that come first under each ranking.
# NOTE(review): sorted() is ascending, so each slice keeps the models with the
# SMALLEST key. `MSE` is filled from roc_auc_score values elsewhere in this
# file — confirm that taking the lowest ones is what "Best by AUC" (and
# "Best by MET") is meant to do.
sorted_aus = _aus_ranked_by(VIO)
print("Best by BIC = ", np.mean(sorted_aus[:n_low]))

sorted_aus = _aus_ranked_by(MSE)
print("Best by AUC = ", np.mean(sorted_aus[:n_low]))

sorted_aus = _aus_ranked_by(METRIC)
print("Best by MET = ", np.mean(sorted_aus[:n_low]))

# Baseline: the first n_low models in their original order.
print("Random = ", np.mean(AUS[:n_low]))

print(pearsonr(VIO,AUS)[0])
_scatter_with_fit(VIO, "BIC", 'Ex4VIOVsAUS.pdf')

print(pearsonr(METRIC,AUS)[0])
_scatter_with_fit(METRIC, "Combined", 'Ex4ProposedVsAUS.pdf')

_scatter_with_fit(MSE, "AUC", 'Ex4MSEVsAUS.pdf')


MSE = np.array(MSE)

# Order the out-of-sample scores (AUS) by each selection criterion once.
# The orderings do not depend on the split size, so they are computed
# outside the loop instead of being re-sorted on every iteration.
aus_by_bic = [aus for _, aus in sorted(zip(VIO, AUS))]
aus_by_auc = [aus for _, aus in sorted(zip(MSE, AUS))]
aus_by_metric = [aus for _, aus in sorted(zip(METRIC, AUS))]

# For each selection size, average AUS over the first `split` models of
# every ordering.
x = list(range(10, len(AUS), 5))
y1 = [np.mean(aus_by_bic[:split]) for split in x]
y2 = [np.mean(aus_by_auc[:split]) for split in x]
y3 = [np.mean(aus_by_metric[:split]) for split in x]

fig, ax = plt.subplots()

ax.plot(x,y1, '-', label = 'BIC')
# `MSE` holds the in-sample ROC AUC averages, so the curve is labelled AUC
# (it used to be labelled 'MSE').
ax.plot(x,y2, '-', label = 'AUC')
ax.plot(x,y3, '-', label = 'METRIC')
ax.legend()

# The x values are the selection sizes, not an error measure
# (the axis used to be labelled "MSE").
ax.set_xlabel("Number of models selected")
ax.set_ylabel("Out of Sample AUCROC")
plt.show()
# Last expression of the cell: the notebook displays this correlation.
pearsonr(METRIC,AUS)[0]

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Run whatever `model` was bound last by the preceding cells on the training
# features X. The result is not assigned; as the last expression of the cell
# it is only displayed by the notebook.
model.predict(X)

In [None]:
# Wrap `model` in SelectFromModel with threshold 0.25, fit it on the training
# data, and count the columns that transform() keeps.
# NOTE(review): `model` is whatever the earlier cells left bound — presumably a
# loaded Keras network. LassoCV is imported in the previous cell but never
# used; confirm whether SelectFromModel(LassoCV(), ...) was the intent, and
# whether the one-hot `y` is an acceptable target here.
sfm = SelectFromModel(model, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]