In [None]:
import random
import numpy as np
import tensorflow as tf
from keras.models import load_model, Model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, Callback, EarlyStopping
from keras.layers import GlobalAveragePooling2D, BatchNormalization, Input, Dense, Dropout
from keras import backend as K
from keras import optimizers
from keras.utils import to_categorical
import keras.optimizers
import pydot
import networkx as nx
from IPython.display import SVG
import glob, os
import pandas as pd
from random import shuffle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score, mean_absolute_error, mean_squared_error, accuracy_score
from pycausal import search as s
from pycausal.pycausal import pycausal as pc
from collections import defaultdict
from pycausal import prior as p
from causal_assurance import *
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# ---------------------------------------------------------------------------
# Experiment configuration -- everything a reader is expected to tune.
# ---------------------------------------------------------------------------

# Location of the raw CSV, relative to the notebook.
dataset_path = '../data/openpowerlifting.csv'

# Size of the model pool trained in every repetition.
num_models = 100

# CAM-10: the selection step keeps this many models, expressed as 10% of the pool.
nbest = int(num_models * 0.1)

# How many times the whole experiment is repeated.
num_repeat = 100

# GPU selection. The empty value is meant to leave no CUDA device visible;
# remove this line to let the framework use every available GPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

In [None]:
# Column roles used by every later cell.
inputs = ['Sex', 'Equipment', 'Age',  'Deadlift1Kg', 'Bench1Kg']
target = ['Squat1Kg']
categoricals = ['Sex', 'Equipment']

# Load the CSV, keep only the columns the experiment uses and drop rows with
# missing values. This is written as one non-inplace chain on purpose: calling
# dropna/reset_index with inplace=True on a column subset mutates a slice of
# the raw frame and triggers pandas' SettingWithCopyWarning.
df = (
    pd.read_csv(dataset_path)[['Sex', 'Equipment', 'Age', 'Squat1Kg', 'Deadlift1Kg', 'Bench1Kg']]
    .dropna()
    .reset_index(drop = True)
)

# Integer-encode the categorical columns. The fitted encoders are kept, in the
# same order as `categoricals`, so the codes can be mapped back to labels.
label_encoder_list = []
for col in categoricals:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].values)
    label_encoder_list.append(encoder)

# Keep only rows where all three lifts are strictly positive.
# NOTE(review): presumably non-positive values mark failed or invalid attempts
# in this dataset -- confirm against the dataset documentation.
positive_lifts = (df['Bench1Kg'] > 0) & (df['Squat1Kg'] > 0) & (df['Deadlift1Kg'] > 0)
df = df[positive_lifts].reset_index(drop = True)

# Rescale every continuous column with the project's `normalize` helper
# (applied column by column, after filtering, exactly as before).
for col in ['Age', 'Squat1Kg', 'Deadlift1Kg', 'Bench1Kg']:
    df[col] = normalize(df[col])

# Prior knowledge for the causal search, expressed as two groups of variables
# passed to `addtemporal`: the subject attributes first, the three lifts second.
# NOTE(review): presumably `ForbiddenWithin` forbids edges between members of
# the same group -- confirm against the py-causal prior-knowledge docs.
tempForbid = p.ForbiddenWithin(['Sex', 'Age', 'Equipment'])
lift_group = p.ForbiddenWithin(['Squat1Kg',  'Deadlift1Kg', 'Bench1Kg'])
temporal = [tempForbid, lift_group]
prior = p.knowledge(addtemporal=temporal)

# Learn a graph over inputs + target and render it to SVG for display later.
g = examine_graph_mixed(df[inputs + target], prior=prior)
dot_str = pc.tetradGraphToDot(g)
graphs = pydot.graph_from_dot_data(dot_str)
svg_str = graphs[0].create_svg()

# Collect every directed edge "A --> B" reported by the search as an (A, B) pair.
# NOTE(review): `tetrad` is not defined anywhere in this notebook; presumably it
# is exported by the `from causal_assurance import *` star-import -- confirm.
known_conx = set()
for edge in tetrad.getEdges():
    if ' --> ' not in edge:
        continue  # skip edges that are not directed
    endpoints = edge.split(' --> ')
    known_conx.add((endpoints[0], endpoints[1]))

# Rebuild the prior so that the discovered directed edges are required.
prior = p.knowledge(
    addtemporal=temporal,
    requiredirect=[list(pair) for pair in known_conx],
)
# Build the pool of network specifications (one list of hidden-layer widths per
# model) together with the checkpoint path each model is saved under.
randomize = False
if randomize:
    # Three hidden layers per model, each width drawn at random from `layers`.
    layers = [256, 512, 1024, 2048, 4096]
    models = [
        [layers[random.randint(0, len(layers) - 1)] for _ in range(3)]
        for _ in range(num_models)
    ]
    model_names = ['models/random' + str(k) for k in range(num_models)]
    print(models, model_names)
else:
    # Every model shares the same architecture (and the same list object).
    model_layers = [2048, 1024, 512]
    models = [model_layers] * num_models
    model_names = ['models/pl' + str(k) for k in range(num_models)]

# Add in knowledge about age to squat, then rebuild the prior with it.
known_conx.add(('Age', 'Squat1Kg'))
print(models, model_names)
prior = p.knowledge(
    addtemporal=temporal,
    requiredirect=[list(pair) for pair in known_conx],
)

# Show the learned graph. `svg_str` was rendered before the Age -> Squat1Kg
# edge was added above, so that edge only appears if the search found it.
SVG(svg_str)

In [None]:
# Per-repetition results: for each repetition, the (normalized) holdout errors
# of the `nbest` models picked by validation MSE alone vs. by the combined score.
bestMSE = []
bestCOMBO = []
original_df = df.copy()

# Settings that are identical for every repetition.
continuous = ['Age', 'Squat1Kg',  'Deadlift1Kg', 'Bench1Kg']
causal_split = 0.2
val_split = 0.2
train_split = 1 - (causal_split + val_split)
verbosity = 0

for t in range(num_repeat):
    # The dataset is large, so each repetition works on the first 1000 rows to
    # keep the run time manageable; lower this further on a small machine.
    # A dedicated name is used (not `df`) so the cleaned dataset is not
    # overwritten -- otherwise re-running this cell would copy the shrunken
    # frame into `original_df`.
    run_df = original_df[:1000].copy()

    holdout = int(len(run_df) * 0.2)

    # Out-of-distribution holdout: take the 20% of rows with the smallest or the
    # largest values of one randomly chosen continuous column.
    small = random.randint(0,1)
    cont = random.randint(0, len(continuous) - 1)
    if small == 0:
        df_test = run_df.nsmallest(holdout, continuous[cont])
    else:
        df_test = run_df.nlargest(holdout, continuous[cont])
    print(t, small, continuous[cont])

    # Remove the holdout rows, then shuffle the remainder and renumber it.
    # The previous `df.sample(frac=1).reset_index(inplace=True)` shuffled a
    # temporary copy and threw it away, so the splits below were never shuffled.
    run_df = run_df.drop(df_test.index).sample(frac = 1).reset_index(drop = True)
    df_test = df_test.reset_index(drop = True)

    x_test = df_test[inputs]
    y_test = df_test[target]

    # Positional split of the shuffled rows: train | validation | causal.
    n_rows = len(run_df)
    n_train = int(train_split * n_rows)
    causal_start = n_rows - int(causal_split * n_rows)

    x_train = run_df[inputs].iloc[:n_train]
    y_train = run_df[target].iloc[:n_train]

    x_val = run_df[inputs].iloc[n_train:causal_start]
    y_val = run_df[target].iloc[n_train:causal_start]

    x_causal = run_df[inputs].iloc[causal_start:]
    y_causal = run_df[target].iloc[causal_start:]

    # Network inputs, built by the project helper from the encoded frames.
    x_test_NN = make_categorical(x_test, original_df, categoricals)
    x_causal_NN = make_categorical(x_causal, original_df, categoricals)
    x_val_NN = make_categorical(x_val, original_df, categoricals)
    x_train_NN = make_categorical(x_train, original_df, categoricals)

    # ---- Train every model in the pool ------------------------------------
    for idx, model_name in enumerate(model_names):
        spec = models[idx]
        if isinstance(spec, list):
            # A list is a Keras architecture spec (hidden-layer widths).
            keras.backend.clear_session()
            # One dropout rate per hidden layer of *this* spec (the old code
            # sized the list by the number of models in the pool).
            model = get_model(spec, [0.2] * len(spec), np.shape(x_train_NN)[1])
            # Make sure the checkpoint directory exists on a fresh checkout.
            os.makedirs(os.path.dirname(model_name) or '.', exist_ok = True)
            callbacks = [ModelCheckpoint(model_name, verbose= verbosity, monitor='val_loss',save_best_only=True),
                         EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose= verbosity, mode='auto')]
            model.compile(optimizer = optimizers.SGD(lr = 0.001, momentum = 0.9, ), loss='mean_squared_error', metrics = ['mse'])
            model.fit(x_train_NN, y_train, epochs = 20, validation_data = (x_val_NN, y_val), callbacks = callbacks, batch_size = 32, verbose = verbosity)
        else:
            # Anything else is treated as a ready-made estimator with fit/predict.
            # It is fitted on the same features that are used for prediction
            # below (the old call referenced undefined names `X` and `y`).
            spec.fit(x_train_NN, y_train)

    # ---- Score every model --------------------------------------------------
    generalization = []   # MSE on the out-of-distribution holdout
    metrics = []          # MSE on the causal split
    proposed = []         # score returned by get_ll_mixed for the predictions
    # 0..n-1 index so the join with the predicted targets below lines up.
    x_causal = x_causal.reset_index(drop = True)

    for idx, model_name in enumerate(model_names):
        if isinstance(models[idx], list):
            # Reload the best checkpoint written during training.
            keras.backend.clear_session()
            model = load_model(model_name)
        else:
            model = models[idx]

        y_pred = model.predict(x_test_NN)
        generalization.append(mean_squared_error(y_test, y_pred))

        # Causal check: replace the true target with the model's prediction and
        # score the resulting frame against the prior with the project helper.
        y_causal_pred = model.predict(x_causal_NN)
        causal_targets = pd.DataFrame(y_causal_pred, columns = target)
        causal_df = x_causal.join(causal_targets)

        metrics.append(mean_squared_error(y_causal, y_causal_pred))
        proposed.append(get_ll_mixed(causal_df, prior))

    # ---- Select the nbest models under each criterion -----------------------
    # Lower is treated as better for both columns (selection uses nsmallest).
    total = normalize(metrics) + normalize(proposed)
    final = pd.DataFrame(np.stack((metrics, proposed, total, normalize(generalization)), axis = 1), columns = ['metrics', 'proposed', 'combined', 'generalization'])
    top_by_mse = final.nsmallest(nbest, 'metrics')['generalization']
    top_by_combo = final.nsmallest(nbest, 'combined')['generalization']
    print("MSE = ", np.sum(top_by_mse))
    print("COMB = ",np.sum(top_by_combo))
    bestMSE.append(top_by_mse.values)
    bestCOMBO.append(top_by_combo.values)

# Overall mean and spread of the holdout error under each selection criterion.
np.mean(bestMSE), np.mean(bestCOMBO), np.std(bestMSE), np.std(bestCOMBO)

In [None]:
import pickle

import seaborn as sns

# Mean holdout error of the selected models, one value per repetition, for
# selection by MSE alone (val1) and by the combined score (val2).
val1 = [np.mean(each) for each in bestMSE]
val2 = [np.mean(each) for each in bestCOMBO]
val = [[mse_mean, combo_mean] for mse_mean, combo_mean in zip(val1, val2)]

fig, ax = plt.subplots()
fig.set_size_inches(2.5,3)
# Use a dedicated name for the plot data: assigning it to `df` would overwrite
# the dataset frame that the experiment cell copies into `original_df`, so
# re-running that cell afterwards would silently train on the plot data.
summary_df = pd.DataFrame(val, columns = ['MSE', 'Proposed'])
ax = sns.boxplot(ax = ax, data=summary_df, palette="Set2")
# NOTE(review): the file name refers to a different dataset than the pickle
# below ('Powerlifting.pkl'); it looks like a copy-paste leftover -- confirm
# before renaming, since downstream documents may reference this path.
fig.savefig('kaggle-bikeshare.pdf')

# Persist the raw per-repetition results next to the figure.
d = {'bestMSE': bestMSE, 'bestCOMBO': bestCOMBO}

with open('Powerlifting.pkl', 'wb') as handle:
    pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)