In [1]:
import sys
# Append root path 
sys.path.append("../")
sys.path.append("../lmmnn")

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

if tf.test.gpu_device_name() != '/device:GPU:0':
    print('WARNING: GPU device not found.')
else:
    print('SUCCESS: Found GPU: {}'.format(tf.test.gpu_device_name()))

from model.mixed_effects import *
from utils.fe_models import get_model
from utils.evaluation import *
from utils.utils import *
from data.preprocessing import dataset_preprocessing

# from vis.utils.utils import apply_modifications
# helper function
def update_layer_activation(model, activation, index=-1):
    model.layers[index].activation = activation
    return model

from tensorflow.keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Reshape, Embedding, Concatenate
from tensorflow.keras.activations import sigmoid

from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import roc_auc_score as auroc
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from tensorflow_addons.metrics import F1Score

from scipy import stats
import pickle
import yaml
import time
import gc

RS = 555

SUCCESS: Found GPU: /device:GPU:0


#### Download and save data from Pargent et al. by running "data/download_pargent2022_datasets.py before running this notebook

In [2]:
mode="cv"
hct=10
test_ratio=None
val_ratio=None
folds=5
results = {}
dataset_names = ["eucalyptus", "Midwest_survey", "hpc-job-scheduling", "video-game-sales", "okcupid-stem", "Diabetes130US"]


loss_use = lambda: tf.keras.losses.CategoricalCrossentropy
target= "categorical"
batch_size=512
epochs = 200
early_stopping = 5
model_name = "ResNet"
embed_dims_method = "AutoGluon"

results = {}

#######################################

for dataset_name in dataset_names:
    print(f"Start training procedure for {dataset_name}")
    data_path = f"{mode}_RS{RS}_hct{hct}"
    if mode == "cv":
        data_path += f"_{folds}folds"
    elif mode == "train_test":
        data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
    elif mode == "train_val_test":
        data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"

    # If no data_dict exists, run preprocessing, else load data_dict
    if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
        dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
    with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
            data_dict = pickle.load(handle)

    z_cols = data_dict["z_cols"]
    
    results[dataset_name] = {}
    for fold_num in range(folds):
        results[dataset_name][fold_num] = {}

        print(f"Fold no. {fold_num}")
        save_path = f"../results/{dataset_name}/{data_path}/fold_{fold_num}/ResNet_10"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        
        z_ohe_encoded_train = data_dict[f"z_ohe_encoded_train_{fold_num}"] 
        z_ohe_encoded_val = data_dict[f"z_ohe_encoded_val_{fold_num}"] 
        z_ohe_encoded_test = data_dict[f"z_ohe_encoded_test_{fold_num}"] 

        z_target_encoded_train = data_dict[f"z_target_encoded_train_{fold_num}"] 
        z_target_encoded_val = data_dict[f"z_target_encoded_val_{fold_num}"] 
        z_target_encoded_test = data_dict[f"z_target_encoded_test_{fold_num}"] 
        
        target_encoding_time = data_dict[f"target_encoding_time_{fold_num}"]
        ohe_encoding_time = data_dict[f"ohe_encoding_time_{fold_num}"]
        
        x_cols = data_dict[f"X_train_{fold_num}"].columns
        X_train = data_dict[f"X_train_{fold_num}"]
        Z_train = data_dict[f"Z_train_{fold_num}"]
        y_train = data_dict[f"y_train_{fold_num}"]

        X_val = data_dict[f"X_val_{fold_num}"]
        Z_val = data_dict[f"Z_val_{fold_num}"]
        y_val = data_dict[f"y_val_{fold_num}"]

        X_test = data_dict[f"X_test_{fold_num}"]
        Z_test = data_dict[f"Z_test_{fold_num}"]
        y_test = data_dict[f"y_test_{fold_num}"]
    
        if not os.path.exists(f"{save_path}/results_RS{RS}_{dataset_name}_iter{fold_num}.pickle"):

            tf.random.set_seed(RS+fold_num)
            np.random.seed(RS+fold_num)

            qs = np.max([tf.reduce_max(Z_train, axis=0),tf.reduce_max(Z_val, axis=0),tf.reduce_max(Z_test, axis=0)],axis=0)+1
            
            X_train = tf.convert_to_tensor(X_train)
            Z_train = tf.convert_to_tensor(Z_train,dtype=tf.int32)
            y_train = tf.convert_to_tensor(y_train)

            X_val = tf.convert_to_tensor(X_val)
            Z_val = tf.convert_to_tensor(Z_val,dtype=tf.int32)
            y_val = tf.convert_to_tensor(y_val)

            X_test = tf.convert_to_tensor(X_test)
            Z_test = tf.convert_to_tensor(Z_test,dtype=tf.int32)
            y_test = tf.convert_to_tensor(y_test)

            if target == "categorical":
                n_classes = np.unique(y_train).shape[0]
            elif target=="binary":
                n_classes = 1
            
            y_train = tf.one_hot(tf.cast(y_train,tf.int32),n_classes)
            y_val = tf.one_hot(tf.cast(y_val,tf.int32),n_classes)
            y_test = tf.one_hot(tf.cast(y_test,tf.int32),n_classes)
            
            ##### GMENN #####
            d = X_train.shape[1] # columns
            n = X_train.shape[0] # rows
            num_outputs = n_classes
            perc_numeric = d/(d+Z_train.shape[1])

#             qs = np.max([tf.reduce_max(Z_train, axis=0),tf.reduce_max(Z_val, axis=0),tf.reduce_max(Z_test, axis=0)],axis=0)+1

            set_seed(RS)

            fe_model, optimizer = get_model(model_name=model_name, input_size=X_train.shape[1], 
                                              output_size=num_outputs, 
                                              target=target, 
                                              perc_numeric=perc_numeric, RS=RS)
            
            if dataset_name=="eucalyptus":
                optimizer.learning_rate.assign(optimizer.learning_rate*10)
        
        
            initial_stds = np.ones([len(qs),num_outputs]).astype(float).tolist()

            me_model = MixedEffectsNetwork(X_train, Z_train, y_train, fe_model, 
                                           target=target, qs=qs,
                                           initial_stds=initial_stds,
                                          fe_loss_weight=1.,
                                           mode="intercepts",
                                           early_stopping_fe=early_stopping,
                                          )    

            me_model.compile(
                loss_class_me = loss_use()(),
                loss_class_fe = loss_use()(),
            #     metric_class_me = tf.keras.metrics.AUC(multi_label=True, name="auc_me"),
            #     metric_class_fe = tf.keras.metrics.AUC(multi_label=True, name="auc_fe"),
                optimizer=optimizer
            )

            mcmc = MCMCSamplingCallback(num_mcmc_samples=1,
                                        perc_burnin=0.7,
                                        warm_restart=None,
                                        num_burnin_steps=1,
                                        step_size = 0.1#initial_step_size,
                                   )

            print_metric = PrintMetrics(X_train, Z_train, y_train, X_val, Z_val, y_val)

            start = time.time()
            history = me_model.fit([X_train,Z_train], y_train,
                         callbacks=[mcmc,
                                    print_metric,
                                    tf.keras.callbacks.EarlyStopping(monitor="me_auc_val", patience=early_stopping, mode="max")],
                         epochs=epochs,
                         validation_data=[[X_val,Z_val],y_val],
                        batch_size=batch_size)

            end = time.time()
            fit_time_gmenn = round(end-start,2)

            y_train_pred_gmenn, y_train_pred_gmenn_fe = me_model([X_train,Z_train])
            y_val_pred_gmenn, y_val_pred_gmenn_fe = me_model([X_val,Z_val])
            y_test_pred_gmenn, y_test_pred_gmenn_fe = me_model([X_test,Z_test])    

            
            ###### Prepare NN Training ######
            metrics_use = []
            if target =="binary":
                metrics_use.append(tf.keras.metrics.AUC(name="auc"))
                metrics_use.append(tf.keras.metrics.Accuracy(name="accuracy"))
                metrics_use.append(F1Score(num_classes=2, average="micro", name="f1"))
                stop_mode = "max"
                activation_layer = tf.keras.activations.sigmoid
            elif target =="categorical":
                metrics_use.append(tf.keras.metrics.AUC(multi_label=True, name="auc"))
                metrics_use.append(tf.keras.metrics.CategoricalAccuracy(name="accuracy"))
                metrics_use.append(F1Score(num_classes=num_outputs, average="weighted", name="f1"))
                stop_mode = "max"
                activation_layer = tf.keras.activations.softmax
            elif target == "continuous":
                metrics_use.append(RSquare(name="r2"))
                metrics_use.append(tf.keras.metrics.MeanSquaredError(name="mse"))
                stop_mode = "min"            
            
            ##### Ignore #####
            model_nn, optimizer = get_model(model_name=model_name, 
                                            input_size=X_train.shape[1], 
                                            output_size=num_outputs, 
                                            target=target, 
                                            perc_numeric=perc_numeric, RS=RS)
            if dataset_name=="eucalyptus":
                optimizer.learning_rate.assign(optimizer.learning_rate*10)

            model_nn.build((n,d))
            update_layer_activation(model=model_nn, activation=activation_layer)

            model_nn.compile(loss=loss_use()(), optimizer=optimizer, metrics = metrics_use)

            callback = tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=early_stopping, mode=stop_mode)

            start = time.time()
            history_nn = model_nn.fit(X_train, y_train,
                         validation_data= [X_val, y_val],
                         epochs=epochs, batch_size=batch_size, callbacks=[callback])
            end = time.time()
            fit_time_nn = round(end-start,2)

            y_train_pred_nn = model_nn.predict(X_train ,batch_size=batch_size)
            y_val_pred_nn = model_nn.predict(X_val ,batch_size=batch_size)
            y_test_pred_nn = model_nn.predict(X_test ,batch_size=batch_size)

            if target == "binary":
                eval_res_train_nn = get_metrics(y_train[:,0], y_train_pred_nn, target=target)
                eval_res_val_nn = get_metrics(y_val[:,0], y_val_pred_nn, target=target)
                eval_res_test_nn = get_metrics(y_test[:,0], y_test_pred_nn, target=target)
            elif target == "categorical":
                eval_res_train_nn = get_metrics(y_train, y_train_pred_nn, target=target)
                eval_res_val_nn = get_metrics(y_val, y_val_pred_nn, target=target)
                eval_res_test_nn = get_metrics(y_test, y_test_pred_nn, target=target)

            ##### Target Encoding #####
            print("\n Train Target Encoding Network")
            model_nn_te, optimizer = get_model(model_name=model_name, 
                                            input_size=np.append(X_train ,z_target_encoded_train, axis=1).shape[1], 
                                            output_size=num_outputs, 
                                            target=target, 
                                            perc_numeric=perc_numeric, RS=RS)
            if dataset_name=="eucalyptus":
                optimizer.learning_rate.assign(optimizer.learning_rate*10)
            model_nn_te.build((n,np.append(X_train ,z_target_encoded_train, axis=1).shape[1]))
            update_layer_activation(model=model_nn_te, activation=activation_layer)
            model_nn_te.compile(loss=loss_use()(), optimizer=optimizer, metrics = metrics_use)
            callback = tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=early_stopping, mode=stop_mode)

            start = time.time()
            history_nn_te = model_nn_te.fit(np.append(X_train ,z_target_encoded_train, axis=1), y_train,
                         validation_data= [np.append(X_val ,z_target_encoded_val, axis=1), y_val],
                         epochs=epochs, batch_size=batch_size, callbacks=[callback])
            end = time.time()
            fit_time_te = round(end-start,2)+target_encoding_time

            y_train_pred_nn_te = model_nn_te.predict(np.append(X_train ,z_target_encoded_train, axis=1) ,batch_size=batch_size)
            y_val_pred_nn_te = model_nn_te.predict(np.append(X_val ,z_target_encoded_val, axis=1) ,batch_size=batch_size)
            y_test_pred_nn_te = model_nn_te.predict(np.append(X_test ,z_target_encoded_test, axis=1) ,batch_size=batch_size)

            if target == "binary":
                eval_res_train_nn_te = get_metrics(y_train[:,0], y_train_pred_nn_te, target=target)
                eval_res_val_nn_te = get_metrics(y_val[:,0], y_val_pred_nn_te, target=target)
                eval_res_test_nn_te = get_metrics(y_test[:,0], y_test_pred_nn_te, target=target)
            elif target == "categorical":
                eval_res_train_nn_te = get_metrics(y_train, y_train_pred_nn_te, target=target)
                eval_res_val_nn_te = get_metrics(y_val, y_val_pred_nn_te, target=target)
                eval_res_test_nn_te = get_metrics(y_test, y_test_pred_nn_te, target=target)

            ##### OHE #####
            print("\n Train OHE Network")
            model_nn_ohe, optimizer = get_model(model_name=model_name, 
                                            input_size=np.append(X_train ,z_ohe_encoded_train, axis=1).shape[1], 
                                            output_size=num_outputs, 
                                            target=target, 
                                            perc_numeric=perc_numeric, RS=RS)
            if dataset_name=="eucalyptus":
                optimizer.learning_rate.assign(optimizer.learning_rate*10)
            model_nn_ohe.build((n,np.append(X_train ,z_ohe_encoded_train, axis=1).shape[1]))
            update_layer_activation(model=model_nn_ohe, activation=activation_layer)
            model_nn_ohe.compile(loss=loss_use()(), optimizer=optimizer, metrics = metrics_use)
            callback = tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=early_stopping, mode=stop_mode)

            start = time.time()
            history_nn_ohe = model_nn_ohe.fit(np.append(X_train ,z_ohe_encoded_train, axis=1), y_train,
                         validation_data= [np.append(X_val ,z_ohe_encoded_val, axis=1), y_val],
                         epochs=epochs, batch_size=batch_size, callbacks=[callback])
            end = time.time()
            fit_time_ohe = round(end-start,2)+ohe_encoding_time

            y_train_pred_nn_ohe = model_nn_ohe.predict(np.append(X_train ,z_ohe_encoded_train, axis=1), batch_size=batch_size)
            y_val_pred_nn_ohe = model_nn_ohe.predict(np.append(X_val ,z_ohe_encoded_val, axis=1), batch_size=batch_size)
            y_test_pred_nn_ohe = model_nn_ohe.predict(np.append(X_test ,z_ohe_encoded_test, axis=1), batch_size=batch_size)
            
            if target == "binary":
                eval_res_train_nn_ohe = get_metrics(y_train[:,0], y_train_pred_nn_ohe, target=target)
                eval_res_val_nn_ohe = get_metrics(y_val[:,0], y_val_pred_nn_ohe, target=target)
                eval_res_test_nn_ohe = get_metrics(y_test[:,0], y_test_pred_nn_ohe, target=target)            
            elif target == "categorical":
                eval_res_train_nn_ohe = get_metrics(y_train, y_train_pred_nn_ohe, target=target)
                eval_res_val_nn_ohe = get_metrics(y_val, y_val_pred_nn_ohe, target=target)
                eval_res_test_nn_ohe = get_metrics(y_test, y_test_pred_nn_ohe, target=target)
                
            ##### Embedding #####
            print("\n Embedding Estimate Network")

            if embed_dims_method=="sqrt":
                embed_dims = [int(np.sqrt(q)) for q in qs]
            elif embed_dims_method=="AutoGluon":
                embed_dims = [int(np.max([100, np.round(1.6*q**0.56)])) for q in qs]
            else:
                embed_dims = [10 for q in qs]

            input_layer = Input(shape=(d,))

            # Define embedding layers
            embed_inputs = []
            embedding_layers = []
            for q_num in range(len(qs)):
                Z_input_layer = Input(shape=(1,))
                embedding_layer = Embedding(qs[q_num], embed_dims[q_num], input_length=1)(Z_input_layer)
                embedding_layer = Reshape(target_shape=(embed_dims[q_num],))(embedding_layer)

                embed_inputs.append(Z_input_layer)
                embedding_layers.append(embedding_layer)

            ### Get model layer dimensions
            min_numeric_embed_dim = 32
            max_numeric_embed_dim = 2056
            max_layer_width = 2056
            # Main dense model
            if target == "continuous":
                default_layer_sizes = [256,
                                       128]  # overall network will have 4 layers. Input layer, 256-unit hidden layer, 128-unit hidden layer, output layer.
            else:
                default_sizes = [256, 128]  # will be scaled adaptively
                # base_size = max(1, min(num_net_outputs, 20)/2.0) # scale layer width based on number of classes
                base_size = max(1, min(num_outputs,
                                       100) / 50)  # TODO: Updated because it improved model quality and made training far faster
                default_layer_sizes = [defaultsize * base_size for defaultsize in default_sizes]
            layer_expansion_factor = 1  # TODO: consider scaling based on num_rows, eg: layer_expansion_factor = 2-np.exp(-max(0,train_dataset.num_examples-10000))
            first_layer_width = int(min(max_layer_width, layer_expansion_factor * default_layer_sizes[0]))

            # numeric embed dim
            vector_dim = 0  # total dimensionality of vector features (I think those should be transformed string features, which we don't have)
            prop_vector_features = perc_numeric  # Fraction of features that are numeric
            numeric_embedding_size = int(min(max_numeric_embed_dim,
                                             max(min_numeric_embed_dim,
                                                 first_layer_width * prop_vector_features * np.log10(vector_dim + 10))))


            numeric_embedding = Dense(numeric_embedding_size, activation="relu")(input_layer)

            concat = Concatenate()([numeric_embedding] + embedding_layers)

            base_model, optimizer = get_model(model_name=model_name, 
                                              input_size=numeric_embedding_size + sum(embed_dims), 
                                              output_size=num_outputs, target=target,
                                              perc_numeric=perc_numeric, RS=RS)

            if dataset_name=="eucalyptus":
                optimizer.learning_rate.assign(optimizer.learning_rate*10)
            base_model.build((n, numeric_embedding_size + sum(embed_dims)))
            update_layer_activation(model=base_model, activation=activation_layer)

            layers = base_model(concat)

            model_embed = Model(inputs=[input_layer] + embed_inputs, outputs=layers)


            model_embed.compile(loss=loss_use()(), optimizer=optimizer, metrics = metrics_use)
            callback = tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=early_stopping, mode=stop_mode)

            start = time.time()
            history_nn_embed = model_embed.fit([X_train] + [Z_train[: ,q_num] for q_num in range(len(qs))], y_train,
                            validation_data=[[X_val] + [Z_val[: ,q_num] for q_num in range(len(qs))], y_val],
                            epochs=epochs, batch_size=batch_size, callbacks=[callback])
            end = time.time()
            fit_time_embed = round(end-start,2)

            y_train_pred_embed = model_embed.predict([X_train] + [Z_train[: ,q_num] for q_num in range(len(qs))]
                                                     ,batch_size=batch_size)
            y_val_pred_embed = model_embed.predict([X_val] + [Z_val[: ,q_num] for q_num in range(len(qs))]
                                                    ,batch_size=batch_size)
            y_test_pred_embed = model_embed.predict([X_test] + [Z_test[: ,q_num] for q_num in range(len(qs))]
                                                    ,batch_size=batch_size)

            if target == "binary":
                eval_res_train_embed = get_metrics(y_train[:,0], y_train_pred_embed, target=target)
                eval_res_val_embed = get_metrics(y_val[:,0], y_val_pred_embed, target=target)
                eval_res_test_embed = get_metrics(y_test[:,0], y_test_pred_embed, target=target)
            elif target == "categorical":
                eval_res_train_embed = get_metrics(y_train, y_train_pred_embed, target=target)
                eval_res_val_embed = get_metrics(y_val, y_val_pred_embed, target=target)
                eval_res_test_embed = get_metrics(y_test, y_test_pred_embed, target=target)

            eval_res_train_embed, eval_res_test_embed        



            ##### Document Results #####
            
            results[dataset_name][fold_num]["histories"] = {"GMENN": history.history,
                                                       "Ignore": history_nn.history,
                                                       "TE": history_nn_te.history,
                                                       "OHE": history_nn_ohe.history,
                                                       "Embedding": history_nn_embed.history,
                                                      }
            
            results[dataset_name][fold_num]["predictions"] = {"GMENN": [y_train_pred_gmenn, y_val_pred_gmenn, y_test_pred_gmenn],
                                                        "GMENN (FE)": [y_train_pred_gmenn_fe, y_val_pred_gmenn_fe, y_test_pred_gmenn_fe],
                                                        "Ignore": [y_train_pred_nn, y_val_pred_nn, y_test_pred_nn],
                                                        "TE": [y_train_pred_nn_te, y_val_pred_nn_te, y_test_pred_nn_te],
                                                        "OHE": [y_train_pred_nn_ohe, y_val_pred_nn_ohe, y_test_pred_nn_ohe],
                                                        "Embedding": [y_train_pred_embed, y_val_pred_embed, y_test_pred_embed],
                                                     }
            
            results[dataset_name][fold_num]["times"] = {"GMENN": fit_time_gmenn,
                                                   "Ignore": fit_time_nn,
                                                   "TE": fit_time_te,
                                                   "OHE": fit_time_ohe,
                                                   "Embedding": fit_time_embed,
                                                      }
            
            results[dataset_name][fold_num]["other_info"] = {
                "GMENN": {
                    "_stddev_z": np.array([i.numpy() for i in me_model.data_model._stddev_z]),
                    "acceptance_rates": np.array(me_model.acceptance_rates),
                    "random_effects": me_model.mean_samples,
                    "all_samples": me_model.all_samples,
                    "stds": me_model.stds
                },
            }
            
            
            with open(f"{save_path}//results_RS{RS}_{dataset_name}_iter{fold_num}.pickle", 'wb') as handle:
                pickle.dump(results[dataset_name][fold_num], handle, protocol=pickle.HIGHEST_PROTOCOL)
            
            
            del X_train, X_val, X_test, y_train, y_val, y_test
            del z_target_encoded_train, z_target_encoded_val, z_target_encoded_test
            del z_ohe_encoded_train, z_ohe_encoded_val, z_ohe_encoded_test
            
            gc.collect()
        else:
            print(f"Load results for dataset {dataset_name}, iteration={fold_num}")
            with open(f"{save_path}/results_RS{RS}_{dataset_name}_iter{fold_num}.pickle", 'rb') as handle:
                results[dataset_name][fold_num] = pickle.load(handle)
        


Start training procedure for eucalyptus
Fold no. 0
Load results for dataset eucalyptus, iteration=0
Fold no. 1
Load results for dataset eucalyptus, iteration=1
Fold no. 2
Load results for dataset eucalyptus, iteration=2
Fold no. 3
Load results for dataset eucalyptus, iteration=3
Fold no. 4
Load results for dataset eucalyptus, iteration=4
Start training procedure for Midwest_survey
Fold no. 0
Load results for dataset Midwest_survey, iteration=0
Fold no. 1
Load results for dataset Midwest_survey, iteration=1
Fold no. 2
Load results for dataset Midwest_survey, iteration=2
Fold no. 3
Load results for dataset Midwest_survey, iteration=3
Fold no. 4
Load results for dataset Midwest_survey, iteration=4
Start training procedure for hpc-job-scheduling
Fold no. 0
Load results for dataset hpc-job-scheduling, iteration=0
Fold no. 1
Load results for dataset hpc-job-scheduling, iteration=1
Fold no. 2
Load results for dataset hpc-job-scheduling, iteration=2
Fold no. 3
Load results for dataset hpc-job-

## Evaluation

### Performance

In [3]:
models = ["GMENN", "TE", "OHE", "Embedding","Ignore"]

results_perf = {dataset_name: {num: {model: {}  for model in models} for num in range(folds)} for dataset_name in dataset_names}
for dataset_name in dataset_names:
    try:
        with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
            data_dict = pickle.load(handle)        
    except:
        print(f"dataset {dataset_name} not found") 
    for num in range(folds):
        y_test = data_dict[f"y_test_{num}"]
        n_classes = np.unique(y_test).shape[0]
        y_test = tf.one_hot(data_dict[f"y_test_{num}"],n_classes)
        for model in models:
            try:
                y_pred = results[dataset_name][num]["predictions"][model][2]

                results_perf[dataset_name][num][model] = get_metrics(y_test,y_pred,target)
                results_perf[dataset_name][num][model]["Time"] = results[dataset_name][num]["times"][model]
            except:
                print(f"Set nan for {dataset_name}, {num}")
                results_perf[dataset_name][num][model] = {"Accuracy": np.nan,
                                                          "AUROC": np.nan,
                                                          "F1": np.nan,
                                                          "Time": np.nan}
#                 print(f"Didnt work for {dataset_name}, {num}")


2024-01-18 17:28:03.559391: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [4]:
models = ["GMENN", "TE", "OHE", "Embedding", "Ignore"]

metric = "AUROC"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

round_mean_at = 2
round_std_at = 3

for dataset_name in dataset_names:
    dataset_models = list(results_perf[dataset_name][0].keys())
    use_df = pd.DataFrame([pd.DataFrame(results_perf[dataset_name][fold_num]).loc[metric,models] for fold_num in results_perf[dataset_name].keys()],index=results_perf[dataset_name].keys())
    
    df_mean = pd.DataFrame(use_df.mean(axis=0).round(round_mean_at).astype(str) + " (" + use_df.std(axis=0).round(round_std_at).astype(str) + ")").transpose()
    model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}
    dataset_res_dict[dataset_name] = model_dict
    
    best_models[dataset_name] = use_df.columns[use_df.mean(axis=0).argmax()]

    t_test_res = np.array([stats.ttest_rel(use_df[best_models[dataset_name]].values, use_df[model].values)[1] if model in dataset_models else 0 for model in models]).round(3)
    t_test_res[np.isnan(t_test_res)] = 1.
    t_test_results[dataset_name] = t_test_res
    
res_df = pd.DataFrame(dataset_res_dict).transpose()
    
def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_results[dataset_name][i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

# res_df.style.apply(negative_bold)
res_df

Unnamed: 0,GMENN,TE,OHE,Embedding,Ignore
eucalyptus,0.89 (0.021),0.89 (0.021),0.9 (0.026),0.91 (0.022),0.91 (0.021)
Midwest_survey,0.84 (0.024),0.76 (0.008),0.8 (0.013),0.85 (0.015),0.78 (0.009)
hpc-job-scheduling,0.88 (0.009),0.81 (0.021),0.89 (0.009),0.91 (0.01),0.79 (0.01)
video-game-sales,0.77 (0.008),0.63 (0.016),0.76 (0.005),0.77 (0.01),0.67 (0.01)
okcupid-stem,0.8 (0.005),0.67 (0.017),0.82 (0.004),0.81 (0.005),0.74 (0.005)
Diabetes130US,0.67 (0.007),0.63 (0.008),0.68 (0.003),0.68 (0.003),0.64 (0.005)


In [5]:
res_df.apply(negative_bold)

Unnamed: 0,GMENN,TE,OHE,Embedding,Ignore
eucalyptus,,,font-weight: bold,font-weight: bold,font-weight: bold
Midwest_survey,font-weight: bold,,,font-weight: bold,
hpc-job-scheduling,,,,font-weight: bold,
video-game-sales,font-weight: bold,,,font-weight: bold,
okcupid-stem,,,font-weight: bold,,
Diabetes130US,,,font-weight: bold,,


In [6]:
res_df.columns = ["GMENN", "Ignore",  "TE", "OHE", "Embedding"]
print(res_df.to_latex(index=True))
# res_df

\begin{tabular}{llllll}
\toprule
{} &         GMENN &        Ignore &            TE &           OHE &     Embedding \\
\midrule
eucalyptus         &  0.89 (0.021) &  0.89 (0.021) &   0.9 (0.026) &  0.91 (0.022) &  0.91 (0.021) \\
Midwest\_survey     &  0.84 (0.024) &  0.76 (0.008) &   0.8 (0.013) &  0.85 (0.015) &  0.78 (0.009) \\
hpc-job-scheduling &  0.88 (0.009) &  0.81 (0.021) &  0.89 (0.009) &   0.91 (0.01) &   0.79 (0.01) \\
video-game-sales   &  0.77 (0.008) &  0.63 (0.016) &  0.76 (0.005) &   0.77 (0.01) &   0.67 (0.01) \\
okcupid-stem       &   0.8 (0.005) &  0.67 (0.017) &  0.82 (0.004) &  0.81 (0.005) &  0.74 (0.005) \\
Diabetes130US      &  0.67 (0.007) &  0.63 (0.008) &  0.68 (0.003) &  0.68 (0.003) &  0.64 (0.005) \\
\bottomrule
\end{tabular}



In [7]:
dataset_df = {}
for dataset_name in dataset_names:
    dataset_models = list(results_perf[dataset_name][0].keys())
    use_df = pd.DataFrame([pd.DataFrame(results_perf[dataset_name][fold_num]).loc[metric,models] for fold_num in results_perf[dataset_name].keys()],index=results_perf[dataset_name].keys())
    dataset_df[dataset_name] = use_df
    
mean_df = pd.DataFrame({dataset_name: dataset_df[dataset_name].mean(axis=0) for dataset_name in dataset_names})

# Mean reciprocal rank
print(np.mean((1/mean_df.rank(axis=0,ascending=False)),axis=1).round(2))

# Mean rank
print(np.mean((mean_df.rank(axis=0,ascending=False)),axis=1).round(2))

# Average distance to best
print(mean_df.apply(lambda x: np.abs(x-np.max(x)),axis=0).mean(axis=1).round(4))

mean_df

GMENN        0.46
TE           0.21
OHE          0.58
Embedding    0.75
Ignore       0.28
dtype: float64
GMENN        2.67
TE           4.83
OHE          2.17
Embedding    1.50
Ignore       3.83
dtype: float64
GMENN        0.0146
TE           0.0921
OHE          0.0152
Embedding    0.0015
Ignore       0.0679
dtype: float64


Unnamed: 0,eucalyptus,Midwest_survey,hpc-job-scheduling,video-game-sales,okcupid-stem,Diabetes130US
GMENN,0.894152,0.838486,0.881305,0.773724,0.799715,0.668554
TE,0.890051,0.764253,0.807443,0.627644,0.667454,0.634482
OHE,0.90374,0.804891,0.886275,0.762067,0.815571,0.680065
Embedding,0.913796,0.854892,0.905643,0.772525,0.809206,0.678419
Ignore,0.90798,0.779084,0.791452,0.673818,0.740299,0.64351


In [8]:
np.array(mean_df)

array([[0.8941517 , 0.83848622, 0.88130544, 0.77372364, 0.79971493,
        0.66855443],
       [0.89005096, 0.76425345, 0.80744309, 0.62764355, 0.66745428,
        0.63448187],
       [0.90373965, 0.80489136, 0.88627461, 0.76206654, 0.81557086,
        0.68006545],
       [0.91379603, 0.85489233, 0.90564332, 0.77252502, 0.80920638,
        0.67841862],
       [0.90798047, 0.77908409, 0.79145216, 0.67381812, 0.74029944,
        0.64351008]])

In [9]:
3e-4, 1e-6

(0.0003, 1e-06)

In [10]:
print(pd.DataFrame(np.mean((1/mean_df.rank(axis=0,ascending=False)),axis=1)).transpose().round(2).to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  GMENN &    TE &   OHE &  Embedding &  Ignore \\
\midrule
0 &   0.46 &  0.21 &  0.58 &       0.75 &    0.28 \\
\bottomrule
\end{tabular}



### Time

In [11]:
models = ["GMENN", "TE", "OHE", "Embedding", "Ignore"]
metric = "Time"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

round_mean_at = 2
round_std_at = 3

for dataset_name in dataset_names:
    dataset_models = list(results_perf[dataset_name][0].keys())
    use_df = pd.DataFrame([pd.DataFrame(results_perf[dataset_name][fold_num]).loc[metric,models] for fold_num in results_perf[dataset_name].keys()],index=results_perf[dataset_name].keys())/60
    
    df_mean = pd.DataFrame(use_df.mean(axis=0).round(round_mean_at).astype(str) + " (" + use_df.std(axis=0).round(round_std_at).astype(str) + ")").transpose()
    model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}
    dataset_res_dict[dataset_name] = model_dict
    
    best_models[dataset_name] = use_df.columns[use_df.mean(axis=0).argmin()]

    t_test_res = np.array([stats.ttest_rel(use_df[best_models[dataset_name]].values, use_df[model].values)[1] if model in dataset_models else 0 for model in models]).round(3)
    t_test_res[np.isnan(t_test_res)] = 1.
    t_test_results[dataset_name] = t_test_res
    
res_df = pd.DataFrame(dataset_res_dict).transpose()
    
def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_results[dataset_name][i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

# res_df.style.apply(negative_bold)
res_df

Unnamed: 0,GMENN,TE,OHE,Embedding,Ignore
eucalyptus,16.39 (17.547),1.3 (0.273),1.63 (0.297),1.15 (0.132),1.57 (0.208)
Midwest_survey,11.08 (4.698),2.85 (0.446),3.28 (0.47),3.88 (0.398),3.45 (0.423)
hpc-job-scheduling,30.73 (40.96),1.82 (0.153),3.11 (0.825),2.45 (0.664),2.2 (0.532)
video-game-sales,5.36 (1.804),1.71 (0.602),6.26 (0.493),3.85 (1.098),4.7 (1.274)
okcupid-stem,8.73 (1.406),1.06 (0.121),5.31 (0.738),2.09 (0.171),7.2 (1.358)
Diabetes130US,21.97 (7.297),1.64 (0.467),3.16 (0.543),2.11 (0.168),6.33 (0.869)


In [12]:
t_test_results

{'eucalyptus': array([0.123, 0.167, 0.012, 1.   , 0.003]),
 'Midwest_survey': array([0.02 , 1.   , 0.267, 0.049, 0.03 ]),
 'hpc-job-scheduling': array([0.189, 1.   , 0.016, 0.083, 0.192]),
 'video-game-sales': array([0.018, 1.   , 0.   , 0.042, 0.005]),
 'okcupid-stem': array([0.   , 1.   , 0.   , 0.   , 0.001]),
 'Diabetes130US': array([0.003, 1.   , 0.012, 0.025, 0.   ])}

In [13]:
res_df.apply(negative_bold)

Unnamed: 0,GMENN,TE,OHE,Embedding,Ignore
eucalyptus,font-weight: bold,font-weight: bold,,font-weight: bold,
Midwest_survey,,font-weight: bold,font-weight: bold,,
hpc-job-scheduling,font-weight: bold,font-weight: bold,,font-weight: bold,font-weight: bold
video-game-sales,,font-weight: bold,,,
okcupid-stem,,font-weight: bold,,,
Diabetes130US,,font-weight: bold,,,


In [14]:
res_df.columns = ["MC-GMENN", "TE", "OHE", "Embedding", "Ignore"]
print(res_df.to_latex(index=True))
res_df

\begin{tabular}{llllll}
\toprule
{} &        MC-GMENN &            TE &           OHE &     Embedding &        Ignore \\
\midrule
eucalyptus         &  16.39 (17.547) &   1.3 (0.273) &  1.63 (0.297) &  1.15 (0.132) &  1.57 (0.208) \\
Midwest\_survey     &   11.08 (4.698) &  2.85 (0.446) &   3.28 (0.47) &  3.88 (0.398) &  3.45 (0.423) \\
hpc-job-scheduling &   30.73 (40.96) &  1.82 (0.153) &  3.11 (0.825) &  2.45 (0.664) &   2.2 (0.532) \\
video-game-sales   &    5.36 (1.804) &  1.71 (0.602) &  6.26 (0.493) &  3.85 (1.098) &   4.7 (1.274) \\
okcupid-stem       &    8.73 (1.406) &  1.06 (0.121) &  5.31 (0.738) &  2.09 (0.171) &   7.2 (1.358) \\
Diabetes130US      &   21.97 (7.297) &  1.64 (0.467) &  3.16 (0.543) &  2.11 (0.168) &  6.33 (0.869) \\
\bottomrule
\end{tabular}



Unnamed: 0,MC-GMENN,TE,OHE,Embedding,Ignore
eucalyptus,16.39 (17.547),1.3 (0.273),1.63 (0.297),1.15 (0.132),1.57 (0.208)
Midwest_survey,11.08 (4.698),2.85 (0.446),3.28 (0.47),3.88 (0.398),3.45 (0.423)
hpc-job-scheduling,30.73 (40.96),1.82 (0.153),3.11 (0.825),2.45 (0.664),2.2 (0.532)
video-game-sales,5.36 (1.804),1.71 (0.602),6.26 (0.493),3.85 (1.098),4.7 (1.274)
okcupid-stem,8.73 (1.406),1.06 (0.121),5.31 (0.738),2.09 (0.171),7.2 (1.358)
Diabetes130US,21.97 (7.297),1.64 (0.467),3.16 (0.543),2.11 (0.168),6.33 (0.869)


In [15]:
dataset_df = {}
for dataset_name in dataset_names:
    dataset_models = list(results_perf[dataset_name][0].keys())
    use_df = pd.DataFrame([pd.DataFrame(results_perf[dataset_name][fold_num]).loc[metric,models] for fold_num in results_perf[dataset_name].keys()],index=results_perf[dataset_name].keys())/60
    dataset_df[dataset_name] = use_df
    
mean_df = pd.DataFrame({dataset_name: dataset_df[dataset_name].mean(axis=0) for dataset_name in dataset_names})#*-1

# Mean reciprocal rank
print(np.mean((1/(mean_df*-1).rank(axis=0,ascending=False)),axis=1).round(2))

# Mean rank
print(np.mean(((mean_df*-1).rank(axis=0,ascending=False)),axis=1).round(2))

# Average distance to best
# In %
print(mean_df.apply(lambda x: (-100*(np.min(x)-x)/np.min(x)),axis=0).mean(axis=1).round(4))

# absolute
print(mean_df.apply(lambda x: np.abs(x-np.min(x)),axis=0).mean(axis=1).round(4))

mean_df

GMENN        0.21
TE           0.92
OHE          0.31
Embedding    0.51
Ignore       0.33
dtype: float64
GMENN        4.83
TE           1.17
OHE          3.50
Embedding    2.33
Ignore       3.17
dtype: float64
GMENN        897.6621
TE             2.2786
OHE          147.7971
Embedding     53.5333
Ignore       186.1027
dtype: float64
GMENN        14.0079
TE            0.0261
OHE           2.0885
Embedding     0.8829
Ignore        2.5362
dtype: float64


Unnamed: 0,eucalyptus,Midwest_survey,hpc-job-scheduling,video-game-sales,okcupid-stem,Diabetes130US
GMENN,16.394967,11.081767,30.734,5.361667,8.731633,21.970533
TE,1.303432,2.853222,1.816757,1.70894,1.064785,1.636771
OHE,1.630178,3.284085,3.107877,6.26356,5.311045,3.161266
Embedding,1.146667,3.8831,2.446967,3.8487,2.0923,2.106633
Ignore,1.571267,3.448433,2.199333,4.6994,7.199933,6.326


In [16]:
print(pd.DataFrame(np.mean((1/(-1*mean_df).rank(axis=0,ascending=False)),axis=1).round(2)).transpose().to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  GMENN &    TE &   OHE &  Embedding &  Ignore \\
\midrule
0 &   0.21 &  0.92 &  0.31 &       0.51 &    0.33 \\
\bottomrule
\end{tabular}



In [17]:
print(pd.DataFrame(mean_df.apply(lambda x: np.abs(x-np.min(x)),axis=0).mean(axis=1).round(2)).transpose().to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &  GMENN &    TE &   OHE &  Embedding &  Ignore \\
\midrule
0 &  14.01 &  0.03 &  2.09 &       0.88 &    2.54 \\
\bottomrule
\end{tabular}



### Learned variance

In [18]:
tf.random.set_seed(RS)
stds_all = {}
re_all = {}
sig_df_dict = {}
z_col_dict = {}

for dataset_name in dataset_names:    
    stds_all[dataset_name] = {}
    re_all[dataset_name] = {}

    print(f"Load results for {dataset_name}")
    data_path = f"{mode}_RS{RS}_hct{hct}"
    if mode == "cv":
        data_path += f"_{folds}folds"
    elif mode == "train_test":
        data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
    elif mode == "train_val_test":
        data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"

    with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)

        Z_train = data_dict[f"Z_train_{fold_num}"]
        Z_val = data_dict[f"Z_val_{fold_num}"]
        Z_test = data_dict[f"Z_test_{fold_num}"]
        
        y_train = data_dict[f"y_train_{fold_num}"]
        y_val = data_dict[f"y_val_{fold_num}"]
        y_test = data_dict[f"y_test_{fold_num}"]
    
        qs = np.max([tf.reduce_max(Z_train, axis=0),tf.reduce_max(Z_val, axis=0),tf.reduce_max(Z_test, axis=0)],axis=0)+1
        
        
    z_col_dict[dataset_name] = [col + f" (Q={qs[num]})" for num, col in enumerate(data_dict["z_cols"])]
    for fold_num in range(folds):
        y_train = data_dict[f"y_train_{fold_num}"]

        random_effects = results[dataset_name][fold_num]["other_info"]["GMENN"]["random_effects"]
        learned_stds = results[dataset_name][fold_num]["other_info"]["GMENN"]["_stddev_z"]**2

        stds_all[dataset_name][fold_num] = learned_stds
        re_all[dataset_name][fold_num] = random_effects
        
    df_std_mean = pd.DataFrame(np.array(list(stds_all[dataset_name].values())).mean(axis=0).round(2)).astype(str).transpose()
    df_std_std = pd.DataFrame(np.array(list(stds_all[dataset_name].values())).mean(axis=0).round(3)).astype(str).transpose()

    sig_df_dict[dataset_name] = df_std_mean+" (" + df_std_std + ")"


Load results for eucalyptus
Load results for Midwest_survey
Load results for hpc-job-scheduling
Load results for video-game-sales
Load results for okcupid-stem
Load results for Diabetes130US


In [19]:
# y_cols_raw = {"eucalyptus": "Utility",
#               "Midwest_survey": "Location..Census.Region.",
#               "hpc-job-scheduling":"Class",
#               "video-game-sales": "Genre",
#               "okcupid-stem": "job",
#               "Diabetes130US": "readmitted"}
# u,c = np.unique(pd.read_csv(f"../data/raw/{dataset_name}/{dataset_name}.csv")[y_cols_raw[dataset_name]],return_counts=True)



In [20]:
def get_latex_df(sig_df_dict,dataset_name):
    df_stds = pd.DataFrame(sig_df_dict[dataset_name]).transpose()
    df_stds.index=z_col_dict[dataset_name]
    df_stds.columns=[f"$c={i+1}$" for i in range(sig_df_dict[dataset_name].shape[0])]
    return df_stds


In [21]:
dataset_name = "eucalyptus"
print(get_latex_df(sig_df_dict,dataset_name).to_latex(index=True))
get_latex_df(sig_df_dict,dataset_name)

\begin{tabular}{llllll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ &         \$c=4\$ &         \$c=5\$ \\
\midrule
Abbrev (Q=16)   &  0.01 (0.013) &  0.06 (0.062) &  0.36 (0.358) &   0.0 (0.004) &  0.71 (0.713) \\
Map\_Ref (Q=14)  &  0.28 (0.276) &  0.16 (0.165) &  0.02 (0.015) &   0.0 (0.004) &  0.37 (0.366) \\
Latitude (Q=12) &   0.05 (0.05) &  0.03 (0.026) &  0.02 (0.017) &   0.0 (0.002) &  0.06 (0.055) \\
Sp (Q=27)       &   0.2 (0.198) &  0.06 (0.062) &   0.02 (0.02) &  0.14 (0.139) &   0.1 (0.095) \\
\bottomrule
\end{tabular}



Unnamed: 0,$c=1$,$c=2$,$c=3$,$c=4$,$c=5$
Abbrev (Q=16),0.01 (0.013),0.06 (0.062),0.36 (0.358),0.0 (0.004),0.71 (0.713)
Map_Ref (Q=14),0.28 (0.276),0.16 (0.165),0.02 (0.015),0.0 (0.004),0.37 (0.366)
Latitude (Q=12),0.05 (0.05),0.03 (0.026),0.02 (0.017),0.0 (0.002),0.06 (0.055)
Sp (Q=27),0.2 (0.198),0.06 (0.062),0.02 (0.02),0.14 (0.139),0.1 (0.095)


In [22]:
dataset_name = "Midwest_survey"
df_stds = get_latex_df(sig_df_dict,dataset_name)
df_stds.index = ["Question"+df_stds.index[0].split(" ")[1]]
# pd.concat({dataset_name: pd.DataFrame([df_stds.iloc[:,:5],df_stds.iloc[:,5:]])})
print(df_stds.iloc[:,:5].to_latex(index=True))
print(df_stds.iloc[:,5:].to_latex(index=True))
df_stds

\begin{tabular}{llllll}
\toprule
{} &        \$c=1\$ &        \$c=2\$ &         \$c=3\$ &         \$c=4\$ &         \$c=5\$ \\
\midrule
Question(Q=677) &  2.04 (2.04) &  0.69 (0.69) &  2.18 (2.179) &  3.13 (3.125) &  1.43 (1.428) \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &        \$c=6\$ &         \$c=7\$ &        \$c=8\$ &         \$c=9\$ &        \$c=10\$ \\
\midrule
Question(Q=677) &  1.0 (0.998) &  3.06 (3.059) &  2.2 (2.197) &  2.65 (2.651) &  3.99 (3.991) \\
\bottomrule
\end{tabular}



Unnamed: 0,$c=1$,$c=2$,$c=3$,$c=4$,$c=5$,$c=6$,$c=7$,$c=8$,$c=9$,$c=10$
Question(Q=677),2.04 (2.04),0.69 (0.69),2.18 (2.179),3.13 (3.125),1.43 (1.428),1.0 (0.998),3.06 (3.059),2.2 (2.197),2.65 (2.651),3.99 (3.991)


In [23]:
dataset_name = "hpc-job-scheduling"
print(get_latex_df(sig_df_dict,dataset_name).to_latex(index=True))
get_latex_df(sig_df_dict,dataset_name)

\begin{tabular}{lllll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ &         \$c=4\$ \\
\midrule
Protocol (Q=14) &  0.41 (0.411) &  0.51 (0.513) &  0.04 (0.036) &  4.66 (4.662) \\
\bottomrule
\end{tabular}



Unnamed: 0,$c=1$,$c=2$,$c=3$,$c=4$
Protocol (Q=14),0.41 (0.411),0.51 (0.513),0.04 (0.036),4.66 (4.662)


In [24]:
dataset_name = "video-game-sales"
df_stds = get_latex_df(sig_df_dict,dataset_name)
# pd.concat({dataset_name: pd.DataFrame([df_stds.iloc[:,:5],df_stds.iloc[:,5:]])})
print(df_stds.iloc[:1,:5].to_latex(index=True))
print(df_stds.iloc[:1,5:10].to_latex(index=True))
print(df_stds.iloc[:1,10:].to_latex(index=True))
print(df_stds.iloc[1:,:5].to_latex(index=True))
print(df_stds.iloc[1:,5:10].to_latex(index=True))
print(df_stds.iloc[1:,10:].to_latex(index=True))
df_stds

\begin{tabular}{llllll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ &         \$c=4\$ &         \$c=5\$ \\
\midrule
Platform (Q=32) &  0.44 (0.443) &  0.28 (0.276) &  0.24 (0.235) &  0.29 (0.293) &  0.39 (0.385) \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &         \$c=6\$ &         \$c=7\$ &         \$c=8\$ &         \$c=9\$ &        \$c=10\$ \\
\midrule
Platform (Q=32) &  0.59 (0.588) &  0.06 (0.064) &  0.15 (0.149) &  0.65 (0.653) &  0.45 (0.445) \\
\bottomrule
\end{tabular}

\begin{tabular}{lll}
\toprule
{} &        \$c=11\$ &     \$c=12\$ \\
\midrule
Platform (Q=32) &  0.14 (0.139) &  0.5 (0.5) \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ &        \$c=4\$ &         \$c=5\$ \\
\midrule
Publisher (Q=478) &  0.81 (0.815) &  2.33 (2.334) &  1.12 (1.118) &  1.61 (1.61) &  0.73 (0.733) \\
\bottomrule
\end{tabular}

\begin{tabular}{llllll}
\toprule
{} &        \$c=6\$ & 

Unnamed: 0,$c=1$,$c=2$,$c=3$,$c=4$,$c=5$,$c=6$,$c=7$,$c=8$,$c=9$,$c=10$,$c=11$,$c=12$
Platform (Q=32),0.44 (0.443),0.28 (0.276),0.24 (0.235),0.29 (0.293),0.39 (0.385),0.59 (0.588),0.06 (0.064),0.15 (0.149),0.65 (0.653),0.45 (0.445),0.14 (0.139),0.5 (0.5)
Publisher (Q=478),0.81 (0.815),2.33 (2.334),1.12 (1.118),1.61 (1.61),0.73 (0.733),1.17 (1.17),1.34 (1.341),2.11 (2.108),0.68 (0.683),0.91 (0.91),2.55 (2.545),0.91 (0.908)


In [25]:
dataset_name = "okcupid-stem"
print(get_latex_df(sig_df_dict,dataset_name).to_latex(index=True))
get_latex_df(sig_df_dict,dataset_name)

\begin{tabular}{llll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ \\
\midrule
body\_type (Q=13)  &  0.03 (0.029) &  0.01 (0.012) &  0.04 (0.045) \\
diet (Q=19)       &  0.02 (0.021) &  0.02 (0.017) &  0.06 (0.061) \\
education (Q=33)  &  0.24 (0.241) &   0.8 (0.799) &  3.91 (3.907) \\
ethnicity (Q=186) &  0.27 (0.265) &  0.27 (0.275) &  0.53 (0.527) \\
location (Q=149)  &  0.16 (0.164) &  0.17 (0.172) &  0.31 (0.307) \\
offspring (Q=16)  &  0.01 (0.007) &  0.02 (0.021) &  0.21 (0.206) \\
pets (Q=16)       &  0.02 (0.024) &  0.08 (0.076) &  0.02 (0.023) \\
religion (Q=46)   &  0.03 (0.031) &  0.08 (0.079) &  0.06 (0.058) \\
sign (Q=49)       &  0.02 (0.018) &  0.03 (0.026) &  0.04 (0.039) \\
speaks (Q=4718)   &  0.81 (0.813) &  0.89 (0.892) &  1.02 (1.024) \\
\bottomrule
\end{tabular}



Unnamed: 0,$c=1$,$c=2$,$c=3$
body_type (Q=13),0.03 (0.029),0.01 (0.012),0.04 (0.045)
diet (Q=19),0.02 (0.021),0.02 (0.017),0.06 (0.061)
education (Q=33),0.24 (0.241),0.8 (0.799),3.91 (3.907)
ethnicity (Q=186),0.27 (0.265),0.27 (0.275),0.53 (0.527)
location (Q=149),0.16 (0.164),0.17 (0.172),0.31 (0.307)
offspring (Q=16),0.01 (0.007),0.02 (0.021),0.21 (0.206)
pets (Q=16),0.02 (0.024),0.08 (0.076),0.02 (0.023)
religion (Q=46),0.03 (0.031),0.08 (0.079),0.06 (0.058)
sign (Q=49),0.02 (0.018),0.03 (0.026),0.04 (0.039)
speaks (Q=4718),0.81 (0.813),0.89 (0.892),1.02 (1.024)


In [26]:
dataset_name = "Diabetes130US"
print(get_latex_df(sig_df_dict,dataset_name).to_latex(index=True))
get_latex_df(sig_df_dict,dataset_name)

\begin{tabular}{llll}
\toprule
{} &         \$c=1\$ &         \$c=2\$ &         \$c=3\$ \\
\midrule
age (Q=10)                      &   0.0 (0.003) &   0.0 (0.002) &     0.0 (0.0) \\
discharge\_disposition\_id (Q=26) &  2.57 (2.569) &   1.0 (1.005) &  0.13 (0.131) \\
admission\_source\_id (Q=17)      &  0.06 (0.064) &  0.05 (0.047) &  0.14 (0.137) \\
payer\_code (Q=19)               &  0.03 (0.032) &  0.01 (0.007) &   0.0 (0.002) \\
medical\_specialty (Q=71)        &  0.02 (0.025) &   0.04 (0.04) &   0.02 (0.02) \\
diag\_1 (Q=674)                  &  0.06 (0.062) &  0.12 (0.118) &  0.06 (0.061) \\
diag\_2 (Q=684)                  &  0.06 (0.058) &  0.09 (0.088) &   0.06 (0.06) \\
diag\_3 (Q=732)                  &  0.06 (0.064) &  0.08 (0.082) &  0.06 (0.058) \\
\bottomrule
\end{tabular}



Unnamed: 0,$c=1$,$c=2$,$c=3$
age (Q=10),0.0 (0.003),0.0 (0.002),0.0 (0.0)
discharge_disposition_id (Q=26),2.57 (2.569),1.0 (1.005),0.13 (0.131)
admission_source_id (Q=17),0.06 (0.064),0.05 (0.047),0.14 (0.137)
payer_code (Q=19),0.03 (0.032),0.01 (0.007),0.0 (0.002)
medical_specialty (Q=71),0.02 (0.025),0.04 (0.04),0.02 (0.02)
diag_1 (Q=674),0.06 (0.062),0.12 (0.118),0.06 (0.061)
diag_2 (Q=684),0.06 (0.058),0.09 (0.088),0.06 (0.06)
diag_3 (Q=732),0.06 (0.064),0.08 (0.082),0.06 (0.058)
