## Notebook Overview
- Load the combined dataset and configure train/test splits for multiple cohort variants.
- Run `retraining_model_experiment` to produce paired predictions for baseline and balanced setups.
- Save probability archives, masks, and summary tables to `experiments_data/`.
- Generate random one-hot cohorts to validate class proportion controls.
- Evaluate the random-control models and persist their prediction statistics for later comparison.


# Notebook Description
This notebook generates the repeated-rerun prediction data and saves it for every data variant (baseline, balanced, random, and random balanced).

In [None]:
from helper_functions.notebook_utils import balance_classes, generate_one_hot, retraining_model_experiment


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

import tensorflow as tf

from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_rel
import seaborn as sns


from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline

from keras.optimizers import Adam
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.initializers import GlorotUniform

import keras.backend as K
from sklearn.calibration import calibration_curve

import h5py
import shutil

import os

In [None]:
# Google Drive folder that every artefact in this notebook is read from / written to.
save_dir = '/content/drive/MyDrive/Data/'
save_path = os.path.join(save_dir, 'data_variables.npz')

# Restore the saved arrays from the .npz archive.
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can run
# arbitrary code on load -- acceptable only if this archive is self-generated; confirm.
loaded_data = np.load(save_path, allow_pickle=True)

# Labels plus the "num" and "encoded" feature arrays for each split.
# The raw X_train / X_test arrays are not loaded here.
Y_train, Y_test = loaded_data['Y_train'], loaded_data['Y_test']
X_train_num, X_test_num = loaded_data['X_train_num'], loaded_data['X_test_num']
X_train_encoded, X_test_encoded = loaded_data['X_train_encoded'], loaded_data['X_test_encoded']

# Group names; later cells pair races[j] with column (-5 + j) of the "num" arrays.
races = ['ASIAN', 'BLACK', 'HISPANIC', 'OTHER', 'WHITE']

print("Variables loaded successfully")

# Baseline Predictions

In [None]:
# Load the per-group masks for the baseline data from Google Drive.
masks_hdf5_name = 'baseline_masks.h5'
# Use os.path.join (as the other cells do) instead of string concatenation so the
# path stays valid even if save_dir is ever defined without a trailing slash.
google_drive_save_path = os.path.join(save_dir, masks_hdf5_name)

with h5py.File(google_drive_save_path, 'r') as hdf:
    # Iterating an HDF5 file yields its top-level dataset names (the race names);
    # each dataset is read fully into memory as a NumPy array.
    loaded_masks = {race: np.array(hdf[race]) for race in hdf}

masks = loaded_masks

print(f"Successfully loaded masks from path: {google_drive_save_path}")

# Run the retraining experiment on the original (baseline) data.
# NOTE(review): retraining_model_experiment is defined in helper_functions.notebook_utils;
# presumably it retrains the model num_reruns times and writes intermediate results
# every save_increment reruns under save_name -- confirm against the helper.
mean_differences, p_value_tables = retraining_model_experiment(
    X_train_num, X_train_encoded, Y_train,
    X_test_num, X_test_encoded, Y_test,
    masks, races,
    save_name='baseline',
    num_reruns=50,
    p_values=[0.05, 0.01, 0.001, 0.0001],
    save_increment=5,
    epochs=3,
    batch_size=32,
    seed=42,
)

# Balanced Data Model Predictions

In [None]:
# Load the balanced dataset and its per-group test masks from Google Drive if both
# cached files exist; otherwise build them from the original data and cache them.
balanced_data_save_path = os.path.join(save_dir, 'balanced_data.npz')
masks_hdf5_name = 'balanced_masks.h5'
google_drive_masks_path = os.path.join(save_dir, masks_hdf5_name)

if os.path.exists(balanced_data_save_path) and os.path.exists(google_drive_masks_path):
    # Both cache files are present: reuse them.
    print("Balanced dataset and masks already exist. Loading...")

    # Load balanced dataset
    loaded_data = np.load(balanced_data_save_path)
    X_train_num_bal = loaded_data['X_train_num_bal']
    X_train_encoded_bal = loaded_data['X_train_encoded_bal']
    Y_train_bal = loaded_data['Y_train_bal']
    X_test_num_bal = loaded_data['X_test_num_bal']
    X_test_encoded_bal = loaded_data['X_test_encoded_bal']
    Y_test_bal = loaded_data['Y_test_bal']

    # Load masks: one dataset per race name, read fully into memory.
    with h5py.File(google_drive_masks_path, 'r') as hdf:
        loaded_masks = {race: np.array(hdf[race]) for race in hdf}
    masks_bal = loaded_masks

    print("Balanced dataset and masks loaded successfully.")
else:
    # At least one cache file is missing: rebuild both so they stay in sync.
    print("Balanced dataset and masks do not exist. Creating...")

    # balance_classes is applied to the train and test splits independently.
    X_train_num_bal, X_train_encoded_bal, Y_train_bal = balance_classes(X_train_num, X_train_encoded, Y_train)
    X_test_num_bal, X_test_encoded_bal, Y_test_bal = balance_classes(X_test_num, X_test_encoded, Y_test)

    # Save balanced dataset
    np.savez(
        balanced_data_save_path,
        X_train_num_bal=X_train_num_bal,
        X_train_encoded_bal=X_train_encoded_bal,
        Y_train_bal=Y_train_bal,
        X_test_num_bal=X_test_num_bal,
        X_test_encoded_bal=X_test_encoded_bal,
        Y_test_bal=Y_test_bal,
    )

    print(f"Balanced dataset saved to {balanced_data_save_path}")

    # One boolean mask per race: test rows whose column (-5 + j) equals 1.
    masks_bal = {race: X_test_num_bal[:, -5 + j] == 1 for j, race in enumerate(races)}

    # The file is written in the working directory first and then moved to Drive.
    with h5py.File(masks_hdf5_name, 'w') as hdf:
        for race, mask in masks_bal.items():
            # np.asarray stores the mask as-is; np.stack on an ndarray would re-stack
            # it element by element in Python and raise on an empty mask.
            hdf.create_dataset(race, data=np.asarray(mask))

    shutil.move(masks_hdf5_name, google_drive_masks_path)
    print(f"Masks saved to {google_drive_masks_path}")


In [None]:
# Run the retraining experiment on the class-balanced data and masks.
# Same settings as the baseline run except epochs=1 and save_name='balanced'.
mean_differences, p_value_tables = retraining_model_experiment(
    X_train_num_bal,
    X_train_encoded_bal,
    Y_train_bal,
    X_test_num_bal,
    X_test_encoded_bal,
    Y_test_bal,
    masks_bal,
    races,
    save_name='balanced',
    num_reruns=50,
    p_values=[0.05, 0.01, 0.001, 0.0001],
    save_increment=5,
    epochs=1,
    batch_size=32,
    seed=42,
)

# Model Evaluation with Random Data

In [None]:
# Class proportions handed to generate_one_hot when building the random cohorts.
# Five entries summing to ~1; presumably in the same order as `races`
# (ASIAN, BLACK, HISPANIC, OTHER, WHITE) -- TODO confirm against generate_one_hot.
proportions = [
    0.04483654119629676,
    0.22351021102519597,
    0.0834425362363142,
    0.06299656309889928,
    0.5852141484432938,
]

In [None]:
# Load the random-cohort data and masks from Google Drive if both cached files exist;
# otherwise overwrite the last five columns of the "num" arrays with randomly
# generated one-hot rows, derive the test masks, and cache both.
masks_hdf5_name = 'random_masks.h5'
google_drive_masks_path = os.path.join(save_dir, masks_hdf5_name)
data_save_path = os.path.join(save_dir, 'random_data.npz')

if os.path.exists(google_drive_masks_path) and os.path.exists(data_save_path):
    # Both cache files are present: reuse them.
    print("Files already exist. Loading...")

    # Load masks: one dataset per race name, read fully into memory.
    with h5py.File(google_drive_masks_path, 'r') as hdf:
        loaded_masks = {race: np.array(hdf[race]) for race in hdf}

    rand_masks = loaded_masks

    # Load data
    loaded_data = np.load(data_save_path)
    X_train_num_rand = loaded_data['X_train_num_rand']
    X_test_num_rand = loaded_data['X_test_num_rand']

    print("Random masks and data loaded successfully.")
else:
    # At least one cache file is missing: rebuild both so they stay in sync.
    print("Files do not exist. Creating...")

    n_train = X_train_num.shape[0]
    n_test = X_test_num.shape[0]

    # One generated row per train+test sample; the first n_train rows go to the
    # training copy and the remainder to the test copy.
    # NOTE(review): no seed is passed here, so regenerated cohorts presumably
    # differ between runs -- confirm against generate_one_hot.
    randomn_data = generate_one_hot(proportions, n_train + n_test)

    # Work on copies so the original arrays are left untouched.
    X_train_num_rand = X_train_num.copy()
    X_train_num_rand[:, -5:] = randomn_data[:n_train]
    X_test_num_rand = X_test_num.copy()
    X_test_num_rand[:, -5:] = randomn_data[n_train:]

    # One boolean mask per race: test rows whose column (-5 + j) equals 1.
    rand_masks = {race: X_test_num_rand[:, -5 + j] == 1 for j, race in enumerate(races)}

    # Save masks in HDF5 format (written locally first, then moved to Drive).
    with h5py.File(masks_hdf5_name, 'w') as hdf:
        for race in races:
            # np.asarray stores the mask as-is; np.stack on an ndarray would re-stack
            # it element by element in Python and raise on an empty mask.
            hdf.create_dataset(race, data=np.asarray(rand_masks[race]))

    # Move the HDF5 file to Google Drive
    shutil.move(masks_hdf5_name, google_drive_masks_path)
    print(f'HDF5 file with masks saved to {google_drive_masks_path}')

    # Save X_train_num_rand and X_test_num_rand in an NPZ file
    np.savez(
        data_save_path,
        X_train_num_rand=X_train_num_rand,
        X_test_num_rand=X_test_num_rand,
    )
    print(f'Random data saved to {data_save_path}')

In [None]:
# Run the retraining experiment on the random-cohort data.
# Only the "num" arrays and the masks are the randomised versions; the encoded
# features and the labels are the original ones.
mean_differences, p_value_tables = retraining_model_experiment(
    X_train_num_rand,
    X_train_encoded,
    Y_train,
    X_test_num_rand,
    X_test_encoded,
    Y_test,
    rand_masks,
    races,
    save_name='random',
    num_reruns=50,
    p_values=[0.05, 0.01, 0.001, 0.0001],
    save_increment=5,
    epochs=3,
    batch_size=32,
    seed=42,
)

# Random Balanced Data

In [None]:
# Re-declare the Drive folder and reload the inputs for this section
# (presumably so it can be run on its own, e.g. after a runtime restart).
save_dir = '/content/drive/MyDrive/Data/'

# --- Original data ---------------------------------------------------------
save_path = os.path.join(save_dir, 'data_variables.npz')

# NOTE(review): allow_pickle=True permits unpickling object arrays, which can run
# arbitrary code on load -- acceptable only if the archive is self-generated; confirm.
loaded_data = np.load(save_path, allow_pickle=True)

# The raw X_train / X_test arrays are not loaded here.
Y_train, Y_test = loaded_data['Y_train'], loaded_data['Y_test']
X_train_num, X_test_num = loaded_data['X_train_num'], loaded_data['X_test_num']
X_train_encoded, X_test_encoded = loaded_data['X_train_encoded'], loaded_data['X_test_encoded']

races = ['ASIAN', 'BLACK', 'HISPANIC', 'OTHER', 'WHITE']

print("Original data variables loaded successfully")

# --- Random data -----------------------------------------------------------
# save_path and loaded_data are rebound to the random-data archive from here on.
save_path = os.path.join(save_dir, 'random_data.npz')
loaded_data = np.load(save_path, allow_pickle=True)

X_train_num_rand, X_test_num_rand = loaded_data['X_train_num_rand'], loaded_data['X_test_num_rand']

print("Random data variables loaded successfully")

## Loading (or creating) the random balanced data

In [None]:
# Load the balanced random-cohort dataset and its per-group test masks from Google
# Drive if both cached files exist; otherwise build them and cache them.
random_balanced_data_save_path = os.path.join(save_dir, 'random_balanced_data.npz')
masks_hdf5_name = 'random_balanced_masks.h5'
google_drive_masks_path = os.path.join(save_dir, masks_hdf5_name)

# Load only when BOTH files are present. The previous guard
# (`not exists(data) and not exists(masks)`) sent the "only one file exists" case
# to the load branch, which then failed on the missing file.
if os.path.exists(random_balanced_data_save_path) and os.path.exists(google_drive_masks_path):
    # Load existing files
    print("Random balanced dataset and masks already exist. Loading...")

    # Load random balanced dataset
    loaded_data = np.load(random_balanced_data_save_path)
    X_train_num_rand_bal = loaded_data['X_train_num_rand_bal']
    X_train_enc_rand_bal = loaded_data['X_train_enc_rand_bal']
    Y_train_rand_bal = loaded_data['Y_train_rand_bal']
    X_test_num_rand_bal = loaded_data['X_test_num_rand_bal']
    X_test_enc_rand_bal = loaded_data['X_test_enc_rand_bal']
    Y_test_rand_bal = loaded_data['Y_test_rand_bal']

    # Load masks: one dataset per race name, read fully into memory.
    with h5py.File(google_drive_masks_path, 'r') as hdf:
        loaded_masks = {race: np.array(hdf[race]) for race in hdf}
    masks_rand_bal = loaded_masks

    print("Random balanced dataset and masks loaded successfully.")
else:
    # At least one cache file is missing: rebuild both so they stay in sync.
    # balance_classes is applied to the random-cohort train and test splits independently.
    X_train_num_rand_bal, X_train_enc_rand_bal, Y_train_rand_bal = balance_classes(X_train_num_rand, X_train_encoded, Y_train)
    X_test_num_rand_bal, X_test_enc_rand_bal, Y_test_rand_bal = balance_classes(X_test_num_rand, X_test_encoded, Y_test)

    np.savez(
        random_balanced_data_save_path,
        X_train_num_rand_bal=X_train_num_rand_bal,
        X_train_enc_rand_bal=X_train_enc_rand_bal,
        Y_train_rand_bal=Y_train_rand_bal,
        X_test_num_rand_bal=X_test_num_rand_bal,
        X_test_enc_rand_bal=X_test_enc_rand_bal,
        Y_test_rand_bal=Y_test_rand_bal,
    )

    print(f"Random balanced dataset saved to {random_balanced_data_save_path}")

    # One boolean mask per race: test rows whose column (-5 + j) equals 1.
    masks_rand_bal = {race: X_test_num_rand_bal[:, -5 + j] == 1 for j, race in enumerate(races)}

    # The file is written in the working directory first and then moved to Drive.
    with h5py.File(masks_hdf5_name, 'w') as hdf:
        for race, mask in masks_rand_bal.items():
            # np.asarray stores the mask as-is; np.stack on an ndarray would re-stack
            # it element by element in Python and raise on an empty mask.
            hdf.create_dataset(race, data=np.asarray(mask))

    shutil.move(masks_hdf5_name, google_drive_masks_path)
    print(f"Masks saved to {google_drive_masks_path}")

In [None]:
# Run the retraining experiment on the balanced random-cohort data and masks.
# NOTE(review): save_increment is 3 here but 5 in the other three runs, and
# epochs is 1 as in the balanced run -- confirm both are intentional.
mean_differences, p_value_tables = retraining_model_experiment(
    X_train_num_rand_bal,
    X_train_enc_rand_bal,
    Y_train_rand_bal,
    X_test_num_rand_bal,
    X_test_enc_rand_bal,
    Y_test_rand_bal,
    masks_rand_bal,
    races,
    save_name='rand_bal',
    num_reruns=50,
    p_values=[0.05, 0.01, 0.001, 0.0001],
    save_increment=3,
    epochs=1,
    batch_size=32,
    seed=42,
)