In [9]:
import numpy as np
import pandas as pd
import random
import os
import json

from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, roc_auc_score, average_precision_score, precision_recall_curve
import tensorflow as tf

from collections import defaultdict

### Define paths for relevant directories

In [10]:
# Locations are assembled with os.path.join so the notebook resolves them on
# any operating system; on Windows the resulting strings are identical to the
# previous backslash-separated literals.
_DATA_DIR = os.path.join("..", "data")

# CSV holding the labelled ionic-conductivity database.
labelled_conductivity_database_path = os.path.join(
    _DATA_DIR, "ionic_conductivity_database_wo_duplicates_wclusters.csv"
)

# Root directory of the classification splits; the control-generation cell
# appends the validation-strategy sub-directory ("kfold", "lococv", "test").
train_val_split_path = os.path.join(
    _DATA_DIR, "training_validation_testing_splits", "classification"
)

### Generate Controls

In [7]:
# Seed every random number generator used by the notebook before any random
# draw happens, so the shuffled-label controls are repeatable.
seed = 42
for seed_everything in (np.random.seed, tf.random.set_seed, random.seed):
    seed_everything(seed)

# Number of independent label shuffles drawn for each validation strategy.
num_runs = 1000

# results[val_strategy][fold] collects one metrics dict per run.
results = defaultdict(lambda: defaultdict(list))

# Full labelled database; its 'is_superionic' column is what gets shuffled.
labelled_conductivity_database = pd.read_csv(labelled_conductivity_database_path)

def _control_metrics(y_true, y_pred, suffix):
    """Score one set of control predictions against the true labels.

    Returns a dict holding binary accuracy, Matthews correlation coefficient,
    F1, ROC AUC and average precision. Every key ends in ``_<suffix>`` so the
    'avg' and 'shuffled' controls can live side by side in one record.
    """
    return {
        f'binary_accuracy_{suffix}': accuracy_score(y_true, y_pred),
        f'mcc_{suffix}': matthews_corrcoef(y_true, y_pred),
        f'f1_score_{suffix}': f1_score(y_true, y_pred),
        f'roc_auc_{suffix}': roc_auc_score(y_true, y_pred),
        f'pr_auc_{suffix}': average_precision_score(y_true, y_pred),
    }


def _load_control_splits(val_strategy, split_path):
    """Read the splits of one validation strategy from disk.

    Returns a list of ``(fold_name, train_df, eval_df)`` tuples.

    * "kfold" / "lococv": one tuple per fold sub-directory of ``split_path``
      (sorted by name, non-directories ignored), built from that fold's
      train.csv and val.csv.
    * "test": a single tuple named "fold_0" whose training frame is the
      concatenation of train.csv and val.csv from lococv/fold_0 and whose
      evaluation frame is ``split_path``/test.csv.
    """
    if val_strategy == "test":
        train_val_path = os.path.join(train_val_split_path, "lococv", "fold_0")
        train_val_df = pd.concat([
            pd.read_csv(os.path.join(train_val_path, "train.csv")),
            pd.read_csv(os.path.join(train_val_path, "val.csv")),
        ])
        test_df = pd.read_csv(os.path.join(split_path, "test.csv"))
        return [("fold_0", train_val_df, test_df)]

    # Sort for a deterministic fold order and skip stray files (os.listdir
    # returns entries in arbitrary order and includes non-directories).
    fold_names = sorted(
        entry for entry in os.listdir(split_path)
        if os.path.isdir(os.path.join(split_path, entry))
    )
    return [
        (
            fold_name,
            pd.read_csv(os.path.join(split_path, fold_name, "train.csv")),
            pd.read_csv(os.path.join(split_path, fold_name, "val.csv")),
        )
        for fold_name in fold_names
    ]


def _shuffled_predictions(eval_df, shuffled_labels):
    """Return the shuffled label of every row of ``eval_df``, in ``eval_df`` order.

    ``shuffled_labels`` must hold the columns 'icsd_collectioncode' and
    'is_superionic_shuffled'. A left merge keyed on the evaluation frame keeps
    the evaluation row order, so element i of the result belongs to the same
    compound as row i of ``eval_df`` (an inner merge with the database on the
    left would follow the database's row order instead).

    Raises:
        ValueError: if a code of ``eval_df`` is absent from ``shuffled_labels``
            or matches more than one of its rows.
    """
    aligned = eval_df[['icsd_collectioncode']].merge(
        shuffled_labels, on='icsd_collectioncode', how='left', indicator=True
    )
    if len(aligned) != len(eval_df):
        raise ValueError(
            "Some icsd_collectioncode values of the evaluation split match "
            "several rows of the labelled database."
        )
    if (aligned['_merge'] != 'both').any():
        raise ValueError(
            "Some icsd_collectioncode values of the evaluation split are "
            "missing from the labelled database."
        )
    return aligned['is_superionic_shuffled'].to_numpy()


for val_strategy in ["kfold", "lococv", "test"]:

    split_path = os.path.join(train_val_split_path, val_strategy)

    # The split files and the majority-class ("avg") baseline do not depend on
    # the shuffle, so they are read and scored once per strategy instead of
    # once per run.
    fold_baselines = []
    for fold, train_df, eval_df in _load_control_splits(val_strategy, split_path):
        # Predict, for every evaluation row, the rounded mean training label.
        avg_target_train = np.mean(train_df['is_superionic'])
        pred_avg_c = np.full_like(eval_df['is_superionic'].to_numpy(), np.round(avg_target_train))
        avg_metrics = _control_metrics(eval_df['is_superionic'], pred_avg_c, 'avg')
        fold_baselines.append((fold, eval_df, avg_metrics))

    for run in range(num_runs):

        # One global permutation of the labels per run, shared by all folds.
        labelled_conductivity_database['is_superionic_shuffled'] = np.random.permutation(labelled_conductivity_database['is_superionic'].to_numpy())
        shuffled_labels = labelled_conductivity_database[['icsd_collectioncode', 'is_superionic_shuffled']]

        for fold, eval_df, avg_metrics in fold_baselines:
            pred_shuffled_c = _shuffled_predictions(eval_df, shuffled_labels)
            shuffled_metrics = _control_metrics(eval_df['is_superionic'], pred_shuffled_c, 'shuffled')

            # Same record layout as before: the five '*_avg' keys followed by
            # the five '*_shuffled' keys.
            results[val_strategy][fold].append({**avg_metrics, **shuffled_metrics})

# Per-fold summaries, indexed as [val_strategy][fold][metric].
results_avg = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
results_std = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
# Every per-run value pooled across folds, indexed as [val_strategy][metric].
overall_results = defaultdict(lambda: defaultdict(list))

for val_strategy in ["kfold", "lococv", "test"]:
    results_val = results[val_strategy]
    for fold, runs in results_val.items():
        # All runs of a fold carry the same metric keys; take them from the first.
        for metric in runs[0].keys():
            run_values = [run[metric] for run in runs]
            results_avg[val_strategy][fold][metric] = np.mean(run_values)
            results_std[val_strategy][fold][metric] = np.std(run_values)
            overall_results[val_strategy][metric].extend(run_values)

# Mean and (population) standard deviation over the pooled values.
overall_means = defaultdict(lambda: defaultdict(float))
overall_stds = defaultdict(lambda: defaultdict(float))
for val_strategy, pooled_metrics in overall_results.items():
    for metric, pooled_values in pooled_metrics.items():
        overall_means[val_strategy][metric] = np.mean(pooled_values)
        overall_stds[val_strategy][metric] = np.std(pooled_values)

In [8]:
# Bundle the per-fold and pooled control statistics under the keys used in the
# exported JSON file.
controls = {
    'fold_averages': results_avg,
    'fold_standard_deviations': results_std,
    'overall_averages': overall_means,
    'overall_standard_deviations': overall_stds,
}

# Built with os.path.join instead of a plain (non-raw) backslash literal: the
# old literal relied on the invalid escape sequences "\d" and "\c", which newer
# Python versions warn about. On Windows the resulting string is unchanged.
controls_filename = os.path.join("..", "data", "controls.json")
# Writing is left disabled; uncomment to (over)write the controls file.
#with open(controls_filename, "w") as json_file:
#    json.dump(controls, json_file, indent=4)