In [1]:
# Import libraries
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random
import json
from typing import Union
import numpy as np
import itertools
from disagreement import Disagreement
import joblib

In [2]:
from compactor.MaxNonActivatedCompactor import MaxNonActivatedCompactor
from compactor.ActivatedCompactor import ActivatedCompactor
from compactor.MaxCompactor import MaxCompactor

In [1]:
# List the files saved for this model configuration (IPython shell escape, not Python).
# The listing is only for inspection; nothing is captured into a variable.
!ls /storage/scratch/e17-fyp-xai/projects/e17-4yp-using-machine-learning-in-high-stake-settings/code/new/model_outputs/artifacts/nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40/

nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_1_2016-01-07.h5
nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_2_2015-09-09.h5
nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_3_2015-05-12.h5
nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_4_2015-01-12.h5
nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_5_2014-09-14.h5
nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_6_2014-05-17.h5
test_prediction_fold_1_2016-01-07.csv
test_prediction_fold_2_2015-09-09.csv
test_prediction_fold_3_2015-05-12.csv
test_prediction_fold_4_2015-01-12.csv
test_prediction_fold_5_2014-09-14.csv
test_prediction_fold_6_2014-05-17.csv


In [4]:
# --- Fixed locations ---------------------------------------------------------
root = "/storage/scratch/e17-fyp-xai/projects/e17-4yp-using-machine-learning-in-high-stake-settings/code/new/model_outputs/"
processed_data_path = "/storage/scratch/e17-fyp-xai/projects/e17-4yp-using-machine-learning-in-high-stake-settings/code/processed_data/processed_final_data_latest.csv"

# --- Run selection: edit the lines tagged "Change" for each model / fold ------
model_path = "nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40/" # Change
model_name = "nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40_fold_3_2015-05-12.h5" # Change
fold = 'fold3' # Change
model_type = 'nn' # Types: nn, xgb, lgbm, lr, rf # Change

# Label written to the 'Model' column of the results table
model_type_save = 'NN'

# --- Paths derived from the values above (every directory keeps its trailing "/") ---
xai_root = f"{root}xai/2024/"
art_root = f"{root}artifacts/"
analysis_path = f"{root}analysis/2024/"
json_file_path = f"{xai_root}{model_path}all_exp.json"
save_path = f"{analysis_path}{model_path}agreement_levels_all_explanations/{fold}/"

In [5]:
# On/off switches, one per variant; the names suggest: no compaction, max,
# activated and max-non-activated compaction (presumably matching the
# compactor classes imported above — TODO confirm).
create_nocomp, create_max, create_act, create_maxnonact = True, True, True, True

In [6]:
# Change directory and import
# Order matters: the working directory is switched first and only then is the
# bare `import config` attempted (presumably config.py lives in this code
# directory — TODO confirm).
# NOTE(review): this path is under projects/mad_v3/, whereas `root` and
# `processed_data_path` above are directly under projects/ — confirm that both
# point at the intended checkout.
os.chdir("/storage/scratch/e17-fyp-xai/projects/mad_v3/e17-4yp-using-machine-learning-in-high-stake-settings/code/")
import config
# Categorical column names; read as a module-level global by disagreement_average.
categorical_cols = config.CATEGORICAL_COLS

In [6]:
# Helper functions
# Function to save images
def save_image(caption, path):
    """Save the current pyplot figure as ``<path>/<caption>.png``.

    Args:
        caption: File name without extension.
        path: Directory to write into; joined to the file name with a "/".
    """
    destination = '{}/{}.png'.format(path, caption)
    plt.savefig(destination)

def make_directory(path):
    """Create ``path`` and any missing parent directories.

    Calling it on a directory that already exists is a no-op.  Using
    ``exist_ok=True`` removes the window between the former
    ``os.path.exists`` check and the ``os.makedirs`` call, in which another
    process could create the directory and make the call fail.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

In [18]:
# Make sure the per-fold output directory exists before results are written to it.
make_directory(save_path)

In [7]:
# Read dataframe to get actual label and for the categorical columns
# `processed` is later passed as `raw_data` to disagreement_average, which looks
# up each project's categorical values through the 'Project ID' column.
processed = pd.read_csv(processed_data_path)
# Show the column names as a quick check of what was loaded.
processed.columns

Index(['Unnamed: 0', 'Project ID', 'Project Posted Date', 'Project Type',
       'Project Posted Month', 'Project Subject Category Tree', 'Project Cost',
       'Project Subject Subcategory Tree', 'Project Grade Level Category',
       'Project Resource Category', 'School Metro Type', 'School State',
       'Teacher Project Posted Sequence', 'Label', 'Teacher Success Rate',
       'Teacher Success Rate Imputed', 'School City Success Rate',
       'School City Success Rate Imputed', 'School Success Rate',
       'School Success Rate Imputed', 'School County Success Rate',
       'School County Success Rate Imputed', 'Project Count in State',
       'Project Need Statement Length', 'School City',
       'Project Need Statement', 'Resource Vendor Name', 'Teacher Prefix',
       'Project Short Description Length', 'School County',
       'Project Count in County', 'Project Title', 'Project Essay',
       'Resource Cost Percentage', 'Project Essay Length',
       'School Percentage Free Lun

In [8]:
# Load explanations
# Open the file in a context manager so the handle is closed as soon as the
# JSON has been parsed (the bare json.load(open(...)) form left it open).
with open(json_file_path) as json_file:
    explanations = json.load(json_file)

In [9]:
# Names of the explanation methods stored for the selected fold
xai_keys_list = [*explanations[fold]]
print(xai_keys_list)

['lime_fs_auto_sai_True_nums_1000', 'lime_fs_auto_sai_False_nums_5000', 'deepshap_lpf_None']


In [10]:
# Set n features values
# All sizes are taken from the first explanation method of the selected fold.
first_method_explanations = explanations[fold][xai_keys_list[0]]
top_keys = list(first_method_explanations['top'].keys())
bottom_keys = list(first_method_explanations['bottom'].keys())

# Length of one explanation before compaction, measured once instead of twice.
n_features_uncompressed = len(first_method_explanations['top'][top_keys[0]])
# Hard-coded count used for the compacted explanations; named once so the 10% and
# 20% cut-offs cannot drift apart.
# NOTE(review): presumably the number of features left after compaction — confirm, or derive it from the data.
n_features_compressed = 31

# k values handed to disagreement_average (10% and 20% of the feature count).
top_10_perc_uncompressed = round(n_features_uncompressed * 0.1)
top_20_perc_uncompressed = round(n_features_uncompressed * 0.2)
top_10_perc_compressed = round(n_features_compressed * 0.1)
top_20_perc_compressed = round(n_features_compressed * 0.2)
print(top_10_perc_uncompressed, top_20_perc_uncompressed, top_10_perc_compressed, top_20_perc_compressed)

61 122 3 6


In [11]:
def _activated_feature_names(raw_data, project_id, categorical_columns) -> dict:
    """Map each categorical column to ``"<column>_<value>"`` for one project.

    The value is read from the first row of ``raw_data`` whose 'Project ID'
    equals ``project_id``.  The row filter runs once per project rather than
    once per column.
    """
    project_rows = raw_data[raw_data['Project ID'] == project_id]
    return {
        feature: f"{feature}_{project_rows[feature].values[0]}"
        for feature in categorical_columns
    }


def disagreement_average(explanations1: dict, explanations2: dict, both_local: bool, k: int, features_F: list, method = None, raw_data = None, debug = False) -> dict:
    """Average the four agreement metrics over every instance in ``explanations1``.

    Args:
        explanations1: Local explanations keyed by project id.
        explanations2: When ``both_local`` is True, local explanations keyed by
            the same project ids.  Otherwise a single explanation that is
            compared against every instance of ``explanations1``.
        both_local: Selects how ``explanations2`` is used (see above).
        k: Passed to ``Disagreement.get_disagreement`` (number of top features).
        features_F: Passed through to ``Disagreement.get_disagreement``.
        method: ``"max"``, ``"activated"``, ``"maxnonactivated"`` or anything
            else for no compaction.  ``"activated"`` and ``"maxnonactivated"``
            are applied only when ``raw_data`` is given; otherwise the
            explanations are compared uncompacted without any warning.
        raw_data: DataFrame with a 'Project ID' column and the columns listed
            in the module-level ``categorical_cols``.
        debug: Print the explanations and per-instance results.

    Returns:
        Dict with keys 'feature_agreement', 'rank_agreement', 'sign_agreement'
        and 'signed_rank_agreement', each the mean over all instances rounded
        to three decimals.

    Raises:
        ValueError: if ``explanations1`` is empty (previously an opaque
            ZeroDivisionError), or if any metric of any instance is None.
    """
    metric_names = ('feature_agreement', 'rank_agreement',
                    'sign_agreement', 'signed_rank_agreement')

    n_instances = len(explanations1)
    if n_instances == 0:
        raise ValueError("explanations1 is empty; cannot average agreement over zero instances.")

    disagreement_mean = dict.fromkeys(metric_names, 0.0)

    # explanations_1 is always local
    # explanations_2 can either be local or global
    for project_id in explanations1:
        if debug:
            print(f"******************Project ID: {project_id}************************")

        # Initialize disagreement calculation.
        # Note the argument order: in the non-local case explanations2 goes first.
        if both_local:
            disagreement_calc = Disagreement(explanations1[project_id], explanations2[project_id])
        else:
            disagreement_calc = Disagreement(explanations2, explanations1[project_id])

        if debug:
            print("Initial Explanation 1 \n", disagreement_calc.sorted_explanation1[:k])
            print("Initial Explanation 2 \n", disagreement_calc.sorted_explanation2[:k])

        if method == "max":
            disagreement_calc.compact_features(MaxCompactor(categorical_cols))
        elif method == "activated" and raw_data is not None:
            activated_features = _activated_feature_names(raw_data, project_id, categorical_cols)
            disagreement_calc.compact_features(ActivatedCompactor(activated_features))
        elif method == "maxnonactivated" and raw_data is not None:
            activated_features = _activated_feature_names(raw_data, project_id, categorical_cols)
            disagreement_calc.compact_features(MaxNonActivatedCompactor(activated_features))

        if debug:
            print("After Compact Explanation 1 \n",disagreement_calc.sorted_explanation1[:k])
            print("After Compact Explanation 2 \n",disagreement_calc.sorted_explanation2[:k])

        disagreement = disagreement_calc.get_disagreement(k, features_F)

        if debug:
            print("Disagreement : ", disagreement)

        for key in metric_names:
            # Identity check: `== None` can be overridden by the compared object.
            if disagreement[key] is None:
                raise ValueError(f"The value for key '{key}' in disagreement is None (project id: {project_id}).")
            disagreement_mean[key] += disagreement[key]

        if debug:
            print("****************************************")

    for key in metric_names:
        disagreement_mean[key] = round(disagreement_mean[key] / n_instances, 3)

    if debug:
        print("*********************************************")
        print(disagreement_mean)
        print("*********************************************")

    return disagreement_mean

In [12]:
def load_model_and_feat(model_path, model_name, model_type):
    """Return the global feature importances and feature names of a saved model.

    Args:
        model_path: Directory prefix; concatenated directly with ``model_name``,
            so it must end with a path separator.
        model_name: File name of the saved model.
        model_type: ``'nn'``, ``'lgbm'``, ``'lr'`` or any other value for models
            that expose ``feature_importances_`` and ``feature_names_in_``.

    Returns:
        ``(importance, feat_names)``.  For ``'nn'`` both are ``None``: no global
        importance is read from the network.  For ``'lr'`` the importance is
        the first row of ``coef_``.
    """
    if model_type == 'nn':
        # The previous version loaded the network through `keras`, which this
        # notebook never imports (NameError), and then discarded the model.
        # Nothing is read from it, so the load is skipped.
        return None, None

    # Every remaining model type is a joblib dump at the same location.
    model = joblib.load(f'{model_path}{model_name}')

    if model_type == 'lgbm':
        return model.feature_importances_, model.feature_name_

    if model_type == 'lr':
        return model.coef_[0], model.feature_names_in_

    return model.feature_importances_, model.feature_names_in_

In [13]:
# Rank every feature by the magnitude of its importance (no truncation, no plotting)
def select_top_features(importance, feat_names):
    """Return a Feature/Importance table ordered by descending absolute importance.

    The signed importance values and the original row labels are kept; only
    the row order changes.
    """
    table = pd.DataFrame({"Feature": feat_names, "Importance": importance})
    magnitude = table["Importance"].abs()
    ranked_labels = magnitude.sort_values(ascending=False).index
    return table.reindex(ranked_labels)

In [14]:
# Disabled cell: the code is wrapped in a string literal, so none of it runs and
# `global_explanation` is not defined by this cell.
'''
global_importance, feat_names = load_model_and_feat(art_root+model_path, model_name, model_type)
#print(global_importance, feat_names)
global_explanation = select_top_features(global_importance, feat_names)
global_explanation.values
'''

array([['Project Cost', 0.21832490517874864],
       ['Resource Cost', 0.10584395574442541],
       ['School Success Rate', 0.06696661597145076],
       ...,
       ['Project Subject Subcategory Tree_Special Needs, Warmth, Care & Hunger',
        0.0],
       ['Project Subject Subcategory Tree_Team Sports, Visual Arts', 0.0],
       ['Project Subject Subcategory Tree_Gym & Fitness, Warmth, Care & Hunger',
        0.0]], dtype=object)

In [15]:
# Empty results table: three identifying columns, then one 10% and one 20%
# column for each agreement metric.
df = pd.DataFrame(
    columns=['Explanations', 'Compactor', 'Model']
    + [f'{metric}({share}%)'
       for metric in ('Feature', 'Rank', 'Sign', 'SignedRank')
       for share in (10, 20)]
)

Local vs local: agreement between each pair of local explanation methods

In [12]:
# Column prefix in `df` -> metric key returned by disagreement_average.
metric_columns = {
    'Feature': 'feature_agreement',
    'Rank': 'rank_agreement',
    'Sign': 'sign_agreement',
    'SignedRank': 'signed_rank_agreement',
}

# One entry per result row:
# (row label, enabled switch, `method` argument, `raw_data` argument, k for 10%, k for 20%).
# The uncompacted comparison uses the uncompressed k values; every compactor
# uses the compressed ones.  The create_* switches decide which rows are made.
compactor_specs = [
    ('No compactor', create_nocomp, None, None,
     top_10_perc_uncompressed, top_20_perc_uncompressed),
    ('Max compactor', create_max, 'max', processed,
     top_10_perc_compressed, top_20_perc_compressed),
    ('Activated compactor', create_act, 'activated', processed,
     top_10_perc_compressed, top_20_perc_compressed),
    ('Non-activated max compactor', create_maxnonact, 'maxnonactivated', processed,
     top_10_perc_compressed, top_20_perc_compressed),
]

for key_pair in itertools.combinations(xai_keys_list, 2):
    # Merge the 'top' and 'bottom' instances of each method into one dict.
    exp_combined_1 = explanations[fold][key_pair[0]]['top'] | explanations[fold][key_pair[0]]['bottom']
    exp_combined_2 = explanations[fold][key_pair[1]]['top'] | explanations[fold][key_pair[1]]['bottom']

    # Compute every row for this pair first, so `df` only grows once the whole
    # pair has succeeded.
    pair_rows = []
    for compactor_label, enabled, method, raw_data, k_10, k_20 in compactor_specs:
        if not enabled:
            continue

        agreement_10 = disagreement_average(exp_combined_1, exp_combined_2, True, k_10, None, method, raw_data)
        agreement_20 = disagreement_average(exp_combined_1, exp_combined_2, True, k_20, None, method, raw_data)

        row = {'Explanations': f'{key_pair[0]}_vs_{key_pair[1]}',
               'Compactor': compactor_label,
               'Model': model_type_save}
        for column_prefix, metric_key in metric_columns.items():
            row[f'{column_prefix}(10%)'] = agreement_10[metric_key]
            row[f'{column_prefix}(20%)'] = agreement_20[metric_key]
        pair_rows.append(row)

    # Add to dataframe
    for row in pair_rows:
        df.loc[len(df)] = row



{'feature_agreement': 0.146, 'rank_agreement': 0.007, 'sign_agreement': 0.08, 'signed_rank_agreement': 0.006}
{'feature_agreement': 0.715, 'rank_agreement': 0.433, 'sign_agreement': 0.626, 'signed_rank_agreement': 0.388}
{'feature_agreement': 0.525, 'rank_agreement': 0.334, 'sign_agreement': 0.449, 'signed_rank_agreement': 0.301}
{'feature_agreement': 0.715, 'rank_agreement': 0.433, 'sign_agreement': 0.627, 'signed_rank_agreement': 0.389}
{'feature_agreement': 0.039, 'rank_agreement': 0.003, 'sign_agreement': 0.028, 'signed_rank_agreement': 0.003}
{'feature_agreement': 0.126, 'rank_agreement': 0.055, 'sign_agreement': 0.121, 'signed_rank_agreement': 0.055}


KeyboardInterrupt: 

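Global vs local: agreement between the model's global feature importance and each local explanation method (this cell is currently disabled)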

In [16]:
# Disabled cell: the whole loop is wrapped in a string literal, so nothing here
# runs and no 'Global_vs_*' rows are added to `df`.  It relies on
# `global_explanation`, which is likewise only assigned inside a disabled cell.
'''
for local_key in xai_keys_list:
    exp_combined = explanations[fold][local_key]['top'] | explanations[fold][local_key]['bottom']

    # Top 10%
    disagreement_no_comp_top_10_perc = disagreement_average(exp_combined, global_explanation.values, False, top_10_perc_uncompressed, None)
    disagreement_max_comp_top_10_perc = disagreement_average(exp_combined, global_explanation.values, False, top_10_perc_compressed, None, 'max', processed)
    disagreement_act_comp_top_10_perc = disagreement_average(exp_combined, global_explanation.values, False, top_10_perc_compressed, None, 'activated', processed)
    disagreement_maxnonact_comp_top_10_perc = disagreement_average(exp_combined, global_explanation.values, False, top_10_perc_compressed, None, 'maxnonactivated', processed)

    # Top 20%
    disagreement_no_comp_top_20_perc = disagreement_average(exp_combined, global_explanation.values, False, top_20_perc_uncompressed, None)
    disagreement_max_comp_top_20_perc = disagreement_average(exp_combined, global_explanation.values, False, top_20_perc_compressed, None, 'max', processed)
    disagreement_act_comp_top_20_perc = disagreement_average(exp_combined, global_explanation.values, False, top_20_perc_compressed, None, 'activated', processed)
    disagreement_maxnonact_comp_top_20_perc = disagreement_average(exp_combined, global_explanation.values, False, top_20_perc_compressed, None, 'maxnonactivated', processed)

    # Add to dataframe
    df.loc[len(df)] = {'Explanations' : f'Global_vs_{local_key}', 
                    'Compactor' : 'No compactor', 
                    'Model' : model_type_save, 
                    'Feature(10%)': disagreement_no_comp_top_10_perc['feature_agreement'], 
                    'Feature(20%)': disagreement_no_comp_top_20_perc['feature_agreement'], 
                    'Rank(10%)': disagreement_no_comp_top_10_perc['rank_agreement'], 
                    'Rank(20%)': disagreement_no_comp_top_20_perc['rank_agreement'], 
                    'Sign(10%)': disagreement_no_comp_top_10_perc['sign_agreement'], 
                    'Sign(20%)': disagreement_no_comp_top_20_perc['sign_agreement'], 
                    'SignedRank(10%)': disagreement_no_comp_top_10_perc['signed_rank_agreement'], 
                    'SignedRank(20%)': disagreement_no_comp_top_20_perc['signed_rank_agreement']}
    
    df.loc[len(df)] = {'Explanations' : f'Global_vs_{local_key}', 
                    'Compactor' : 'Max compactor', 
                    'Model' : model_type_save, 
                    'Feature(10%)': disagreement_max_comp_top_10_perc['feature_agreement'], 
                    'Feature(20%)': disagreement_max_comp_top_20_perc['feature_agreement'], 
                    'Rank(10%)': disagreement_max_comp_top_10_perc['rank_agreement'], 
                    'Rank(20%)': disagreement_max_comp_top_20_perc['rank_agreement'], 
                    'Sign(10%)': disagreement_max_comp_top_10_perc['sign_agreement'], 
                    'Sign(20%)': disagreement_max_comp_top_20_perc['sign_agreement'], 
                    'SignedRank(10%)': disagreement_max_comp_top_10_perc['signed_rank_agreement'], 
                    'SignedRank(20%)': disagreement_max_comp_top_20_perc['signed_rank_agreement']}

    df.loc[len(df)] = {'Explanations' : f'Global_vs_{local_key}', 
                    'Compactor' : 'Activated compactor', 
                    'Model' : model_type_save, 
                    'Feature(10%)': disagreement_act_comp_top_10_perc['feature_agreement'], 
                    'Feature(20%)': disagreement_act_comp_top_20_perc['feature_agreement'], 
                    'Rank(10%)': disagreement_act_comp_top_10_perc['rank_agreement'], 
                    'Rank(20%)': disagreement_act_comp_top_20_perc['rank_agreement'], 
                    'Sign(10%)': disagreement_act_comp_top_10_perc['sign_agreement'], 
                    'Sign(20%)': disagreement_act_comp_top_20_perc['sign_agreement'], 
                    'SignedRank(10%)': disagreement_act_comp_top_10_perc['signed_rank_agreement'], 
                    'SignedRank(20%)': disagreement_act_comp_top_20_perc['signed_rank_agreement']}

    
    df.loc[len(df)] = {'Explanations' : f'Global_vs_{local_key}', 
                    'Compactor' : 'Non-activated max compactor', 
                    'Model' : model_type_save, 
                    'Feature(10%)': disagreement_maxnonact_comp_top_10_perc['feature_agreement'], 
                    'Feature(20%)': disagreement_maxnonact_comp_top_20_perc['feature_agreement'], 
                    'Rank(10%)': disagreement_maxnonact_comp_top_10_perc['rank_agreement'], 
                    'Rank(20%)': disagreement_maxnonact_comp_top_20_perc['rank_agreement'], 
                    'Sign(10%)': disagreement_maxnonact_comp_top_10_perc['sign_agreement'], 
                    'Sign(20%)': disagreement_maxnonact_comp_top_20_perc['sign_agreement'], 
                    'SignedRank(10%)': disagreement_maxnonact_comp_top_10_perc['signed_rank_agreement'], 
                    'SignedRank(20%)': disagreement_maxnonact_comp_top_20_perc['signed_rank_agreement']}

'''    

In [20]:
#Save df as csv
# The file name encodes the model type and fold.  `save_path` already ends with
# "/", so it is concatenated directly.  to_csv is called with its defaults, so
# the row index is written as the first column.
df.to_csv(f'{save_path}agreement_all_exp_{model_type}_{fold}.csv')