In [16]:
# Import libraries
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random
import json
from typing import Union
import numpy as np
from disagreement import Disagreement
import itertools
import joblib
import os
import collections

In [18]:
!ls /storage/scratch/e17-fyp-xai/projects/e17-4yp-using-machine-learning-in-high-stake-settings/code/new/model_outputs/artifacts/random_forest_t_1000_md_10/

random_forest_t_1000_md_10_fold_1_2016-01-07.sav
random_forest_t_1000_md_10_fold_2_2015-09-09.sav
random_forest_t_1000_md_10_fold_3_2015-05-12.sav
random_forest_t_1000_md_10_fold_4_2015-01-12.sav
random_forest_t_1000_md_10_fold_5_2014-09-14.sav
random_forest_t_1000_md_10_fold_6_2014-05-17.sav
test_prediction_fold_1_2016-01-07.csv
test_prediction_fold_2_2015-09-09.csv
test_prediction_fold_3_2015-05-12.csv
test_prediction_fold_4_2015-01-12.csv
test_prediction_fold_5_2014-09-14.csv
test_prediction_fold_6_2014-05-17.csv


In [34]:
# Absolute root directory that holds every model's outputs.
root = "/storage/scratch/e17-fyp-xai/projects/e17-4yp-using-machine-learning-in-high-stake-settings/code/new/model_outputs/"
# Per-model artifact folders are read from <art_root><model name>/.
art_root = root + "artifacts/"
# Base directory under which the analysis results are written.
analysis_path = root + "analysis/2024/"

# Names of the models to compare; each name doubles as the folder name under art_root.
models_list = ["random_forest_t_1000_md_10", "logistic_regression_mi_250_p_l2_standard", "xgb_classifier_t_200_md_10_lr_0.1", 
              "lgbm_classifier_numl_100_md_10_lr_0.05", "nn_lr_0.005_loss_binary_crossentropy_activation_relu_epochs_40"] 

# Prediction CSV that is read from every model's artifact folder.
# NOTE(review): the fold encoded in this file name ("fold_6") and the `fold`
# label below are set independently -- keep them in sync when switching folds.
prediction_file = "test_prediction_fold_6_2014-05-17.csv"

# Label used only to build the output directory name.
fold = 'fold6' 
# Directory that receives the agreement results for this fold.
save_path = analysis_path + f"compare_top_projects/{fold}/"

In [20]:
# Helper functions
# Function to save images
def save_image(caption, path):
    """Save the current matplotlib figure as ``<path>/<caption>.png``.

    Args:
        caption: File name of the image, without the ``.png`` extension.
        path: Directory prefix; it is joined to the file name with a ``/``.
    """
    # plt.savefig writes whatever figure is currently active in pyplot.
    plt.savefig(f'{path}/{caption}.png')

def make_directory(path):
    """Create directory ``path``, including missing parents, if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which left a window
    # in which another process could create the directory and make
    # os.makedirs() fail.
    os.makedirs(path, exist_ok=True)


def save_json(dict_obj: Union[dict, list], path: str):
    """Serialize ``dict_obj`` as 4-space-indented JSON and write it to ``path``.

    Args:
        dict_obj: JSON-serializable dictionary or list.
        path: Destination file; an existing file is overwritten.
    """
    # Serialize first so a non-serializable object fails before the file is
    # opened (and truncated).
    serialized = json.dumps(dict_obj, indent=4)
    with open(path, 'w') as handle:
        handle.write(serialized)

In [35]:
# Make sure the output directory for this fold exists before results are saved into it.
make_directory(save_path)

In [22]:
def get_project_agreement(models_and_projects, model_1, model_2, top_k=1000):
    """Return the share of projects that two models' selections have in common.

    The result is the number of distinct ``'Project ID'`` values present in
    both models' frames, divided by ``top_k``. Order is ignored.

    Args:
        models_and_projects: Mapping from model name to a DataFrame that has a
            ``'Project ID'`` column.
        model_1: Key of the first model in ``models_and_projects``.
        model_2: Key of the second model in ``models_and_projects``.
        top_k: Size of each model's selection, used as the denominator.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        float: ``len(common project IDs) / top_k``.

    Raises:
        ValueError: If ``top_k`` is not positive.
        KeyError: If a model name or the ``'Project ID'`` column is missing.
    """
    if top_k <= 0:
        raise ValueError("top_k must be a positive number")

    projects_1 = set(models_and_projects[model_1]['Project ID'].tolist())
    projects_2 = set(models_and_projects[model_2]['Project ID'].tolist())

    # Set intersection: only membership matters here, not the rank.
    return len(projects_1 & projects_2) / top_k

In [23]:
def get_rank_agreement(models_and_projects, model_1, model_2, top_k=1000):
    """Return the share of positions at which two models list the same project.

    The ``'Project ID'`` columns of both frames are compared element by element
    in row order. The number of positions that hold the same ID is divided by
    ``top_k``. If the frames differ in length, the extra rows of the longer one
    are ignored.

    Args:
        models_and_projects: Mapping from model name to a DataFrame that has a
            ``'Project ID'`` column, already in rank order.
        model_1: Key of the first model in ``models_and_projects``.
        model_2: Key of the second model in ``models_and_projects``.
        top_k: Size of each model's selection, used as the denominator.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        float: ``identical positions / top_k``.

    Raises:
        ValueError: If ``top_k`` is not positive.
        KeyError: If a model name or the ``'Project ID'`` column is missing.
    """
    if top_k <= 0:
        raise ValueError("top_k must be a positive number")

    ranked_1 = models_and_projects[model_1]['Project ID'].tolist()
    ranked_2 = models_and_projects[model_2]['Project ID'].tolist()

    # Count positions with identical IDs. The leftover debug print of every
    # matching pair was removed so the function no longer writes to stdout.
    identical_count = sum(
        1 for item_1, item_2 in zip(ranked_1, ranked_2) if item_1 == item_2
    )

    return identical_count / top_k

In [36]:
# Obtain the first 1000 projects for each model
# models_comparison: "<model A> and <model B>" -> dict of agreement scores (saved as JSON below).
# models_and_projects: model name -> DataFrame of that model's 1000 top-ranked rows.
models_comparison = {}
models_and_projects = {}
for model in models_list:
    # Read file
    print(f'{model}---------------------------')
    predictions = pd.read_csv(f'{art_root}{model}/{prediction_file}')
    # Sort by prediction prob (column "1"), highest first.
    # NOTE(review): rows with equal scores are ordered by the sort's default
    # algorithm; confirm that is acceptable for the position-based rank agreement.
    predictions_sorted = predictions.sort_values("1", ascending=False)
    # Keep the first 1000 rows of the sorted frame (ID and score columns only).
    # This selects by rank, not by a predicted label. The 1000 here must match
    # the divisor (1000) used by get_project_agreement / get_rank_agreement.
    projects_label_1 = predictions_sorted[:1000][["Project ID", "1"]]
    print(projects_label_1)
    # Add to dictionary
    models_and_projects[model] = projects_label_1

# For each unordered pair of models, compute the project agreement and the rank agreement
for model_pair in list(itertools.combinations(models_list, 2)):
    print(f'For {model_pair[0]} and {model_pair[1]} --------------------------')
    # Placeholder values; both entries are overwritten below.
    agreement = {
        'project_agreement': 0,
        'rank_agreement': 0
    }
    project_agreement = get_project_agreement(models_and_projects, model_pair[0], model_pair[1])
    print(f'Project agreement = {project_agreement}')
    agreement['project_agreement'] = project_agreement

    rank_agreement = get_rank_agreement(models_and_projects, model_pair[0], model_pair[1])
    print(f'Rank agreement = {rank_agreement}')
    agreement['rank_agreement'] = rank_agreement

    # Key the result by a human-readable "<model A> and <model B>" label.
    models_comparison[f'{model_pair[0]} and {model_pair[1]}'] = agreement


# Save data: write all pairwise agreement scores to agreement_levels.json under save_path.
save_json(models_comparison, save_path+'agreement_levels.json')


random_forest_t_1000_md_10---------------------------
                            Project ID         1
7624  0465c97b99e8324ba52e7aea2e082bae  0.638533
9623  46d9fcf425c112a87888aa638d7756db  0.636116
1555  337267662f600c3b8829b6b454be3804  0.636116
1378  9cc60b298562c5908679f522aab9aa61  0.634878
1066  680bcd7ab537afee24a8652fa1400d03  0.633206
...                                ...       ...
2342  34278bfc7cfae673b742bbb21d348f26  0.460957
3334  092c37b6e95b7ee1d5821fdfefbda428  0.460920
1791  cbb5aa0846a6b5b6e43aa36180ae3fd6  0.460839
5695  737938e3ca04ec72878d4bbd99d94e20  0.460806
7341  a3530403e2614d5d09006d1e8469b080  0.460738

[1000 rows x 2 columns]
logistic_regression_mi_250_p_l2_standard---------------------------
                            Project ID         1
7791  1dd471974badd566301a954b298b4b1e  1.000000
7091  de40ac9d4da386ec1533ef58782b9ab8  1.000000
1347  956734e1b50ef3c3316060e1ee11bbab  0.999996
9777  5ef40760ffcfe5c5b315720663bbd258  0.999995
1769  16c6aa8ba65bdc