In [None]:
# Aggregate the per-target cross-validation metric files of one public dataset into a single model-comparison CSV

In [7]:
import numpy as np
import pandas as pd
import pickle
import sys
import gc
import os
import matplotlib.pyplot as plt
import glob

In [8]:
# Dataset tag; it selects the sub-folder used for both input and output
tag1 = "medical"
# Folder the stored input data is read from
path = f"/domino/datasets/local/smote_msfb/public_datasets/{tag1}/"
# Folder the aggregated results are written to
output_loc = f"/repos/smote_msfb/public_datasets/{tag1}/"

In [9]:
# Display the resolved output folder as a sanity check
output_loc

'/repos/smote_msfb/public_datasets/medical/'

In [10]:
# Collect every entry directly under `path` whose name starts with "target"
target_pattern = os.path.join(path, "target*")
all_matches = glob.glob(target_pattern)

# Drop anything that is not a directory
matching_folders = list(filter(os.path.isdir, all_matches))

# Show which folders will be processed, one per line
for folder in matching_folders:
    print(folder)

/domino/datasets/local/smote_msfb/public_datasets/medical/target_2
/domino/datasets/local/smote_msfb/public_datasets/medical/target_6
/domino/datasets/local/smote_msfb/public_datasets/medical/target_3
/domino/datasets/local/smote_msfb/public_datasets/medical/target_1


In [11]:
# Classifier names (lowercase, underscore style) that are looked for as the
# trailing part of each metrics file name
known_classifiers = [
    "logistic_regression",
    "naive_bayes",
    "knn_jaccard",
]

In [12]:
# Read every "Cross_Validation_metrics_<resampler>_<classifier>.<ext>" file found in
# the target folders and stack them into one DataFrame (`exp_data`), tagging each
# row with the resampler, the classifier and the folder the file came from.

# Prefix that identifies a metrics file; the algorithm description follows it
METRICS_PREFIX = "Cross_Validation_metrics_"


def _split_algo_name(algo_str, classifiers):
    """Split ``"<resampler>_<classifier>"`` into ``(resampler, classifier)``.

    Parameters
    ----------
    algo_str : str
        File name with the metrics prefix and the extension already removed.
    classifiers : iterable of str
        Classifier names that may appear at the end of ``algo_str``.

    Returns
    -------
    tuple of (str, str)
        ``(resampler, classifier)``. When no classifier name matches, the whole
        string is returned as the resampler and the classifier is ``"unknown"``.
        When ``algo_str`` is exactly a classifier name the resampler is ``""``.
    """
    # Longest names first, so a classifier that is a suffix of another one
    # (e.g. "bayes" vs "naive_bayes") cannot shadow the more specific match.
    for clf in sorted(classifiers, key=len, reverse=True):
        if algo_str == clf:
            return "", clf
        # Require the "_" separator: the slice below removes it, so matching
        # without it would silently drop a real character of the resampler name.
        if algo_str.endswith("_" + clf):
            return algo_str[: -(len(clf) + 1)], clf
    return algo_str, "unknown"


# Datasets to process
file_names = matching_folders

# Per-file frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all previously loaded rows each time.
metric_frames = []

# Loop through datasets
for file_name in file_names:

    current_folder_path = file_name

    print("Current Folder Path :", current_folder_path)

    # The folder may have disappeared since it was listed
    if not os.path.exists(current_folder_path):
        continue

    # Last path component, independent of the separator or a trailing slash
    folder_label = os.path.basename(os.path.normpath(current_folder_path))

    # Get files starting with the correct prefix
    files = [
        f for f in os.listdir(current_folder_path)
        if f.startswith(METRICS_PREFIX)
    ]

    for f in files:
        print("Current file processing :", f)
        file_path = os.path.join(current_folder_path, f)
        if not os.path.exists(file_path):
            continue

        output_data = pd.read_csv(file_path)

        # Strip the extension (text after the last ".") and the prefix.
        # The prefix contains no "." so it always survives the rsplit.
        full_algo_str = f.rsplit(".", 1)[0][len(METRICS_PREFIX):]
        resample_algo_f, model_algo = _split_algo_name(full_algo_str, known_classifiers)

        output_data["resample_algo"] = resample_algo_f
        output_data["model_algo"] = model_algo
        output_data["file_name"] = folder_label
        metric_frames.append(output_data)

# pd.concat raises on an empty list, so fall back to an empty frame when
# no metrics file was found at all.
exp_data = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()

Current Folder Path : /domino/datasets/local/smote_msfb/public_datasets/medical/target_2
Current file processing : Cross_Validation_metrics_smoten_naive_bayes.zip
Current file processing : Cross_Validation_metrics_smoten_logistic_regression.zip
Current file processing : Cross_Validation_metrics_smoten_knn_jaccard.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_logistic_regression.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_knn_jaccard.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_naive_bayes.zip
Current Folder Path : /domino/datasets/local/smote_msfb/public_datasets/medical/target_6
Current file processing : Cross_Validation_metrics_smoten_naive_bayes.zip
Current file processing : Cross_Validation_metrics_smoten_logistic_regression.zip
Current file processing : Cross_Validation_metrics_smoten_knn_jaccard.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_logistic_regression.zip
Current file processing : Cros

In [13]:
# Tag every aggregated row with the dataset it belongs to
exp_data = exp_data.assign(dataset_name=tag1)

In [14]:
# Preview the first nine aggregated rows
exp_data.head(9)

Unnamed: 0,Avg_Precision,Avg_Recall,Avg_ROC_AUC,Avg_Gmean,resample_algo,model_algo,file_name,dataset_name
0,0.626169,0.962291,0.951679,0.869093,smoten,naive_bayes,target_2,medical
1,0.882097,0.943578,0.981098,0.947822,smoten,logistic_regression,target_2,medical
2,0.673707,0.90225,0.92688,0.868286,smoten,knn_jaccard,target_2,medical
3,0.891204,0.93979,0.983595,0.948077,smote_msfb1,logistic_regression,target_2,medical
4,0.733422,0.849389,0.944075,0.866169,smote_msfb1,knn_jaccard,target_2,medical
5,0.75292,0.81168,0.942403,0.853652,smote_msfb1,naive_bayes,target_2,medical
6,0.423385,0.809874,0.900724,0.814363,smoten,naive_bayes,target_6,medical
7,0.815898,0.868277,0.968056,0.916436,smoten,logistic_regression,target_6,medical
8,0.511285,0.765756,0.906537,0.82051,smoten,knn_jaccard,target_6,medical


In [15]:
# F1 is the harmonic mean of precision and recall:
#     F1 = 2 / (1/P + 1/R) = 2*P*R / (P + R)
# The numerator must be 2; with 1 every score comes out at exactly half the true F1.
# A zero precision or recall gives 1/0 = inf in pandas, so the score evaluates to 0.
# NOTE(review): this is F1 of the averaged precision/recall columns, which is not
# necessarily the same as an average of per-fold F1 scores. Confirm this is intended.
precision = exp_data['Avg_Precision']
recall = exp_data['Avg_Recall']
exp_data['Avg_f1_score'] = 2 / ((1 / precision) + (1 / recall))

# Group rows of the same resampler / classifier combination together
exp_data = exp_data.sort_values(by=['resample_algo', 'model_algo'])

In [16]:
# List the distinct classifier names that were parsed from the file names
exp_data['model_algo'].unique()

array(['knn_jaccard', 'logistic_regression', 'naive_bayes'], dtype=object)

In [17]:
# List the distinct resampling-algorithm names that were parsed from the file names
exp_data['resample_algo'].unique()

array(['smote_msfb1', 'smoten'], dtype=object)

In [18]:
# Write the aggregated comparison table to the output folder.
# The DataFrame index is written as the first, unnamed CSV column.
results_csv_path = f"{output_loc}{tag1}_model_comparison_results.csv"
exp_data.to_csv(results_csv_path)