In [None]:
# Code to run simulations on synthetic data to test the accuracy of the new algorithm

In [1]:
import numpy as np
import pandas as pd
import pickle
import sys
import gc
import os
import matplotlib.pyplot as plt
import glob

In [2]:
# Base path
tag1 = "polarity"
path = "/domino/datasets/local/smote_msfb/public_datasets/" + tag1 ## Path of the input storage data
output_loc = "/repos/smote_msfb/public_datasets/" + tag1

In [3]:
# Get all matches that start with "Modelling_Datasets_*"
all_matches = glob.glob(os.path.join(path, "target*"))

# Keep only the matches that are directories
matching_folders = [f for f in all_matches if os.path.isdir(f)]

# Print the matching folders
for folder in matching_folders:
    print(folder)

/domino/datasets/local/smote_msfb/public_datasets/polarity/target


In [4]:
# Known classifier algorithms (lowercase, underscore style)
known_classifiers = ["logistic_regression", "naive_bayes", "knn_jaccard"]

In [13]:
# Datasets to process
file_names = matching_folders

# Initialize dataframe
exp_data = pd.DataFrame()

# Loop through datasets
for file_name in file_names:  
             
    current_folder_path = os.path.join(file_name)
    
    print("Current Folder Path :", current_folder_path)
        
    if not os.path.exists(current_folder_path):             
        continue

    # Get files starting with the correct prefix
    files = [
        f for f in os.listdir(current_folder_path) 
        if f.startswith("Cross_Validation_metrics_") 
    ]
         
    for f in files:
        print("Current file processing :", f)
        file_path = os.path.join(current_folder_path, f)
        if not os.path.exists(file_path):
            continue         

        output_data = pd.read_csv(file_path)

        # Strip off file extension and prefix
        f_base = f.rsplit(".", 1)[0]
        prefix = "Cross_Validation_metrics_"
        if f_base.startswith(prefix):
            full_algo_str = f_base[len(prefix):]

            matched_clf = None
            for clf in known_classifiers:
                if full_algo_str.endswith(clf):
                    matched_clf = clf
                    break

            if matched_clf:
                resample_algo_f = full_algo_str[:-(len(matched_clf) + 1)]  # remove '_' + classifier
                model_algo = matched_clf
            else:
                resample_algo_f = full_algo_str
                model_algo = "unknown"

            output_data["resample_algo"] = resample_algo_f
            output_data["model_algo"] = model_algo

        output_data['file_name'] = file_name[file_name.rfind('/') + 1:] 
        exp_data = pd.concat([exp_data, output_data], ignore_index=True)

Current Folder Path : /domino/datasets/local/smote_msfb/public_datasets/polarity/target
Current file processing : Cross_Validation_metrics_smoten_naive_bayes.zip
Current file processing : Cross_Validation_metrics_smoten_logistic_regression.zip
Current file processing : Cross_Validation_metrics_smoten_knn_jaccard.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_logistic_regression.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_knn_jaccard.zip
Current file processing : Cross_Validation_metrics_smote_msfb1_naive_bayes.zip


In [19]:
exp_data['dataset_name'] = tag1

In [20]:
exp_data.head(9)

Unnamed: 0,Avg_Precision,Avg_Recall,Avg_ROC_AUC,Avg_Gmean,resample_algo,model_algo,file_name,Avg_f1_score,dataset_name
4,0.23631,0.21,0.7134,0.378982,smote_msfb1,knn_jaccard,target,0.11119,polarity
3,1.0,0.09,0.86616,0.293916,smote_msfb1,logistic_regression,target,0.082569,polarity
5,0.416667,0.03,0.63586,0.120569,smote_msfb1,naive_bayes,target,0.027985,polarity
2,0.475,0.09,0.71136,0.292348,smoten,knn_jaccard,target,0.075664,polarity
1,0.4,0.08,0.76524,0.235654,smoten,logistic_regression,target,0.066667,polarity
0,0.118687,0.05,0.47166,0.183389,smoten,naive_bayes,target,0.03518,polarity


In [21]:
exp_data['Avg_f1_score'] = 1 / ( (1/exp_data['Avg_Precision']) + (1/exp_data['Avg_Recall']) )
exp_data = exp_data.sort_values(by=['resample_algo', 'model_algo'])

In [22]:
exp_data['model_algo'].unique()

array(['knn_jaccard', 'logistic_regression', 'naive_bayes'], dtype=object)

In [23]:
exp_data['resample_algo'].unique()

array(['smote_msfb1', 'smoten'], dtype=object)

In [24]:
exp_data.to_csv(output_loc + tag1 + "_model_comparison_results.csv")