In [8]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import t

Calculate the Pearson correlation coefficients between the D- and P-datasets

In [4]:
def coefficient(d, p):
    """Correlate per-tree operator usage counts with the columns of ``p``.

    For every row of ``d``, the items found at index 2 of each entry of the
    row's ``'Evolution'`` value are tallied. The tallies for the nine known
    operator names are joined to ``p`` on the ``'Tree'`` column, and
    ``DataFrame.corr()`` (default settings, i.e. Pearson) is evaluated on the
    joined frame.

    Parameters
    ----------
    d : pandas.DataFrame
        Must provide a ``'Tree'`` column and an ``'Evolution'`` column whose
        values are iterables of indexable entries.
    p : pandas.DataFrame
        Must provide a ``'Tree'`` column; every other column is correlated
        against the operator counts.

    Returns
    -------
    pandas.DataFrame
        One row per operator name and one column per column of ``p``
        (``'Tree'`` excluded).
    """
    operator_columns = [
        "connect_be",
        "move_be",
        "create_gate",
        "delete_gate",
        "disconnect_be",
        "cross_over",
        "delete_be",
        "change_gate_type",
        "create_be",
    ]

    per_tree_counts = []
    for _, record in d.iterrows():
        tree_id = record['Tree']

        tally = Counter()
        for step in record['Evolution']:
            # Index 2 of each evolution entry carries the operators to count.
            tally.update(step[2])

        # Only the known operators are kept; operators never seen count as 0.
        counts_row = {name: tally.get(name, 0) for name in operator_columns}
        counts_row['Tree'] = tree_id
        per_tree_counts.append(counts_row)

    # Inner join on 'Tree': only trees present in both frames contribute.
    merged = pd.merge(p, pd.DataFrame(per_tree_counts), on="Tree")

    # 'Tree' is only the join key, so it is excluded from the correlation.
    numeric_part = merged.drop(columns=['Tree'])
    p_value_columns = p.drop(columns=['Tree']).columns

    # Keep just the operator-vs-p quadrant of the full correlation matrix.
    return numeric_part.corr().loc[operator_columns, p_value_columns]

To print the results, run the following cell:

In [9]:
# Folder containing the saved data frames.
# The data frames should be in the format written by the learn_ft.py script.
# The default folder, where the data frames required for this test are saved,
# is: saved_data_frames/d_and_p_dataset
# The path is assembled with os.path.join so it also resolves on non-Windows
# systems; on Windows it yields the same backslash-separated string as before.
folder_path = os.path.join('..', 'saved_data_frames', 'd_and_p_dataset', 'COVID_19')

# Significance level of the two-sided confidence interval (0.05 -> 95% CI).
ALPHA = 0.05

# os.listdir returns entries in arbitrary order, so the listings are sorted to
# make the run reproducible across machines and file systems.
file_list = sorted(os.listdir(folder_path))
dataframes = []
# Name of the dataset folder (last path component); not used further in this cell.
filename = os.path.basename(os.path.normpath(folder_path))

for directory in file_list:
    temp_path = os.path.join(folder_path, directory)
    # Skip stray files (e.g. OS metadata) that sit next to the sub-folders.
    if not os.path.isdir(temp_path):
        continue
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    temp_dataframes = [
        pd.read_pickle(os.path.join(temp_path, file_name))
        for file_name in sorted(os.listdir(temp_path))
        if file_name.endswith('.pkl')
    ]
    if len(temp_dataframes) < 2:
        raise ValueError(
            f"Expected at least two .pkl files in {temp_path!r}, "
            f"found {len(temp_dataframes)}"
        )
    dataframes.append(temp_dataframes)

matrices = []

for temp_dataframes in dataframes:
    # The first pickle is passed as `d` and the second as `p`.
    # NOTE(review): assumes the D-dataset file sorts before the P-dataset file
    # in each sub-folder - confirm against the names written by learn_ft.py.
    d = temp_dataframes[0]
    p = temp_dataframes[1]
    matrices.append(coefficient(d, p))

n_matrices = len(matrices)

# Stack the matrices with an outer key per sub-folder; level 1 of the
# resulting index is the operator name, so grouping on it aggregates each
# operator/column cell across all sub-folders.
concat_df = pd.concat(matrices, axis=0, keys=range(n_matrices))

mean_df = concat_df.groupby(level=1).mean()

std_df = concat_df.groupby(level=1).std()

# Standard error of the mean over the correlation matrices.
se_df = std_df / np.sqrt(n_matrices)

# Two-sided critical value of Student's t with n - 1 degrees of freedom.
t_value = t.ppf(1 - ALPHA / 2, df=n_matrices - 1)

margin_of_error_df = se_df * t_value

lower_bound_df = mean_df - margin_of_error_df
upper_bound_df = mean_df + margin_of_error_df

# Each cell of the summary is a (mean, lower bound, upper bound) tuple.
summary_df = mean_df.copy()
for col in mean_df.columns:
    summary_df[col] = list(zip(mean_df[col], lower_bound_df[col], upper_bound_df[col]))

print(summary_df.to_string())


                                                                                spec                                                                    npv                                                                 prec                                                                 mcc                                                                  acc                                                                   s                                                                   dor
change_gate_type    (0.01564999332098381, -0.07831741463253027, 0.10961740127449789)     (-0.015185280950645402, -0.11100815842241343, 0.08063759652112262)  (-0.0007851345740623378, -0.09492389961051434, 0.09335363046238968)  (-0.0007698993292721181, -0.08608291609630943, 0.0845431174377652)   (-0.014342514586721197, -0.10447992775810619, 0.07579489858466379)  (0.056377395858926746, -0.008574937147252437, 0.12132972886510593)       (-0.02612080848527178, -0.172788200984232, 0.12054658401368844)
