In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, cohen_kappa_score
import numpy as np
import pingouin as pg
from re import compile
from bitarray import bitarray
# subgroups -> version 0.1.8
from subgroups.core import Subgroup
from subgroups.data_structures.subgroup_list import SubgroupList
from subgroups.quality_measures.wracc import WRAcc
from subgroups.quality_measures.quality_measure import QualityMeasure

# 1. Reading the dataset

In [2]:
# Load the experiment dataset from the CSV file that sits one directory above the notebook.
df = pd.read_csv(filepath_or_buffer="../mimic-iii-for-experiments.csv")

In [3]:
# Show the loaded DataFrame as the cell output (the front-end renders a truncated preview).
df

Unnamed: 0,patient_gender,patient_age,readmission,exitus,admission_type,admission_location,discharge_location,hours_between_admission_and_first_icu,culture_month,culture_specimen_type_description,...,previous_exposure_to_carbapenems,previous_exposure_to_fluorquinolones,previous_exposure_to_aminoglycosides,previous_exposure_to_b_lactam,previous_exposure_to_antifungal_agents,mechanical_ventilation_in_previous_admissions,organ_transplant_in_previous_admissions,hematopoietic_transplant_in_previous_admissions,catheter_in_previous_admissions,culture_microorganism_name_AND_susceptibility
0,M,ELDERLY,yes,yes,EMERGENCY,EMERGENCY_ROOM_ADMIT,DEAD/EXPIRED,0,JANUARY,BLOOD_CULTURE,...,no,yes,no,no,yes,no,no,no,no,ENTEROCOCCUS_SP.-R
1,F,ELDERLY,no,yes,EMERGENCY,EMERGENCY_ROOM_ADMIT,DEAD/EXPIRED,22<x<=112,APRIL,URINE,...,no,no,no,no,no,no,no,no,no,ENTEROCOCCUS_SP.-R
2,F,ELDERLY,no,yes,EMERGENCY,EMERGENCY_ROOM_ADMIT,DEAD/EXPIRED,22<x<=112,MAY,CATHETER_TIP-IV,...,no,no,no,no,no,no,no,no,no,ENTEROCOCCUS_SP.-R
3,F,ADULT,yes,no,EMERGENCY,EMERGENCY_ROOM_ADMIT,HOME,0,MARCH,URINE,...,no,yes,yes,no,yes,yes,no,no,yes,ENTEROCOCCUS_SP.-R
4,M,ELDERLY,yes,yes,EMERGENCY,TRANSFER_FROM_HOSP/EXTRAM,DEAD/EXPIRED,112<x,JANUARY,BLOOD_CULTURE,...,no,yes,no,no,no,no,no,no,no,ENTEROCOCCUS_SP.-R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3739,F,ADULT,no,no,EMERGENCY,CLINIC_REFERRAL/PREMATURE,LONG_TERM_CARE_HOSPITAL,0,AUGUST,SPUTUM,...,no,no,no,no,no,no,no,no,no,noTarget
3740,M,ELDERLY,no,no,EMERGENCY,EMERGENCY_ROOM_ADMIT,SNF,0,MARCH,URINE,...,no,no,no,no,no,no,no,no,no,noTarget
3741,F,ELDERLY,no,no,EMERGENCY,EMERGENCY_ROOM_ADMIT,REHAB/DISTINCT_PART_HOSP,0,JUNE,SPUTUM,...,no,no,no,no,no,no,no,no,no,noTarget
3742,M,ADULT,no,no,EMERGENCY,EMERGENCY_ROOM_ADMIT,DISC-TRAN_CANCER/CHLDRN_H,22<x<=112,OCTOBER,SPUTUM,...,no,no,no,no,no,no,no,no,no,noTarget


In [4]:
# List the attribute (column) names of the dataset.
df.columns

Index(['patient_gender', 'patient_age', 'readmission', 'exitus',
       'admission_type', 'admission_location', 'discharge_location',
       'hours_between_admission_and_first_icu', 'culture_month',
       'culture_specimen_type_description', 'service_when_culture',
       'icu_when_culture', 'treated_with_vancomycin_in_previous_admissions',
       'treated_with_vancomycin_in_the_current_admission_before_culture',
       'previous_colonization_with_enterobacterales_cre',
       'previous_colonization_with_pseudomonas_aeruginosa',
       'previous_colonization_with_acinetobacter_baumannii',
       'previous_colonization_with_sarm',
       'previous_colonization_with_stenotrophomonas_maltophilia',
       'previous_colonization_with_erv',
       'previous_exposure_to_third_generation_cephalosporins',
       'previous_exposure_to_carbapenems',
       'previous_exposure_to_fluorquinolones',
       'previous_exposure_to_aminoglycosides', 'previous_exposure_to_b_lactam',
       'previous_expo

In [5]:
# Report the dataset dimensions: rows are instances, columns are attributes.
print(f"Number of instances: {len(df)!s}")
print(f"Number of attributes: {len(df.columns)!s}")

Number of instances: 3744
Number of attributes: 31


In [6]:
# The target is an (attribute name, attribute value) pair.
target = ("culture_microorganism_name_AND_susceptibility", "ENTEROCOCCUS_SP.-R")
# TP: number of rows whose target attribute equals the target value.
TP = df[target[0]].eq(target[1]).sum()
# FP: number of rows whose target attribute differs from the target value.
FP = df[target[0]].ne(target[1]).sum()

In [7]:
# Report how many instances do / do not match the target value.
print(f"Number of instances in which the target is true: {TP!s}.")
print(f"Number of instances in which the target is false: {FP!s}.")

Number of instances in which the target is true: 1872.
Number of instances in which the target is false: 1872.


# 2. Statistical validation

In [8]:
def load_diverse_topk_subgroup_lists(dataset : pd.DataFrame,
                                     TP : int,
                                     FP : int,
                                     target_attribute_name : str,
                                     target_attribute_value : str,
                                     diverse_topk_subgroup_lists_file_path : str,
                                     quality_measure : QualityMeasure) -> (list, float, float):
    """Parse a text file of subgroup lists and rebuild them against `dataset`.

    The file is read line by line:
    - a line of the form "## Subgroup list (...) ##" starts a new SubgroupList;
    - a line of the form "sN: Description: ..., Target: ..." is parsed as a
      Subgroup and appended to the most recently started SubgroupList;
    - every other line is ignored.

    :param dataset: the dataset over which every subgroup is evaluated.
    :param TP: number of dataset instances in which the target is true
        (passed to the quality measure as TRUE_POPULATION).
    :param FP: number of dataset instances in which the target is false
        (passed to the quality measure as FALSE_POPULATION).
    :param target_attribute_name: name of the target column.
    :param target_attribute_value: value of the target column considered positive.
    :param diverse_topk_subgroup_lists_file_path: path of the text file to parse.
    :param quality_measure: quality measure used to score every subgroup.
    :return: a tuple with (1) the list of SubgroupList objects, (2) the sum of
        the subgroup qualities of each list, averaged over all the lists, and
        (3) the fraction of dataset rows covered by at least one parsed subgroup.
    :raises ValueError: if a subgroup line appears before any subgroup list
        header, or if the file contains no subgroup list header at all.
    """
    number_of_dataset_instances = len(dataset)
    dataset_mask = (dataset[target_attribute_name] == target_attribute_value)
    # Raw strings: "\(" is a regex escape, not a Python string escape.
    # NOTE: 'compile' is re.compile (imported at the top of the notebook), not the builtin.
    regex_object_sl_header = compile(r"^## Subgroup list (?P<n_subgroups>\(.*\)) ##$")
    regex_object_subgroup = compile(r"^s(?P<number>[0-9]+): Description: (?P<description>.+), Target: (?P<target>.+)$")
    list_of_subgroup_lists = []
    # One accumulated quality value per subgroup list, in file order.
    quality_per_subgroup_list = []
    # Built on the dataset's own index so that the element-wise OR below stays
    # aligned even when the dataset does not use a default RangeIndex.
    # assumes subgroup.filter returns boolean Series indexed like 'dataset' -- TODO confirm
    sl_coverage = pd.Series(False, index = dataset.index)
    # The context manager closes the file even if parsing raises.
    with open(diverse_topk_subgroup_lists_file_path, "r") as input_file:
        for line in input_file:
            stripped_line = line.rstrip("\n")
            if regex_object_sl_header.fullmatch(stripped_line):
                # Every subgroup list gets its own fresh positives/negatives bitarrays.
                list_of_subgroup_lists.append( SubgroupList(bitarray(dataset_mask.tolist(), endian = "big"), bitarray((~dataset_mask).tolist(), endian = "big"), number_of_dataset_instances) )
                quality_per_subgroup_list.append(0)
                continue
            match_object_subgroup = regex_object_subgroup.fullmatch(stripped_line)
            if match_object_subgroup is None:
                # Neither a header nor a subgroup line (e.g. coverage details): skip it.
                continue
            if not list_of_subgroup_lists:
                raise ValueError("Subgroup line found before any subgroup list header in '" + str(diverse_topk_subgroup_lists_file_path) + "'.")
            subgroup = Subgroup.generate_from_str("Description: " + match_object_subgroup.group("description") + ", Target: " + match_object_subgroup.group("target"))
            tp_Series, fp_Series, _  = subgroup.filter(dataset)
            list_of_subgroup_lists[-1].add_subgroup(subgroup, bitarray(tp_Series.tolist(), endian = "big"), bitarray(fp_Series.tolist(), endian = "big"))
            subgroup_qm = quality_measure.compute({QualityMeasure.TRUE_POSITIVES : tp_Series.sum(), QualityMeasure.FALSE_POSITIVES : fp_Series.sum(), QualityMeasure.TRUE_POPULATION : TP, QualityMeasure.FALSE_POPULATION : FP})
            quality_per_subgroup_list[-1] = quality_per_subgroup_list[-1] + subgroup_qm
            # A row is covered as soon as any subgroup of any list selects it.
            sl_coverage = sl_coverage | tp_Series | fp_Series
    if not list_of_subgroup_lists:
        raise ValueError("No subgroup list header found in '" + str(diverse_topk_subgroup_lists_file_path) + "'.")
    return list_of_subgroup_lists, sum(quality_per_subgroup_list)/len(quality_per_subgroup_list), sl_coverage.sum()/len(dataset)

In [9]:
def generateClassifier(\
  input_nominal_dataset : pd.DataFrame, target_param : tuple, n_param : int, test_size_param : float, random_seed_param : int,\
  numerical_dataset_to_join : pd.DataFrame, classifier : str) -> dict:
    """Train a classifier on a label-encoded copy of the dataset and collect test metrics.

    :param input_nominal_dataset: dataset with nominal attributes; it is copied, not modified.
    :param target_param: pair (target column name, target value treated as the positive label).
    :param n_param: number of trees; only used when classifier is "RandomForest".
    :param test_size_param: 'test_size' passed to train_test_split.
    :param random_seed_param: random state for both the split and the classifier.
    :param numerical_dataset_to_join: optional DataFrame whose columns are added
        as extra features (assigned column by column); pass None to add nothing.
    :param classifier: either "RandomForest" or "LogisticRegression".
    :return: a dict with the train/test datasets ("rf_dataset_train",
        "rf_dataset_test"), accuracy, F1 and Cohen's kappa for train and test,
        the test confusion matrix and its display object, the test tp/fp/fn/tn
        counts, the number of positive/negative test instances ("test_TP",
        "test_FP"), and "test_score_vector" (1 where the test prediction is
        correct, 0 otherwise). If 'classifier' is not supported, an error is
        printed and None is returned.
    """
    target_column, positive_label = target_param[0], target_param[1]
    result = {}
    # Copy input_nominal_dataset.
    numerical_dataset = input_nominal_dataset.copy()
    # Apply LabelEncoder (re-fitted for every feature column; the target keeps its labels).
    le = LabelEncoder()
    for column in numerical_dataset.columns.drop(target_column):
        numerical_dataset[column] = le.fit_transform(numerical_dataset[column])
    # Add columns of the other dataset.
    if numerical_dataset_to_join is not None:
        for column in numerical_dataset_to_join.columns:
            numerical_dataset[column] = numerical_dataset_to_join[column]
    # Divide into X and y.
    X = numerical_dataset.drop(columns=[target_column])
    y = numerical_dataset[[target_column]]
    # Split dataset.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_param, random_state=random_seed_param, stratify=y)
    result["rf_dataset_train"] = pd.concat([X_train, y_train], axis = 1) 
    result["rf_dataset_test"] = pd.concat([X_test, y_test], axis = 1)
    if classifier == "RandomForest":
        clf = RandomForestClassifier(n_estimators=n_param, random_state=random_seed_param)
    elif classifier == "LogisticRegression":
        clf = LogisticRegression(random_state=random_seed_param, max_iter=1000)
    else:
        print("ERROR!!!!!")
        return None
    # 1-D label vectors: the metric functions expect 1-D targets, not a one-column DataFrame.
    y_train_labels = y_train[target_column]
    y_test_labels = y_test[target_column]
    # Fit.
    clf.fit(X_train, y_train_labels.to_list())
    # Predict once per partition and reuse the predictions for every metric.
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    # Get accuracies.
    result["train_data_accuracy"] = clf.score(X_train, y_train_labels)
    result["test_data_accuracy"] = clf.score(X_test, y_test_labels)
    # Get F1 score.
    result["train_data_f1_score"] = f1_score(y_train_labels, y_train_pred, pos_label=positive_label)
    result["test_data_f1_score"] = f1_score(y_test_labels, y_test_pred, pos_label=positive_label)
    # Get kappa score.
    result["train_data_kappa_score"] = cohen_kappa_score(y_train_labels, y_train_pred)
    result["test_data_kappa_score"] = cohen_kappa_score(y_test_labels, y_test_pred)
    # Get confusion matrix disp. The label order is pinned to clf.classes_ so that
    # the matrix rows/columns always agree with the display labels.
    result["test_confusion_matrix"] = confusion_matrix(y_test_labels, y_test_pred, labels=clf.classes_)
    result["test_confusion_matrix_display"] = ConfusionMatrixDisplay(confusion_matrix=result["test_confusion_matrix"], display_labels=clf.classes_)
    # Get tp, fp, fn and tn. Rows are true labels and columns are predicted labels;
    # the positive class position is looked up instead of assuming it is index 0.
    # (Binary problem: f1_score above already requires it with its default averaging.)
    positive_index = list(clf.classes_).index(positive_label)
    negative_index = 1 - positive_index
    result["test_tp"] = result["test_confusion_matrix"][positive_index][positive_index]
    result["test_fp"] = result["test_confusion_matrix"][negative_index][positive_index]
    result["test_fn"] = result["test_confusion_matrix"][positive_index][negative_index]
    result["test_tn"] = result["test_confusion_matrix"][negative_index][negative_index]
    # Use the function argument, not the notebook-level 'target' variable.
    result["test_TP"] = int((y_test_labels == positive_label).sum())
    result["test_FP"] = int((y_test_labels != positive_label).sum())
    # Get score vector.
    result["test_score_vector"] = (y_test_labels.values == y_test_pred).astype(int).tolist()
    return result

## 2.1. Best diverse top-k phenotypes according to the quality

In [10]:
# Load the subgroup lists used in this section and score every subgroup with WRAcc.
list_of_subgroup_lists, quality, coverage = load_diverse_topk_subgroup_lists(
    dataset = df,
    TP = TP,
    FP = FP,
    target_attribute_name = target[0],
    target_attribute_value = target[1],
    diverse_topk_subgroup_lists_file_path = "../phenotypes/output/0.15_0.6_0.6.txt",
    quality_measure = WRAcc(),
)

In [11]:
# First line: quality returned by the loader; second line: coverage returned by the loader.
print(quality, coverage, sep="\n")

0.16439636752136752
0.936698717948718


In [12]:
# First subgroup list returned by the loader; print its textual description.
sl1 = list_of_subgroup_lists[0]
print(sl1)

## Subgroup list (6 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', icu_when_culture = 'SICU'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 416
	- negative instances covered: 43
	- total instances covered: 459
	Considering it individually:
	- positive instances covered: 416
	- negative instances covered: 43
	- total instances covered: 459
s2: Description: [culture_specimen_type_description = 'URINE', treated_with_vancomycin_in_the_current_admission_before_culture = 'yes'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 304
	- negative instances covered: 82
	- total instances covered: 386
	Considering it individually:
	- positive instances covered: 304
	- negative instances covered: 82
	- total instances covered: 386
s3: Description: [organ_transplant_in_previ

In [13]:
# Second subgroup list returned by the loader; print its textual description.
sl2 = list_of_subgroup_lists[1]
print(sl2)

## Subgroup list (7 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', service_when_culture = 'SURG'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 311
	- negative instances covered: 43
	- total instances covered: 354
	Considering it individually:
	- positive instances covered: 311
	- negative instances covered: 43
	- total instances covered: 354
s2: Description: [culture_specimen_type_description = 'URINE', treated_with_vancomycin_in_previous_admissions = 'yes'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 165
	- negative instances covered: 43
	- total instances covered: 208
	Considering it individually:
	- positive instances covered: 165
	- negative instances covered: 43
	- total instances covered: 208
s3: Description: [hours_between_admission_and_first_icu 

In [14]:
# Third subgroup list returned by the loader; print its textual description.
sl3 = list_of_subgroup_lists[2]
print(sl3)

## Subgroup list (6 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', previous_colonization_with_stenotrophomonas_maltophilia = 'no'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 674
	- negative instances covered: 215
	- total instances covered: 889
	Considering it individually:
	- positive instances covered: 674
	- negative instances covered: 215
	- total instances covered: 889
s2: Description: [organ_transplant_in_previous_admissions = 'yes', previous_colonization_with_erv = 'yes', previous_colonization_with_pseudomonas_aeruginosa = 'no'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 68
	- negative instances covered: 7
	- total instances covered: 75
	Considering it individually:
	- positive instances covered: 84
	- negative instances covered: 7
	- total ins

In [15]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..5 of sl1.
# NOTE: the bound 6 is hard-coded to the number of subgroups in sl1 and must be
# updated by hand if the loaded list changes.
covered_by_sl1 = sl1.get_subgroup_bitarray_of_positives(0) | sl1.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 6):
    covered_by_sl1 = covered_by_sl1 | sl1.get_subgroup_bitarray_of_positives(subgroup_index) | sl1.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl1.
not_covered_by_sl1 = sl1.default_rule_bitarray_of_positives | sl1.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl1 | not_covered_by_sl1).count(1) == len(df)
assert (covered_by_sl1 | not_covered_by_sl1).count(0) == 0

In [16]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..6 of sl2.
# NOTE: the bound 7 is hard-coded to the number of subgroups in sl2 and must be
# updated by hand if the loaded list changes.
covered_by_sl2 = sl2.get_subgroup_bitarray_of_positives(0) | sl2.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 7):
    covered_by_sl2 = covered_by_sl2 | sl2.get_subgroup_bitarray_of_positives(subgroup_index) | sl2.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl2.
not_covered_by_sl2 = sl2.default_rule_bitarray_of_positives | sl2.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl2 | not_covered_by_sl2).count(1) == len(df)
assert (covered_by_sl2 | not_covered_by_sl2).count(0) == 0

In [17]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..5 of sl3.
# NOTE: the bound 6 is hard-coded to the number of subgroups in sl3 and must be
# updated by hand if the loaded list changes.
covered_by_sl3 = sl3.get_subgroup_bitarray_of_positives(0) | sl3.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 6):
    covered_by_sl3 = covered_by_sl3 | sl3.get_subgroup_bitarray_of_positives(subgroup_index) | sl3.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl3.
not_covered_by_sl3 = sl3.default_rule_bitarray_of_positives | sl3.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl3 | not_covered_by_sl3).count(1) == len(df)
assert (covered_by_sl3 | not_covered_by_sl3).count(0) == 0

In [18]:
# One 0/1 indicator column per subgroup list: whether the instance is covered by that list.
df_with_covered = pd.DataFrame({
    "covered_by_sl1" : covered_by_sl1.tolist(),
    "covered_by_sl2" : covered_by_sl2.tolist(),
    "covered_by_sl3" : covered_by_sl3.tolist(),
})
df_with_covered

Unnamed: 0,covered_by_sl1,covered_by_sl2,covered_by_sl3
0,0,0,0
1,1,0,0
2,1,0,0
3,1,1,1
4,0,1,1
...,...,...,...
3739,1,0,0
3740,0,0,0
3741,1,1,1
3742,1,0,1


In [19]:
# Random Forest
# Baseline: original attributes only.
result_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = None,
    classifier = "RandomForest",
)
# Same configuration, with the subgroup-list coverage indicators added as features.
result_X_sl_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = df_with_covered,
    classifier = "RandomForest",
)

In [20]:
# Accuracy.
# Test accuracy without and with the coverage indicator features.
print(f"Original dataset -> Accuracy with the test data: {result_X['test_data_accuracy']!s}")
print(f"Original dataset + subgroup lists -> Accuracy with the test data: {result_X_sl_X['test_data_accuracy']!s}")

Original dataset -> Accuracy with the test data: 0.7295373665480427
Original dataset + subgroup lists -> Accuracy with the test data: 0.7473309608540926


In [21]:
# p-value
# McNemar test over the per-instance correctness (1 = correct) of both classifiers.
# NOTE(review): this pairing presumes both runs produced the same test split
# (same seed and test size were used) -- confirm.
st = pd.DataFrame({
    "data" : result_X["test_score_vector"],
    "data_and_sl" : result_X_sl_X["test_score_vector"],
})
observed, stats = pg.chi2_mcnemar(st, 'data', 'data_and_sl')
print(f"p-exact: {stats['p-exact']!s}")

p-exact: mcnemar    0.044598
Name: p-exact, dtype: float64


In [22]:
# Logistic Regression
# Baseline: original attributes only.
result_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = None,
    classifier = "LogisticRegression",
)
# Same configuration, with the subgroup-list coverage indicators added as features.
result_X_sl_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = df_with_covered,
    classifier = "LogisticRegression",
)

In [23]:
# Accuracy.
# Test accuracy without and with the coverage indicator features.
print(f"Original dataset -> Accuracy with the test data: {result_X['test_data_accuracy']!s}")
print(f"Original dataset + subgroup lists -> Accuracy with the test data: {result_X_sl_X['test_data_accuracy']!s}")

Original dataset -> Accuracy with the test data: 0.6601423487544484
Original dataset + subgroup lists -> Accuracy with the test data: 0.6672597864768683


In [24]:
# p-value
# McNemar test over the per-instance correctness (1 = correct) of both classifiers.
# NOTE(review): this pairing presumes both runs produced the same test split
# (same seed and test size were used) -- confirm.
st = pd.DataFrame({
    "data" : result_X["test_score_vector"],
    "data_and_sl" : result_X_sl_X["test_score_vector"],
})
observed, stats = pg.chi2_mcnemar(st, 'data', 'data_and_sl')
print(f"p-exact: {stats['p-exact']!s}")

p-exact: mcnemar    0.548502
Name: p-exact, dtype: float64


## 2.2. Best diverse top-k phenotypes according to the coverage

In [25]:
# Load the subgroup lists used in this section and score every subgroup with WRAcc.
list_of_subgroup_lists, quality, coverage = load_diverse_topk_subgroup_lists(
    dataset = df,
    TP = TP,
    FP = FP,
    target_attribute_name = target[0],
    target_attribute_value = target[1],
    diverse_topk_subgroup_lists_file_path = "../phenotypes/output/0.0_0.5_0.6.txt",
    quality_measure = WRAcc(),
)

In [26]:
# First line: quality returned by the loader; second line: coverage returned by the loader.
print(quality, coverage, sep="\n")

0.15482549857549857
0.9746260683760684


In [27]:
# First subgroup list returned by the loader; print its textual description.
sl1 = list_of_subgroup_lists[0]
print(sl1)

## Subgroup list (6 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', icu_when_culture = 'SICU'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 416
	- negative instances covered: 43
	- total instances covered: 459
	Considering it individually:
	- positive instances covered: 416
	- negative instances covered: 43
	- total instances covered: 459
s2: Description: [culture_specimen_type_description = 'URINE', treated_with_vancomycin_in_the_current_admission_before_culture = 'yes'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 304
	- negative instances covered: 82
	- total instances covered: 386
	Considering it individually:
	- positive instances covered: 304
	- negative instances covered: 82
	- total instances covered: 386
s3: Description: [organ_transplant_in_previ

In [28]:
# Second subgroup list returned by the loader; print its textual description.
sl2 = list_of_subgroup_lists[1]
print(sl2)

## Subgroup list (7 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', previous_colonization_with_stenotrophomonas_maltophilia = 'no'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 674
	- negative instances covered: 215
	- total instances covered: 889
	Considering it individually:
	- positive instances covered: 674
	- negative instances covered: 215
	- total instances covered: 889
s2: Description: [culture_specimen_type_description = 'URINE', treated_with_vancomycin_in_previous_admissions = 'yes'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 165
	- negative instances covered: 43
	- total instances covered: 208
	Considering it individually:
	- positive instances covered: 165
	- negative instances covered: 43
	- total instances covered: 208
s3: Description: [org

In [29]:
# Third subgroup list returned by the loader; print its textual description.
sl3 = list_of_subgroup_lists[2]
print(sl3)

## Subgroup list (6 subgroups) ##
s1: Description: [culture_specimen_type_description = 'SWAB', hematopoietic_transplant_in_previous_admissions = 'no'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 679
	- negative instances covered: 220
	- total instances covered: 899
	Considering it individually:
	- positive instances covered: 679
	- negative instances covered: 220
	- total instances covered: 899
s2: Description: [culture_specimen_type_description = 'URINE', previous_exposure_to_antifungal_agents = 'yes'], Target: culture_microorganism_name_AND_susceptibility = 'ENTEROCOCCUS_SP.-R'
	Considering its position in the list:
	- positive instances covered: 105
	- negative instances covered: 23
	- total instances covered: 128
	Considering it individually:
	- positive instances covered: 105
	- negative instances covered: 23
	- total instances covered: 128
s3: Description: [organ_transplant_in

In [30]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..5 of sl1.
# NOTE: the bound 6 is hard-coded to the number of subgroups in sl1 and must be
# updated by hand if the loaded list changes.
covered_by_sl1 = sl1.get_subgroup_bitarray_of_positives(0) | sl1.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 6):
    covered_by_sl1 = covered_by_sl1 | sl1.get_subgroup_bitarray_of_positives(subgroup_index) | sl1.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl1.
not_covered_by_sl1 = sl1.default_rule_bitarray_of_positives | sl1.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl1 | not_covered_by_sl1).count(1) == len(df)
assert (covered_by_sl1 | not_covered_by_sl1).count(0) == 0

In [31]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..6 of sl2.
# NOTE: the bound 7 is hard-coded to the number of subgroups in sl2 and must be
# updated by hand if the loaded list changes.
covered_by_sl2 = sl2.get_subgroup_bitarray_of_positives(0) | sl2.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 7):
    covered_by_sl2 = covered_by_sl2 | sl2.get_subgroup_bitarray_of_positives(subgroup_index) | sl2.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl2.
not_covered_by_sl2 = sl2.default_rule_bitarray_of_positives | sl2.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl2 | not_covered_by_sl2).count(1) == len(df)
assert (covered_by_sl2 | not_covered_by_sl2).count(0) == 0

In [32]:
# OR together the positive and negative bitarrays of the subgroups at positions 0..5 of sl3.
# NOTE: the bound 6 is hard-coded to the number of subgroups in sl3 and must be
# updated by hand if the loaded list changes.
covered_by_sl3 = sl3.get_subgroup_bitarray_of_positives(0) | sl3.get_subgroup_bitarray_of_negatives(0)
for subgroup_index in range(1, 6):
    covered_by_sl3 = covered_by_sl3 | sl3.get_subgroup_bitarray_of_positives(subgroup_index) | sl3.get_subgroup_bitarray_of_negatives(subgroup_index)
# Instances that fall into the default rule of sl3.
not_covered_by_sl3 = sl3.default_rule_bitarray_of_positives | sl3.default_rule_bitarray_of_negatives
# Sanity check: subgroups plus default rule must account for every instance.
assert (covered_by_sl3 | not_covered_by_sl3).count(1) == len(df)
assert (covered_by_sl3 | not_covered_by_sl3).count(0) == 0

In [33]:
# One 0/1 indicator column per subgroup list: whether the instance is covered by that list.
df_with_covered = pd.DataFrame({
    "covered_by_sl1" : covered_by_sl1.tolist(),
    "covered_by_sl2" : covered_by_sl2.tolist(),
    "covered_by_sl3" : covered_by_sl3.tolist(),
})
df_with_covered

Unnamed: 0,covered_by_sl1,covered_by_sl2,covered_by_sl3
0,0,1,0
1,1,1,0
2,1,1,0
3,1,1,1
4,0,1,1
...,...,...,...
3739,1,1,1
3740,1,0,0
3741,1,0,1
3742,0,1,1


In [34]:
# Random Forest
# Baseline: original attributes only.
result_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = None,
    classifier = "RandomForest",
)
# Same configuration, with the subgroup-list coverage indicators added as features.
result_X_sl_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = df_with_covered,
    classifier = "RandomForest",
)

In [35]:
# Accuracy.
# Test accuracy without and with the coverage indicator features.
print(f"Original dataset -> Accuracy with the test data: {result_X['test_data_accuracy']!s}")
print(f"Original dataset + subgroup lists -> Accuracy with the test data: {result_X_sl_X['test_data_accuracy']!s}")

Original dataset -> Accuracy with the test data: 0.7295373665480427
Original dataset + subgroup lists -> Accuracy with the test data: 0.751779359430605


In [36]:
# p-value
# McNemar test over the per-instance correctness (1 = correct) of both classifiers.
# NOTE(review): this pairing presumes both runs produced the same test split
# (same seed and test size were used) -- confirm.
st = pd.DataFrame({
    "data" : result_X["test_score_vector"],
    "data_and_sl" : result_X_sl_X["test_score_vector"],
})
observed, stats = pg.chi2_mcnemar(st, 'data', 'data_and_sl')
print(f"p-exact: {stats['p-exact']!s}")

p-exact: mcnemar    0.005871
Name: p-exact, dtype: float64


In [37]:
# Logistic Regression
# Baseline: original attributes only.
result_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = None,
    classifier = "LogisticRegression",
)
# Same configuration, with the subgroup-list coverage indicators added as features.
result_X_sl_X = generateClassifier(
    input_nominal_dataset = df,
    target_param = target,
    n_param = 100,
    test_size_param = 0.3,
    random_seed_param = 100,
    numerical_dataset_to_join = df_with_covered,
    classifier = "LogisticRegression",
)

In [38]:
# Accuracy.
# Test accuracy without and with the coverage indicator features.
print(f"Original dataset -> Accuracy with the test data: {result_X['test_data_accuracy']!s}")
print(f"Original dataset + subgroup lists -> Accuracy with the test data: {result_X_sl_X['test_data_accuracy']!s}")

Original dataset -> Accuracy with the test data: 0.6601423487544484
Original dataset + subgroup lists -> Accuracy with the test data: 0.6806049822064056


In [39]:
# p-value
# McNemar test over the per-instance correctness (1 = correct) of both classifiers.
# NOTE(review): this pairing presumes both runs produced the same test split
# (same seed and test size were used) -- confirm.
st = pd.DataFrame({
    "data" : result_X["test_score_vector"],
    "data_and_sl" : result_X_sl_X["test_score_vector"],
})
observed, stats = pg.chi2_mcnemar(st, 'data', 'data_and_sl')
print(f"p-exact: {stats['p-exact']!s}")

p-exact: mcnemar    0.020584
Name: p-exact, dtype: float64
