In [12]:
import pandas as pd
import numpy as np

data = pd.read_excel('COVID_Data_Small.xlsx')
data.head()

Unnamed: 0,Arrival_Date,AgeYears,Temperature,BMI,AvgReading_Neuts_pct,Respiration_Rate,HasConnectiveTissueDisorderFLG,O2_Saturation,MaleFLG,Outcome_48Hours_Dispo,Rand
0,2020-05-28,75.833333,98.0,36.3,75.6,98,1,99,1,0,0.05592
1,2020-03-10,0.916667,102.0,34.6,71.2,70,0,98,0,1,0.112461
2,2020-05-29,61.083333,97.5,25.3,68.4,61,1,99,1,0,0.031409
3,2020-05-11,56.166667,97.9,30.3,96.5,56,0,98,1,1,0.720525
4,2020-06-19,41.916667,97.0,34.1,53.7,56,0,100,0,0,0.047407


In [13]:
numeric_data = data.select_dtypes(include=['number'])

correlation = numeric_data.corr()['Outcome_48Hours_Dispo'].sort_values(ascending=False)
print(correlation)

Outcome_48Hours_Dispo             1.000000
Rand                              0.776004
AvgReading_Neuts_pct              0.245144
Respiration_Rate                  0.170397
HasConnectiveTissueDisorderFLG    0.117970
AgeYears                          0.072519
Temperature                       0.064232
MaleFLG                           0.030917
BMI                               0.001608
O2_Saturation                    -0.138634
Name: Outcome_48Hours_Dispo, dtype: float64


In [14]:
feature_columns = ['AgeYears', 'Temperature', 'BMI', 'AvgReading_Neuts_pct', 'Respiration_Rate',
                   'O2_Saturation', 'MaleFLG']

def find_best_rule(data, target_col, feature_cols):
    best_accuracy = 0
    best_rule = None

    for feature in feature_cols:
        thresholds = data[feature].unique()
        for threshold in thresholds:
            rule = data[feature] > threshold
            accuracy = np.mean(rule == data[target_col])
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_rule = (feature, threshold, '>')

            rule = data[feature] < threshold
            accuracy = np.mean(rule == data[target_col])
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_rule = (feature, threshold, '<')

    return best_rule, best_accuracy

best_rule, best_accuracy = find_best_rule(data, 'Outcome_48Hours_Dispo', feature_columns)
print(f"Best single feature rule: {best_rule} with accuracy: {best_accuracy:.2f}")

Best single feature rule: ('AvgReading_Neuts_pct', 82.5, '>') with accuracy: 0.64


In [15]:
from itertools import combinations

In [16]:
selected_features = ['AvgReading_Neuts_pct', 'Respiration_Rate', 'O2_Saturation']

In [17]:
def find_best_combination_rule(data, target_col, feature_cols):
    best_accuracy = 0
    best_rule = None

    for feature1, feature2 in combinations(feature_cols, 2):
        thresholds1 = data[feature1].unique()
        thresholds2 = data[feature2].unique()

        for threshold1 in thresholds1:
            for threshold2 in thresholds2:
                rule = (data[feature1] > threshold1) & (data[feature2] > threshold2)
                accuracy = np.mean(rule == data[target_col])
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_rule = (feature1, threshold1, '>', feature2, threshold2, '>')

                rule = (data[feature1] < threshold1) & (data[feature2] < threshold2)
                accuracy = np.mean(rule == data[target_col])
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_rule = (feature1, threshold1, '<', feature2, threshold2, '<')

    return best_rule, best_accuracy

best_comb_rule, best_comb_accuracy = find_best_combination_rule(data, 'Outcome_48Hours_Dispo', selected_features)
print(f"Best double rule: {best_comb_rule} with accuracy: {best_comb_accuracy:.2f}")

Best double rule: ('AvgReading_Neuts_pct', 73.6, '>', 'Respiration_Rate', 19, '>') with accuracy: 0.65
