# Covid Rules Lab


Implement the PRISM algorithm to extract the classification rules with the highest accuracy and coverage from the hospital patients dataset.

In [1]:
def count_labels(rows):
    """Tally how many rows carry each distinct value of the "outcome" column.

    rows: pandas DataFrame that contains an "outcome" column.

    Returns a dict mapping every distinct outcome value to the number of
    rows holding it; keys appear in the order ``unique()`` yields them.
    """
    outcomes = rows["outcome"]
    return {
        label: int((outcomes == label).sum())
        for label in outcomes.unique()
    }
# count_labels returns a dictionary of the form {label: count, ...},
# for example {'red': 3, 'blue': 4, ...},
# where each label is one of the distinct class values (like red, blue, green) and the number is how many rows have it

def split(rows, column, value, numeric_columns=("age",)):
    """Return the rows of ``rows`` that satisfy every (column, value) condition.

    rows: pandas DataFrame to filter.
    column: sequence of column names, or None for "no conditions".
    value: sequence parallel to ``column``; ``value[i]`` is the value tested
        against ``column[i]``.
    numeric_columns: collection of column names treated as numeric. A numeric
        column is matched with ``>= value``; any other column is matched with
        ``== value``. Defaults to ("age",).

    Returns ``rows`` itself when ``column`` is None, a copy of ``rows`` when
    ``column`` is empty, and otherwise a new DataFrame holding only the rows
    that pass all conditions.
    """
    if column is None:
        return rows

    # Boolean indexing already produces a new frame, so the input does not
    # need to be deep-copied before filtering.
    temp_rows = rows
    for i, col in enumerate(column):
        if col in numeric_columns:
            mask = temp_rows[col] >= value[i]
        else:
            mask = temp_rows[col] == value[i]
        temp_rows = temp_rows[mask]

    # With an empty condition list nothing was filtered; hand back a copy so
    # the caller never receives an alias of its own frame in that case.
    return temp_rows.copy() if temp_rows is rows else temp_rows
# split keeps only the rows that satisfy every (attribute, value) condition
# if the attribute is a number (age), the condition is: row value >= number
# if the attribute is not a number, the condition is: row value == value

def coverage(rows, column, value):
    """Count the rows that satisfy a set of conditions.

    rows: all the rows still under consideration
    column: the column names the conditions apply to
    value: the values tested against those columns, e.g. 'red'

    Returns the number of rows left after ``split`` applies the conditions.
    """
    matching = split(rows, column, value)
    return len(matching)

def cal_accuracy(rows, column, value, c, attribute):
    """Fraction of the rows matching the conditions whose class equals ``c``.

    rows: all the rows still under consideration
    column: the column names the conditions apply to
    value: the values tested against those columns, e.g. 'red'
    c: the class value being predicted
    attribute: name of the column that holds the class value

    Returns 0 when no matching row has class ``c`` (this also covers the case
    where no row matches at all); otherwise matching-with-class / matching.
    """
    matched = split(rows, column, value)
    n_correct = int((matched[attribute] == c).sum())
    return n_correct / len(matched) if n_correct else 0

class Rule:
    """Plain record describing one candidate classification rule.

    ``attributes`` and ``value`` are used as parallel lists by the rule
    generator: condition i tests column ``attributes[i]`` against
    ``value[i]``.
    """

    def __init__(self, accuracy = None, attributes = None, covered=None, value = None, label=None):
        # Left-hand side of the rule: tested columns and the values they must match.
        self.attributes = attributes
        self.value = value
        # Right-hand side: the class value this rule predicts.
        self.label = label
        # Scores: accuracy of the rule and the number of rows it covers.
        self.accuracy = accuracy
        self.covered = covered

In [5]:
def generate_rule(rows, columns_list,score_func1=cal_accuracy, score_func2=coverage, accu_thresh=0, cover_thresh=0):
    """Search for the single best rule over ``rows`` (one PRISM step).

    For every class label a rule is grown greedily: starting from an empty
    left-hand side, each pass tries appending one (column, value) condition
    and keeps the best-scoring candidate, until the rule reaches
    ``accu_thresh`` or the columns are exhausted. The best rule across all
    labels is returned.

    rows: pandas DataFrame of the rows not yet covered by earlier rules.
    columns_list: list of column names; the LAST entry is the class column,
        all others are candidate condition columns.
    score_func1: accuracy scorer, called as
        ``score_func1(rows, attributes, values, label, class_column)``.
    score_func2: coverage scorer, called as
        ``score_func2(rows, attributes, values)``.
    accu_thresh: minimum accuracy a rule needs to be accepted.
    cover_thresh: minimum coverage a candidate needs to be considered.

    Returns the best ``Rule`` found, or None when ``rows`` is empty or no
    rule reaches both thresholds.

    NOTE: labels come from ``count_labels``, which reads the "outcome"
    column, while rows are filtered on ``columns_list[-1]``; the two are
    assumed to be the same column.
    """
    if len(rows) == 0:
        print("Done!")
        return None

    target = columns_list[-1]
    labels = count_labels(rows)

    # Placeholder meaning "no rule accepted yet". covered starts at 0 rather
    # than None so that every later coverage comparison is numeric.
    all_label_best_rule = Rule(label=labels, attributes=[], value=[], accuracy=0, covered=0)

    for label in labels:
        # Best candidate seen so far for this label (kept across passes).
        one_label_best_rule = Rule(label=label, attributes=[], value=[], accuracy=0, covered=0)

        # Create a rule R with an empty left-hand side that predicts class `label`.
        R = Rule(label=label, attributes=[], value=[], accuracy=0, covered=0)
        remainder_cols = list(columns_list[:-1])
        # Candidate values are drawn only from rows that already have this label.
        rows_copy = rows[rows[target] == label].copy()
        cycle_time = 0

        while R.accuracy < accu_thresh:
            cycle_time += 1

            # Try extending R by one condition on every remaining column.
            for col in remainder_cols:
                for val in rows_copy[col].unique():
                    new_R = Rule(attributes=R.attributes + [col], value=R.value + [val], label=label)
                    new_R.accuracy = score_func1(rows, new_R.attributes, new_R.value, label, target)
                    new_R.covered = score_func2(rows, new_R.attributes, new_R.value)

                    if new_R.accuracy >= one_label_best_rule.accuracy and new_R.covered >= cover_thresh:
                        # Higher accuracy wins; on equal accuracy, strictly
                        # higher coverage wins.
                        if (new_R.accuracy > one_label_best_rule.accuracy
                                or new_R.covered > one_label_best_rule.covered):
                            one_label_best_rule = new_R

            if not one_label_best_rule.attributes:
                # No candidate met cover_thresh. Nothing changes between
                # passes in that case, so another scan would be identical.
                break

            # Adopt the best candidate if it is at least as good as R.
            adopt = (
                one_label_best_rule.accuracy > R.accuracy
                or (one_label_best_rule.accuracy == R.accuracy
                    and one_label_best_rule.covered >= R.covered)
            )
            if adopt:
                R = one_label_best_rule
                # The column just used must not be tested again.
                if R.attributes and R.attributes[-1] in remainder_cols:
                    remainder_cols.remove(R.attributes[-1])
                if R.accuracy >= accu_thresh and R.covered >= cover_thresh:
                    if R.accuracy > all_label_best_rule.accuracy:
                        all_label_best_rule = R
                    elif R.accuracy == all_label_best_rule.accuracy and R.covered >= all_label_best_rule.covered:
                        all_label_best_rule = R

            # Stop when no columns remain, or after more passes than columns left.
            if len(remainder_cols) == 0 or cycle_time > len(remainder_cols):
                break

    if all_label_best_rule.accuracy != 0:
        return all_label_best_rule
    return None

In [7]:
import pandas as pd

# Path of the CSV dataset on the author's machine.
data_file = "/Users/elaine/Desktop/ML2020labs/covid_categorical_good.csv"

# Read the CSV and discard every row that has at least one missing value.
data = pd.read_csv(data_file).dropna(how="any")

def recursive_rules(rows, generate_rule, accu_thresh=0.8, cover_thresh=30):
    """Repeatedly extract rules from ``rows`` and print each one.

    After a rule is found, the rows it covers are removed and the search
    restarts on what is left. The loop ends when no rows remain or when
    ``generate_rule`` returns None.

    rows: pandas DataFrame; its last column is treated as the class column
        by ``generate_rule``.
    generate_rule: callable with the signature of the module's
        ``generate_rule``; returns a rule object or None.
    accu_thresh: minimum accuracy passed on to ``generate_rule``
        (default 0.8).
    cover_thresh: minimum coverage passed on to ``generate_rule``
        (default 30).

    Prints every rule (conditions, predicted label, coverage, accuracy), or a
    message when not a single rule could be produced. Returns None.
    """
    columns_list = rows.columns.to_numpy().tolist()
    rows_copy = rows.copy()
    found_any = False

    while len(rows_copy) != 0:
        one_rule = generate_rule(
            rows_copy,
            columns_list,
            score_func1=cal_accuracy,
            score_func2=coverage,
            accu_thresh=accu_thresh,
            cover_thresh=cover_thresh,
        )
        if one_rule is None:
            break

        found_any = True
        for attribute, val in zip(one_rule.attributes, one_rule.value):
            print("if ", attribute, val)
        print("then ", one_rule.label)
        print("coverage: ", one_rule.covered)
        print("accuracy: ", one_rule.accuracy)
        print("\n")

        # Drop the rows this rule covers before searching for the next rule.
        rows_best = split(rows_copy, one_rule.attributes, one_rule.value)
        rows_copy = rows_copy[~rows_copy.index.isin(rows_best.index)]

    if not found_any:
        print("threshold is too high or no available rules left")


In [8]:
# Extract and print rules from the loaded dataset until no rows remain or no further rule is returned.
recursive_rules(data, generate_rule)

if  hypertension no
then  alive
coverage:  175108
accuracy:  0.9118543984283984


if  asthma yes
then  alive
coverage:  1447
accuracy:  0.8127159640635798


if  diabetes no
if  sex female
then  alive
coverage:  10678
accuracy:  0.8185989885746394




# report 

These are the rules I got with a coverage threshold of 20 and an accuracy threshold of 0.9:

if  hypertension no
then  alive
coverage:  175108
accuracy:  0.9118543984283984


if  asthma yes
if  imm_supr yes
if  sex female
then  alive
coverage:  63
accuracy:  0.9047619047619048


if  asthma yes
if  diabetes no
if  sex female
if  tobacco yes
then  alive
coverage:  26
accuracy:  0.9615384615384616


if  asthma yes
if  sex female
if  tobacco yes
then  alive
coverage:  22
accuracy:  0.9545454545454546


if  asthma yes
if  diabetes no
if  sex female
if  obesity no
if  cardiovascular no
if  age 71
then  alive
coverage:  23
accuracy:  0.9130434782608695

I found that the rules mostly describe the conditions for the 'alive' result rather than the 'dead' result.
I found that some of the rules match what we know about COVID (if a person has a serious illness, then this person will probably die). For example, the first rule says that if a person doesn't have hypertension, then this person has a higher than 0.9 chance of surviving.
However, I also found a rule saying that if a person has asthma and uses tobacco, this person will probably survive, and this is very strange.


