# Covid Rules Lab


implement the PRISM algorithm to extract the classification rules with the highest accuracy and coverage from the hospital patients dataset 

In [118]:
import random

In [119]:
def count_labels(rows):
    label_count = {}
    for row in rows:
        # The class label is in the last column
        label = row[- 1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    return label_count
# here we will get a dictionary 
# which {(label,3),(label,4),...}
# such that label is the different names of attributes like: red, blue, green and the number is the occurence 

In [120]:
def split(rows, column, value):
    # define split function according to the value type
    split_function = None
    if isinstance(value, int) or isinstance(value, float):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)

# divide rows according to attributes
# if the attribute is a number, divide the rows by >= number and == number
# if the attribute is not a number, divide by rows == or not ==

In [121]:
def coverage(rows, column, value):
    """
    rows: all the rows left
    column: the specific column where value belongs to
    value: one trait that we are evaluating like 'red'
    """
    (set1, set2) = split(rows, column, value)
    coverage = len(set1) #rows that has this sepcific value like 'red'
    # and the specific attribute
    return coverage

In [122]:
def accuracy(rows, column, value, attribute):
    """
    rows: all the rows left
    column: the specific column where value belongs to
    value: one trait that we are evaluating like 'red'
    attribute: like 'T', 'F'
    """
    (set1, set2) = split(rows, column, value)
    true_rows = []
    for row in set1:
        if row[-1]==attribute:
            true_rows+=[row]
    total = len(set1) #rows that has this sepcific value like 'red'
    num_true = len(true_rows) #number of rows that has this specific condition like 'red' 
    # and the specific attribute
    accuracy = num_true/total
    return accuracy

In [123]:
class Rule:
    def __init__(self, accuracy = None, coverage = None, covered_rows=None, rule=None, label=None):
        self.accuracy = accuracy
        self.coverage = coverage
        self.covered_rows = covered_rows # the rows that are covered by this rule
        self.rule = rule #format as (0, 'female')
        self.label = label #the labels of each sub-rules. like ['yes','no',...]

In [146]:
def generate_rule(rows, score_func1=accuracy, score_func2=coverage, accu_thresh=0, cover_thresh=0):
    if len(rows) == 0:
        return print("Done!")

    the_big_rule = {}  # the big rule (one total rule) like if 'sex' == 'female' and 'age' >= 50, 'alive'
    big_rule_labels = []  # the labels of each sub-rule. like ['yes','no',...]
    best_accu_big = 0
    best_cover_big = 0

    labels = count_labels(rows)
    for label, count in labels.items():
        best_rule_total = {}
        rule_label = None  # the labels of each rule. like 'alive','dead',...
        best_accu = 0
        best_cover = 0
        best_rule = None
        col_already = []  # this indicates the columns that are covered by sub-rules already
        labels = count_labels(rows)
        (rows_copy, other) = split(rows, -1 , label)
        column_count = len(rows_copy[0]) - 1
        for col in range(0, column_count):
            column_values = set()
            for row in rows_copy:
                column_values.add(row[col])

                # test each unique value in this column as a rule
            for value in column_values:
                # Evaluate the quality of the rule
                accuracy = score_func1(rows_copy, col, value, label)
                coverage = score_func2(rows_copy, col, value)

                if coverage > cover_thresh and accuracy > best_accu:
                    best_rule = (col, value)
                elif coverage > cover_thresh and accuracy == best_accu and coverage > best_cover:
                    best_rule = (col, value)
                elif coverage > cover_thresh and accuracy == best_accu and coverage == best_cover:
                    random_choice = random.choice([0, 1])
                    if random_choice == 1:
                        best_rule = (col, value)
        best_accu = accuracy
        best_cover = coverage
        col_already += [best_rule[0]]
        (rows_copy, other) = split(rows_copy, best_rule[0] , best_rule[1])
        best_rule_total[best_rule[0]] = best_rule[1]

        # use recursive algorithm to add criterion so that the accuracy can increase
        while best_accu < accu_thresh:
            best_rule = None
            column_count = len(rows_copy[0]) - 1
            for col in range(0, column_count):
                if col not in col_already:
                    column_values = set()
                    for row in rows_copy:
                        column_values.add(row[col])

                    # test each unique value in this column as a rule
                    for value in column_values:
                        # Evaluate the quality of the rule
                        accuracy = score_func1(rows_copy, col, value, label)
                        coverage = score_func2(rows_copy, col, value)

                        if coverage > cover_thresh and accuracy > best_accu:
                            best_accu = accuracy
                            best_cover = coverage
                            best_rule = (col, value)
                            col_already += [best_rule[0]]
                            rule_label = label
                            best_rule_total[best_rule[0]] = best_rule[1]
                            (rows_copy, other) = split(rows_copy, best_rule[0], best_rule[1])
                        elif coverage > cover_thresh and accuracy == best_accu and coverage > best_cover:
                            best_accu = accuracy
                            best_cover = coverage
                            best_rule = (col, value)
                            rule_label = label
                            col_already += [best_rule[0]]
                            best_rule_total[best_rule[0]] = best_rule[1]
                            (rows_copy, other) = split(rows_copy, best_rule[0], best_rule[1])
                        elif coverage > cover_thresh and accuracy == best_accu and coverage == best_cover:
                            random_choice = random.choice([0, 1])
                            if random_choice == 1:
                                best_accu = accuracy
                                best_cover = coverage
                                best_rule = (col, value)
                                rule_label = label
                                col_already += [best_rule[0]]
                                best_rule_total[best_rule[0]] = best_rule[1]
                                (rows_copy, other) = split(rows_copy, best_rule[0], best_rule[1])

        if best_accu > best_accu_big:
            best_accu_big = best_accu
            best_cover_big = best_cover
            best_rule_total[best_rule[0]]= best_rule[1]
            big_rule_labels = label
            he_big_rule = best_rule_total

        elif best_accu > best_accu_big and best_cover > best_cover_big:
            best_accu_big = best_accu
            best_cover_big = best_cover
            the_big_rule=best_rule_total
            big_rule_labels = label

        elif best_accu == best_accu_big and best_cover == best_cover_big:
            random_choice = random.choice([0, 1])
            if random_choice == 1:
                best_accu_big = best_accu
                best_cover_big = best_cover
                the_big_rule = best_rule_total
                big_rule_labels = label

    if the_big_rule != {}:
        refined_set = rows
        for single_rule in the_big_rule:
            (refined_set, other) = split(refined_set, single_rule[0], single_rule[1])
        new_rule = Rule(accuracy=best_accu_big, coverage=best_cover_big, covered_rows=refined_set, rule=the_big_rule, label= big_rule_label)
        return new_rule

    else:
        print("threshold is too high or no available rules left")
        return None

In [125]:
data_file = "/Users/elaine/Desktop/ML2020labs/covid_categorical_good.csv"

In [126]:
import pandas as pd
data = pd.read_csv(data_file)
data = data.dropna(how="any")
data.columns
data

Unnamed: 0,sex,age,diabetes,copd,asthma,imm_supr,hypertension,cardiovascular,obesity,renal_chronic,tobacco,outcome
0,male,27,no,no,no,no,no,no,no,no,no,alive
1,male,24,no,no,no,no,no,no,no,no,no,alive
2,female,54,no,no,no,no,no,no,yes,no,no,alive
3,male,30,no,no,no,no,no,no,no,no,no,alive
4,female,60,yes,no,no,no,yes,yes,no,no,no,dead
...,...,...,...,...,...,...,...,...,...,...,...,...
219174,female,88,yes,no,no,no,yes,no,no,no,no,dead
219175,female,30,no,no,no,no,no,no,no,no,no,alive
219176,female,27,no,no,no,no,no,no,no,no,no,alive
219177,female,36,no,no,yes,no,no,no,no,no,no,alive


In [127]:
data_rows = data.to_numpy().tolist()
data_rows = data_rows[0:1000]

In [128]:
columns_list = data.columns.to_numpy().tolist()
print(columns_list)

['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome']


In [153]:
def recursive_rules(rows, generate_rule):
    while len(rows) != 0:
        one_rule = generate_rule(data_rows, score_func1=accuracy, score_func2=coverage, accu_thresh=0.5, cover_thresh=10)
        if one_rule == None:
            break
        else:
            print(one_rule.rule)
            for row in one_rule.covered_rows:
                rows.remove(row)


In [154]:
# call the function
recursive_rules(data_rows, generate_rule)

threshold is too high or no available rules left


In [None]:
report
