# Covid Rules Lab


implement the PRISM algorithm to extract the classification rules with the highest accuracy and coverage from the hospital patients dataset 

In [199]:
import random

In [200]:
def count_labels(rows):
    label_count = {}
    for row in rows:
        # The class label is in the last column
        label = row[- 1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    return label_count
# here we will get a dictionary 
# which {(label,3),(label,4),...}
# such that label is the different names of attributes like: red, blue, green and the number is the occurence 

In [201]:
def split(rows, column, value):
    # define split function according to the value type
    split_function = None
    if isinstance(value, int) or isinstance(value, float):
        split_function = lambda row: row[column] >= value
    else:
        split_function = lambda row: row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)

# divide rows according to attributes
# if the attribute is a number, divide the rows by >= number and == number
# if the attribute is not a number, divide by rows == or not ==

In [202]:
def coverage(rows, column, value):
    """
    rows: all the rows left
    column: the specific column where value belongs to
    value: one trait that we are evaluating like 'red'
    """
    (set1, set2) = split(rows, column, value)
    coverage = len(set1) #rows that has this sepcific value like 'red'
    # and the specific attribute
    return coverage

In [203]:
def accuracy(rows, column, value, attribute):
    """
    rows: all the rows left
    column: the specific column where value belongs to
    value: one trait that we are evaluating like 'red'
    attribute: like 'T', 'F'
    """
    (set1, set2) = split(rows, column, value)
    total = len(set1) #rows that has this sepcific value like 'red'
    true_rows = len(set1[set1[-1]==attribute]) #number of rows that has this specific condition like 'red' 
    # and the specific attribute
    
    accuracy = true_rows/total
    return accuracy

In [204]:
class Rule:
    def __init__(self, accuracy = None, coverage = None, rows_after = None, covered_rows=None, rule=None):
        self.accuracy = accuracy
        self.coverage = coverage
        self.cover = covered_rows # the rows that are covered by this rule
        self.rule = rule

In [214]:
def generate_rule(rows, score_func1=accuracy, score_func2=coverage, accu_thresh=0, cover_thresh=0, rules={}, col_selected=[], label_input = None):
    if len(rows) == 0:
        return Rule(accuracy=best_ccuracy, coverage=best_cover, covered_rows=set1, rule=rule)
    if label_input == None:
        labels = count_labels(rows)
    else:
        labels={}
        labels[label] = None
    best_accu = 0
    best_cover = 0
    best_rule = None

    for label, count in labels.items():
        column_count = len(rows[0]) - 1
        for col in range(0, column_count):
            if col not in col_selected and rows[col]!= rows[-1]:
                column_values = set()
                for row in rows:
                    column_values.add(row[col])

                # test each unique value in this column as a rule
                for value in column_values:

                    # Evaluate the quality of the rule
                    accuracy = score_func1(rows, col, value, label)
                    coverage = score_func2(rows, col, value)

                    if coverage > cover_thresh and accuracy > best_accu:
                        best_accu = accuracy
                        best_cover = coverage
                        best_rule = (col, value)
                    elif coverage > cover_thresh and accuracy == best_accu and coverage > best_cover:
                        best_accu = accuracy
                        best_cover = coverage
                        best_rule = (col, value)
                    elif coverage > cover_thresh and accuracy == best_accu and coverage == best_cover:
                        random_choice = random.choice([0, 1])
                        if random_choice == 1:
                            best_accu = accuracy
                            best_cover = coverage
                            best_rule = (col, value)
                            rule_label = label

    # use recursive algorithm to add criterion so that the accuracy can increase    
    if best_rule != None and best_accu < accu_thresh:
        (set1, set2) = split(rows, best_rule[0], best_rule[1])
        covered_set = set1
        col_selected += [best_rule[0]]
        rules[best_rule[0]]=best_rule[1]
        return generate_rule(set1, score_func1, score_func2, accu_thresh, cover_thresh, rules, col_selected, label_input = rule_label)

    elif best_rule != None:  # Done for this rule    
        (set1, set2) = split(rows, best_rule[0], best_rule[1])
        new_rule = Rule(accuracy=best_accu, coverage=best_cover, covered_rows=set1, rule=rules)
        print("Rule:", best_rule)
        return new_rule
    
    else:
        print("If no rule printed, threshold is too high. Please lower the threshold")
        print("no available rules left")
        return 




In [215]:
data_file = "/Users/elaine/Desktop/ML2020labs/covid_categorical_good.csv"

In [216]:
import pandas as pd
data = pd.read_csv(data_file)
data = data.dropna(how="any")
data.columns
data

Unnamed: 0,sex,age,diabetes,copd,asthma,imm_supr,hypertension,cardiovascular,obesity,renal_chronic,tobacco,outcome
0,male,27,no,no,no,no,no,no,no,no,no,alive
1,male,24,no,no,no,no,no,no,no,no,no,alive
2,female,54,no,no,no,no,no,no,yes,no,no,alive
3,male,30,no,no,no,no,no,no,no,no,no,alive
4,female,60,yes,no,no,no,yes,yes,no,no,no,dead
...,...,...,...,...,...,...,...,...,...,...,...,...
219174,female,88,yes,no,no,no,yes,no,no,no,no,dead
219175,female,30,no,no,no,no,no,no,no,no,no,alive
219176,female,27,no,no,no,no,no,no,no,no,no,alive
219177,female,36,no,no,yes,no,no,no,no,no,no,alive


In [217]:
data_rows = data.to_numpy().tolist()
data_rows = data_rows[0:5000]
len(data_rows)

5000

In [218]:
columns_list = data.columns.to_numpy().tolist()
print(columns_list)

['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome']


In [221]:
def recursive_rules(rows, generate_rule):
    while len(rows) != 0:
        rule = generate_rule(data_rows, score_func1=accuracy, score_func2=coverage, accu_thresh=0.8, cover_thresh=100, rules={}, col_selected = [], label_input=None)
        if rule == None:
            break
        else:
            for row in rule.cover:
                rows.remove(row)
        
            
    


In [222]:
# call the function
recursive_rules(data_rows, generate_rule)

UnboundLocalError: local variable 'rule_label' referenced before assignment