In [None]:
def count_labels(rows):
    """Count how many rows carry each distinct value of the "outcome" column.

    rows: DataFrame that contains an "outcome" column
    returns: dict mapping each distinct outcome value (in order of first
             appearance) to the number of rows having that value
    """
    outcomes = rows["outcome"]
    return {name: rows[outcomes == name].shape[0] for name in outcomes.unique()}

# count_labels returns a dictionary of the form {label: count, ...},
# for example {'red': 3, 'blue': 4},
# where each label is one of the distinct values of the "outcome" column (like red, blue, green) and the count is its number of occurrences

In [None]:
def split(rows, column, value):
    """Return the rows that satisfy every (column, value) condition.

    rows: DataFrame to filter; it is never modified
    column: list of column names, or None to get ``rows`` back unfiltered
    value: list of values, paired with ``column`` by position

    A condition on an integer or float column keeps the rows whose entry is
    greater than or equal to the value; a condition on any other column keeps
    the rows whose entry equals the value.  Conditions are applied one after
    another, so the result satisfies all of them.  With an empty ``column``
    list a copy of ``rows`` is returned.
    """
    if column is None:
        return rows

    temp_rows = rows.copy()
    for col, val in zip(column, value):
        # Decide by the column's dtype: testing the column object itself with
        # isinstance(..., int/float) is always False for a pandas Series, which
        # silently disabled the numeric (>=) comparison.
        if rows[col].dtype.kind in "iuf":
            temp_rows = temp_rows[temp_rows[col] >= val]
        else:
            temp_rows = temp_rows[temp_rows[col] == val]
    return temp_rows

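# split filters the rows by a list of (attribute, value) conditions, applied one after another
# if the attribute is a number, the condition is meant to keep the rows where attribute >= value
# if the attribute is not a number, the condition keeps the rows where attribute == value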

In [None]:
def coverage(rows, column, value):
    """Return how many rows satisfy the given conditions.

    rows: all the rows left
    column: list of column names the conditions apply to
    value: list of values, one per column, e.g. ['red']
    returns: number of rows that ``split`` keeps for these conditions
    """
    matched = split(rows, column, value)
    return matched.shape[0]

In [None]:
def cal_accuracy(rows, column, value, c, attribute):
    """Return the share of condition-matching rows whose class equals ``c``.

    rows: all the rows left
    column: list of column names the conditions apply to
    value: list of values, one per column, e.g. ['red']
    c: the class label being predicted, e.g. 'T' or 'F'
    attribute: name of the column that holds the class labels
    returns: matching rows labelled ``c`` divided by all matching rows,
             or 0 when no matching row is labelled ``c``
    """
    matched = split(rows, column, value)

    hits = matched[matched[attribute] == c].shape[0]
    # Returning early also covers the case where nothing matched at all,
    # so the division below never sees a zero denominator.
    if hits == 0:
        return 0

    return hits / matched.shape[0]

In [None]:
class Rule:
    """A candidate rule: a list of (attribute, value) conditions plus its scores."""

    def __init__(self, accuracy=None, attributes=None, covered=None, value=None, label=None):
        # What the rule tests and what it predicts.
        self.attributes = attributes  # column names the conditions apply to
        self.value = value            # values paired with attributes, e.g. [0, 'female']
        self.label = label            # class label of the rule, like 'yes' or 'no'
        # How good the rule is.
        self.accuracy = accuracy
        self.covered = covered        # coverage of the rule (the rows it covers)

In [None]:
def _pick_best(rules, cover_thresh):
    """Return the preferred rule from the non-empty list ``rules``.

    The scan starts from ``rules[0]``.  A later rule replaces the current
    choice when it covers at least ``cover_thresh`` rows and has either a
    higher accuracy, or the same accuracy with at least as much coverage
    (so ties go to the later rule).  ``rules[0]`` itself is not checked
    against ``cover_thresh``.
    """
    best = rules[0]
    for rule in rules:
        if not (rule.accuracy >= best.accuracy and rule.covered >= cover_thresh):
            continue
        if rule.accuracy == best.accuracy and rule.covered < best.covered:
            continue
        best = rule
    return best


def generate_rule(rows, columns_list, score_func1=cal_accuracy, score_func2=coverage, accu_thresh=0, cover_thresh=0):
    """Build candidate rules for every label and return the best one.

    rows: the rows still to be explained
    columns_list: column names; the last entry is the class column
    score_func1: accuracy function, called as (rows, attributes, values, label, class_column)
    score_func2: coverage function, called as (rows, attributes, values)
    accu_thresh: a label's rule keeps being extended while its accuracy is below this
    cover_thresh: minimum coverage a rule needs to be preferred over the current best

    returns: the selected Rule, or None when ``rows`` is empty or no rule is available

    For each label the search starts from the rule without conditions and
    repeatedly adds one (column, value) condition, keeping every candidate in
    one shared pool.  The pool is shared across labels, so the rule adopted
    while working on one label may have been created for another.
    NOTE(review): the labels come from count_labels, which reads the "outcome"
    column, while the class column used for scoring is columns_list[-1];
    these are presumably the same column - confirm.
    """
    if len(rows) == 0:
        print("Done!")
        return

    target = columns_list[-1]
    all_rules = []
    for label in count_labels(rows):
        # Rule without conditions: it covers every row and its accuracy is the
        # share of rows carrying this label.
        base = Rule(attributes=[], value=[], label=label)
        base.accuracy = score_func1(rows, base.attributes, base.value, label, target)
        base.covered = score_func2(rows, base.attributes, base.value)
        all_rules.append(base)

        R = base
        remainder_cols = list(columns_list[:-1])
        # Candidate values are taken only from rows that carry this label.
        rows_copy = rows[rows[target] == label].copy()
        while R.accuracy < accu_thresh:
            before = (R.attributes, R.value, R.label)

            # Extend the current rule by one condition in every possible way.
            for col in remainder_cols:
                for val in rows_copy[col].unique():
                    candidate = Rule(attributes=R.attributes + [col], value=R.value + [val], label=label)
                    candidate.accuracy = score_func1(rows, candidate.attributes, candidate.value, label, target)
                    candidate.covered = score_func2(rows, candidate.attributes, candidate.value)
                    all_rules.append(candidate)

            # Coverage was stored on every rule when it was created, so the
            # selection reuses it instead of filtering the frame again.
            best_rule = _pick_best(all_rules, cover_thresh)
            if best_rule.accuracy > R.accuracy or (
                best_rule.accuracy == R.accuracy and best_rule.covered >= R.covered
            ):
                R = best_rule

            # A column that has just been used may not be used again.
            removed = bool(R.attributes) and R.attributes[-1] in remainder_cols
            if removed:
                remainder_cols.remove(R.attributes[-1])

            if not remainder_cols:
                break
            # No column was consumed and the rule is unchanged: the next pass
            # would rebuild exactly the same candidates and select the same
            # rule again, so the loop would never end.
            if not removed and (R.attributes, R.value, R.label) == before:
                break

    if not all_rules:
        print("threshold is too high or no available rules left")
        return None

    # Final choice over the whole pool, using the same preference as inside
    # the loop (the candidate's own coverage is what must reach cover_thresh).
    best_rule = _pick_best(all_rules, cover_thresh)
    if best_rule.accuracy is not None:
        return best_rule

    print("threshold is too high or no available rules left")
    return None

In [None]:
import pandas as pd

# Absolute path of the input CSV file.
data_file = "/Users/elaine/Desktop/ML2020labs/covid_categorical_good.csv"

# Load the table and discard every row that has at least one missing value.
data = pd.read_csv(data_file).dropna(how="any")

# Column names as a plain list; generate_rule treats the last entry as the class column.
columns_list = data.columns.tolist()

def recursive_rules(rows, generate_rule):
    """Learn rules one after another, removing the rows each rule covers.

    rows: DataFrame to learn from; its last column is passed on as the class column
    generate_rule: function that returns the next Rule for the remaining rows,
                   or None when it has nothing to offer

    Prints the accuracy and label of every learned rule.  The loop ends when
    no rows are left, when ``generate_rule`` returns None, or when a rule
    fails to remove any row.
    """
    columns = rows.columns.tolist()
    while len(rows) != 0:
        one_rule = generate_rule(rows, columns, score_func1=cal_accuracy, score_func2=coverage, accu_thresh=0.8, cover_thresh=5)
        if one_rule is None:
            break

        print(one_rule.accuracy, one_rule.label)

        # Drop the covered rows from the rows that are still left, not from the
        # full dataset; otherwise rows removed by earlier rules come back.
        covered = split(rows, one_rule.attributes, one_rule.value)
        remaining = rows[~rows.index.isin(covered.index)]
        if len(remaining) == len(rows):
            # The rule covers nothing that is left, so another round would
            # produce the same rule again.
            break
        rows = remaining


In [None]:
# Run the rule learner on the cleaned dataset; the accuracy and label of each learned rule are printed.
recursive_rules(data, generate_rule)