In [1]:
import pandas as pd
import numpy as np
import json
import copy
from sklearn.metrics import precision_recall_fscore_support

from google.colab import drive
drive.mount("/content/drive")

# Replace with correct path
path = "/content/drive/MyDrive/CSC 466 Project/stroke.csv"

df = pd.read_csv(path)
df.dropna(inplace=True)
df.drop(columns=['id', 'work_type'], inplace=True)
df = df[df['smoking_status'] != 'Unknown']
df.reset_index(inplace=True, drop=True)
df.head()

Mounted at /content/drive


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Urban,186.21,29.0,formerly smoked,1


In [2]:
df_binned = df.copy()

df_binned['age'] = pd.cut(df_binned['age'],bins=20).astype(str)
df_binned['avg_glucose_level'] = pd.cut(df_binned['avg_glucose_level'],bins=20).astype(str)
df_binned['bmi'] = pd.cut(df_binned['bmi'],bins=20).astype(str)


df_numeric = df.copy()

for col in df_numeric.columns:
    if df_numeric[col].dtype != 'object':
        continue
    if col == "smoking_status":
      mapping = {"never smoked": 0, "formerly smoked": 1, "smokes": 2}
    else:
      mapping = {category: i for i, category in enumerate(df_numeric[col].unique())}
    df_numeric[col] = df_numeric[col].map(mapping)

## Binned Tree

In [3]:
X = df_binned.drop(columns=['stroke'])
y = df_binned['stroke']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def entropy(y):
    vals = y.value_counts(normalize=True).values
    e = 0
    for val in vals:
        e -= val*np.log2(val)
    return e


def gain(y,x):
    g = 0
    vals = x.value_counts(normalize=True)
    for f, p in vals.items():
        subset = y[x == f]
        g += p*entropy(subset)
    return entropy(y) - g


def gain_ratio(y,x):
    g = gain(y, x)
    return g/entropy(y)


def select_split(X,y):
    col_gr = {}
    for column in X.columns:
        col_gr[column] = gain_ratio(y, X[column])
    col, gr = max(col_gr.items(), key=lambda x: x[1])
    return col,gr


def make_tree(X,y, min_split_count=0):
    tree = {}
    # Your solution here
    if len(set(y)) == 1:
        return int(y.iloc[0])
    elif (X.shape[1] <= 1) or (X.shape[0] < min_split_count):
        return int(y.value_counts().index[0])
    else:
        f = select_split(X, y)[0]
        possible_vals = pd.unique(X[f].values)
        temp = {}
        for val in possible_vals:
            X_new = X.loc[X[f] == val].drop(f, axis=1)
            y_new = y.loc[X[f] == val]
            temp[val] = make_tree(X_new, y_new, min_split_count)
        tree[f] = temp
        return tree


def print_tree(tree):
    mytree = copy.deepcopy(tree)
    def fix_keys(tree):
        if type(tree) != dict:
            return tree #int(tree)
        new_tree = {}
        for key in list(tree.keys()):
            if type(key) == np.int64:
                new_tree[int(key)] = tree[key]
            else:
                new_tree[key] = tree[key]
        for key in new_tree.keys():
            new_tree[key] = fix_keys(new_tree[key])
        return new_tree
    mytree = fix_keys(mytree)
    print(json.dumps(mytree, indent=4, sort_keys=True))


def generate_rules(tree):
    rules = []
    if isinstance(tree, int):
        rules.append([tree])
    else:
        key = next(iter(tree))
        for instance, subtree in tree[key].items():
            sub_rules = generate_rules(subtree)
            for rule in sub_rules:
                rules.append([(key, instance)] + rule)
    return rules


def make_prediction(rules,x,default):
    for rule in rules:
        matches = True
        for element in rule:
            if isinstance(element, int):
                return element
            condition = element[0]
            if "<" in condition: # numeric
                col, val = condition.split("<")
                boolean = eval(element[1])
                if (x[col] < float(val)) != boolean:
                    matches = False
                    break
            else: # bins
                col, val = element
                if str(x[col]) != str(val):
                    matches = False
                    break
        if not matches:
            continue
    return default

In [5]:
tree = make_tree(X_train, y_train, min_split_count=10)
print_tree(tree)

{
    "age": {
        "(13.6, 17.2]": 0,
        "(17.2, 20.8]": 0,
        "(20.8, 24.4]": 0,
        "(24.4, 28.0]": 0,
        "(28.0, 31.6]": 0,
        "(31.6, 35.2]": {
            "avg_glucose_level": {
                "(109.275, 120.106]": 0,
                "(120.106, 130.937]": 0,
                "(130.937, 141.768]": 0,
                "(141.768, 152.599]": 0,
                "(152.599, 163.43]": 0,
                "(174.261, 185.092]": 0,
                "(195.923, 206.754]": 0,
                "(228.416, 239.247]": 0,
                "(239.247, 250.078]": 0,
                "(54.903, 65.951]": 0,
                "(65.951, 76.782]": {
                    "bmi": {
                        "(19.55, 23.575]": 0,
                        "(23.575, 27.6]": 0,
                        "(27.6, 31.625]": 0,
                        "(31.625, 35.65]": 0,
                        "(35.65, 39.675]": 0,
                        "(39.675, 43.7]": 0,
                        "(43.7, 47.725]": 

In [6]:
rules = generate_rules(tree)

default = 0
preds = X_test.apply(lambda x: make_prediction(rules, x, default), axis=1)
(preds == y_test).mean()

0.9052478134110787

In [7]:
precision, recall, f1_score, support = precision_recall_fscore_support(y_test, preds)
metrics = pd.DataFrame({"precision": precision, "recall": recall, "f1_score": f1_score, "support": support})
metrics


Unnamed: 0,precision,recall,f1_score,support
0,0.939117,0.961059,0.949962,642
1,0.137931,0.090909,0.109589,44


In [8]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, preds)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,617,25
1,40,4


## Numeric

In [9]:
X = df_numeric.drop(columns=['stroke'])
y = df_numeric['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
def split_col(x, y):
    counts = x.value_counts().sort_index()
    labels = list(counts.keys())
    gr = -1
    c = None
    for i in range(len(labels)-1):
        split = (labels[i]+labels[i+1])/2
        x_bool = x.apply(lambda x: True if x < split else False)
        new_gr = gain_ratio(y, x_bool)
        if new_gr > gr:
            gr = new_gr
            c = split
    return gr, c


def select_split2(X,y):
    col_gr = {}
    for column in X.columns:
        col_gr[column] = split_col(X[column], y)
    newname, gr = max(col_gr.items(), key=lambda x: x[1][0])
    return newname, gr[1], gr[0]


def make_tree2(X,y,min_split_count=5):
    tree = {}
    if len(set(y)) == 1:
        return int(y.iloc[0])
    elif (X.shape[1] == 0) or (X.shape[0] < min_split_count):
        return int(y.value_counts().index[0])
    else:
        split_vals = select_split2(X, y)
        column = split_vals[0]
        split_point = split_vals[1]
        gr = split_vals[2]
        if not split_point:# or gr < 0.001:
            return int(y.value_counts().index[0])
        temp = {}
        # Greater than
        X_new = X.loc[X[column] > split_point].drop(column, axis=1)
        y_new = y.loc[X[column] > split_point]
        temp["False"] = make_tree2(X_new, y_new, min_split_count)
        # Less than
        X_new = X.loc[X[column] < split_point].drop(column, axis=1)
        y_new = y.loc[X[column] < split_point]
        temp["True"] = make_tree2(X_new, y_new, min_split_count)
        # Put into tree
        tree[f"{column}<{split_point:.2f}"] = temp
        return tree

In [11]:
tree = make_tree2(X_train, y_train, min_split_count=5)
# print_tree(tree)

In [12]:
rules = generate_rules(tree)

default = 0
preds = X_test.apply(lambda x: make_prediction(rules, x, default), axis=1)
(preds == y_test).mean()

0.9285714285714286

In [13]:
precision, recall, f1_score, support = precision_recall_fscore_support(y_test, preds)
metrics = pd.DataFrame({"precision": precision, "recall": recall, "f1_score": f1_score, "support": support})
metrics

Unnamed: 0,precision,recall,f1_score,support
0,0.937962,0.989097,0.962851,642
1,0.222222,0.045455,0.075472,44


In [14]:
cm = confusion_matrix(y_test, preds)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,635,7
1,42,2


## SciKit Learn Model

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import LabelEncoder

X = df_numeric.drop(columns=['stroke'])
y = df_numeric['stroke']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


clf = DecisionTreeClassifier(min_samples_split=5)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# accuracy = accuracy_score(y_test, pred)
# print("Accuracy:", accuracy)

precision, recall, f1_score, support = precision_recall_fscore_support(y_test, preds)
metrics = pd.DataFrame({"precision": precision, "recall": recall, "f1_score": f1_score, "support": support})
metrics

Unnamed: 0,precision,recall,f1_score,support
0,0.945736,0.950156,0.947941,642
1,0.219512,0.204545,0.211765,44


In [16]:
from sklearn.tree import export_text

print(export_text(clf, feature_names=list(X.columns)))

|--- age <= 67.50
|   |--- age <= 44.50
|   |   |--- avg_glucose_level <= 58.11
|   |   |   |--- avg_glucose_level <= 58.05
|   |   |   |   |--- class: 0
|   |   |   |--- avg_glucose_level >  58.05
|   |   |   |   |--- class: 1
|   |   |--- avg_glucose_level >  58.11
|   |   |   |--- avg_glucose_level <= 83.25
|   |   |   |   |--- avg_glucose_level <= 83.19
|   |   |   |   |   |--- avg_glucose_level <= 82.26
|   |   |   |   |   |   |--- smoking_status <= 1.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- smoking_status >  1.50
|   |   |   |   |   |   |   |--- avg_glucose_level <= 75.81
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- avg_glucose_level >  75.81
|   |   |   |   |   |   |   |   |--- avg_glucose_level <= 76.23
|   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- avg_glucose_level >  76.23
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- avg_glucose_level >  82

In [17]:
cm = confusion_matrix(y_test, preds)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,610,32
1,35,9
