In [None]:
# Complete attribute documentation:

# 3 age: age in years
# 4 sex: sex (1 = male; 0 = female)
# 9 cp: chest pain type
# -- Value 1: typical angina
# -- Value 2: atypical angina
# -- Value 3: non-anginal pain
# -- Value 4: asymptomatic
# 10 trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# 12 chol: serum cholesterol in mg/dl
# 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# 19 restecg: resting electrocardiographic results
# -- Value 0: normal
# -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
# 32 thalach: maximum heart rate achieved
# 38 exang: exercise induced angina (1 = yes; 0 = no)
# 40 oldpeak = ST depression induced by exercise relative to rest
# 41 slope: the slope of the peak exercise ST segment
# -- Value 1: upsloping
# -- Value 2: flat
# -- Value 3: downsloping
# 44 ca: number of major vessels (0-3) colored by fluoroscopy
# 51 thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
# 58 num: diagnosis of heart disease (angiographic disease status)
# -- Value 0: < 50% diameter narrowing
# -- Value 1: > 50% diameter narrowing


In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data


--2022-04-24 15:13:09--  https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18461 (18K) [application/x-httpd-php]
Saving to: ‘processed.cleveland.data.6’


2022-04-24 15:13:09 (647 KB/s) - ‘processed.cleveland.data.6’ saved [18461/18461]



In [None]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.ensemble import RandomTreesEmbedding
# Plot appearance for the whole notebook: seaborn's "darkgrid" style ...
sns.set_style("darkgrid")
# ... with font scaling left at 1.0 and the "lines.linewidth" rc entry set to 2.0
sns.set_context(font_scale=1.0, rc={"lines.linewidth": 2.0})

In [None]:
# Column names are supplied here and the file is read with header=None, i.e. its
# first line is treated as a data record rather than as a header.
column_names = ["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slope","ca","thal","target"]

# header=None keeps the first record as data; with the default header handling that
# record is consumed as column labels and one patient is silently lost.
# na_values="?" converts the '?' placeholders to NaN while parsing, so the affected
# columns come out numeric and the incomplete records can be dropped directly.
dataset = (
    pd.read_csv("/content/processed.cleveland.data", header=None, names=column_names, na_values="?")
    .dropna(axis=0)
    # floor thal to a whole-number float code
    .assign(thal=lambda frame: frame["thal"].map(lambda value: float(value) // 1.0))
)
#dataset.chol = dataset.chol.map(lambda x: 0 if x<240 else 1)

# Features: every column except the last; label: last column with values above 1
# collapsed to 1 (0 and 1 are kept as they are).
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1].map(lambda label: 1 if label > 1 else label)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing; the split is stratified on the label
# and seeded so that it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    stratify=y,
    random_state=4,
)


In [None]:
# Install (shell escape, version not pinned) and import the category_encoders package
!pip install category_encoders
import category_encoders as ce
# One-hot encode the categorical columns. The encoder is fitted on the training split
# only and then applied unchanged to the test split.
encoder = ce.OneHotEncoder(cols=["sex","cp", "fbs","restecg","exang","slope","ca","thal"])
x_train = encoder.fit_transform(x_train)
x_test = encoder.transform(x_test)

# Column labels after encoding (not referenced again in the cells visible here)
cols = x_train.columns
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()
# Numeric columns to standardise; the scaler is fitted on the training split and the
# same fitted scaler is re-used for the test split.
col_sc =  ["age","trestbps","thalach","oldpeak", "chol"]
x_train.loc[:,col_sc] = sc.fit_transform(x_train.loc[:,col_sc])
x_test.loc[:,col_sc] = sc.transform(x_test.loc[:,col_sc])




# **Individual models**

In [None]:
# Results table: one row per model, one column per metric.
# The column order must match the order of the list returned by get_scores
# (accuracy, precision, sensitivity, F1, specificity); rows are filled positionally,
# so listing 'F-score' before 'Sensitivity' would swap the two columns' values.
ind_model_list = pd.DataFrame(index=[], columns=['Acc', 'Precision', 'Sensitivity', 'F-score', 'Specificity'])
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity


In [None]:
# Get scores
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
def get_scores(y_true, y_pred):
    """Return the evaluation metrics of a binary prediction as a list.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        ``[accuracy, precision, sensitivity, F1, specificity]`` in exactly that
        order. Sensitivity is the recall with the default positive label;
        specificity is the recall computed with ``pos_label=0``.
    """
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        recall_score(y_true, y_pred, pos_label=0),
    ]

In [None]:
# Logistics Regression
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model on the training split (fit returns the estimator)
model = LogisticRegression(max_iter=100).fit(x_train, y_train)

# Test / train accuracy; the tuple is not the cell's last expression, so it is not displayed
model.score(x_test, y_test), model.score(x_train, y_train)

# Store the test-set metrics in the results table under the estimator's class name
y_pred = model.predict(x_test)
ind_model_list.loc[type(model).__name__, :] = get_scores(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest. random_state pins the bootstrap samples and feature draws,
# so the scores written to the results table are the same on every run.
model = RandomForestClassifier(n_estimators=50, max_depth=3, min_samples_leaf=4, random_state=0)
model.fit(x_train, y_train)

# Test / train accuracy; not displayed because it is not the cell's last expression
model.score(x_test, y_test), model.score(x_train, y_train)
y_pred = model.predict(x_test)
# add scores into the DF
ind_model_list.loc[model.__class__.__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375


In [None]:
from sklearn.svm import SVC

# Support-vector classifier with probability estimates switched on
model = SVC(probability=True).fit(x_train, y_train)

# Test / train accuracy; the tuple is not the cell's last expression, so it is not displayed
model.score(x_test, y_test), model.score(x_train, y_train)

# Store the test-set metrics in the results table, then show the table
y_pred = model.predict(x_test)
ind_model_list.loc[type(model).__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree. random_state makes the choice between equally good
# candidate splits repeatable, so the reported scores do not change between runs.
model = DecisionTreeClassifier(max_depth=4, random_state=0)
model.fit(x_train, y_train)

# Test / train accuracy; not displayed because it is not the cell's last expression
model.score(x_test, y_test), model.score(x_train, y_train)
y_pred = model.predict(x_test)
# add scores into the DF
ind_model_list.loc[model.__class__.__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875


In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings
model = BernoulliNB().fit(x_train, y_train)

# Test / train accuracy; the tuple is not the cell's last expression, so it is not displayed
model.score(x_test, y_test), model.score(x_train, y_train)

# Store the test-set metrics in the results table, then show the table
y_pred = model.predict(x_test)
ind_model_list.loc[type(model).__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125


In [None]:
%%script false
# NOTE(review): the `%%script false` cell magic hands this cell's text to the shell
# command `false` instead of the Python kernel, so nothing below is executed.
# Confirm that keeping this experiment disabled is still intended.
from sklearn.linear_model import TweedieRegressor

# A regressor whose estimator type is overridden by hand to 'classifier'
model = TweedieRegressor(power=0)
model._estimator_type = 'classifier'
model.fit(x_train, y_train)

model.score(x_test, y_test), model.score(x_train, y_train)
# NOTE(review): a regressor's predict presumably returns continuous values, which
# get_scores' classification metrics may not accept — verify before re-enabling.
y_pred=model.predict(x_test)
# add scores into the DF
ind_model_list.loc[model.__class__.__name__,:]=get_scores(y_test, y_pred)
ind_model_list

In [None]:
# Import statements
import tensorflow as tf
from tensorflow import keras

In [None]:
# Build a model: three ReLU hidden layers of n units, dropout after the second and
# third, and a 2-unit softmax output layer.
n = 10
model = keras.Sequential()
model.add(keras.layers.Dense(n, activation="relu", input_shape=(x_train.shape[-1],)))
model.add(keras.layers.Dense(n, activation="relu"))
model.add(keras.layers.Dropout(rate=0.25))
model.add(keras.layers.Dense(n, activation="relu"))
model.add(keras.layers.Dropout(rate=0.25))
model.add(keras.layers.Dense(2, activation="softmax"))
# (call model.summary() here to inspect the layer shapes)

In [None]:
# Compile the model. `metrics` is passed as a list rather than a bare string:
# the list form is the documented one, and newer Keras releases are expected to
# reject a plain string.
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=['accuracy'])
epochs = 25
# The test split is passed as validation data, so it is evaluated after every epoch.
history = model.fit(x_train, y_train, batch_size=32, epochs=epochs, validation_data=(x_test, y_test))

# Class prediction = index of the larger of the two softmax outputs
y_pred = np.argmax(model.predict(x_test), axis=-1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# add scores into the DF
# The row label 'ANN' is written by hand here instead of being taken from
# model.__class__.__name__ as in the scikit-learn cells.
ind_model_list.loc['ANN',:]=get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125
ANN,0.833333,0.846154,0.785714,0.814815,0.875


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Small gradient-boosting ensemble. random_state seeds the trees it grows, so the
# scores stored in the results table are repeatable.
model = GradientBoostingClassifier(n_estimators=10, random_state=0)
model.fit(x_train, y_train)

# Test / train accuracy; not displayed because it is not the cell's last expression
model.score(x_test, y_test), model.score(x_train, y_train)
y_pred = model.predict(x_test)
# add scores into the DF
ind_model_list.loc[model.__class__.__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125
ANN,0.833333,0.846154,0.785714,0.814815,0.875
GradientBoostingClassifier,0.8,0.785714,0.785714,0.785714,0.8125


In [None]:
# SkLearn Voting Classifer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Base estimators of the soft-voting ensemble. The forest is seeded so that the
# ensemble's scores are repeatable.
m1 = SVC(probability=True)
m2 = LogisticRegression(max_iter=100)#, penalty='l1', C=0.2, solver='liblinear')
m3 = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_leaf=4, random_state=0)

# Soft voting combines the estimators' predicted class probabilities using `weights`.
# With weights [0, 5, 5] the SVC (m1) carries zero weight, so the vote is an equal
# blend of the logistic regression and the forest.
voting_clf = VotingClassifier(
    estimators=[('m1', m1), ('m2', m2), ('m3', m3)],
    voting='soft',
    weights=[0, 5, 5],
)

from sklearn.metrics import accuracy_score
from sklearn import metrics

# Every estimator, the ensemble included, is fitted exactly once inside this loop
# (a separate voting_clf.fit call before the loop would only repeat that work).
for clf in (m1, m2, m3, voting_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_train = clf.predict(x_train)

    # class name, train accuracy, test accuracy
    print(clf.__class__.__name__, accuracy_score(y_train, y_pred_train),  accuracy_score(y_test, y_pred))

    # add scores into the DF (only the ensemble gets a row in this cell)
    if clf.__class__.__name__=='VotingClassifier':
        ind_model_list.loc[clf.__class__.__name__, :] = get_scores(y_test, y_pred)
ind_model_list

SVC 0.923728813559322 0.8333333333333334
LogisticRegression 0.885593220338983 0.8
RandomForestClassifier 0.9152542372881356 0.8166666666666667
VotingClassifier 0.8940677966101694 0.8166666666666667


Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125
ANN,0.833333,0.846154,0.785714,0.814815,0.875
GradientBoostingClassifier,0.8,0.785714,0.785714,0.785714,0.8125
VotingClassifier,0.816667,0.84,0.75,0.792453,0.875


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble, seeded, with at least 12 samples required in every leaf
model = ExtraTreesClassifier(
    n_estimators=100,
    random_state=0,
    min_samples_leaf=12,
).fit(x_train, y_train)

# Test / train accuracy; the tuple is not the cell's last expression, so it is not displayed
model.score(x_test, y_test), model.score(x_train, y_train)

# Store the test-set metrics in the results table, then show the table
y_pred = model.predict(x_test)
ind_model_list.loc[type(model).__name__, :] = get_scores(y_test, y_pred)
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125
ANN,0.833333,0.846154,0.785714,0.814815,0.875
GradientBoostingClassifier,0.8,0.785714,0.785714,0.785714,0.8125
VotingClassifier,0.816667,0.84,0.75,0.792453,0.875
ExtraTreesClassifier,0.8,0.766667,0.821429,0.793103,0.78125


# **HRFLM**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-2 tree whose leaves partition the samples for the per-leaf models below.
# random_state makes the choice between equally good splits repeatable, so the
# leaf ids and everything derived from them stay the same on every run.
model = DecisionTreeClassifier(max_depth=2, random_state=0)
model.fit(x_train, y_train)


DecisionTreeClassifier(max_depth=2)

In [None]:
# Leaf index that each training / test sample is routed to by the partition tree
train_id_leaf = model.apply(x_train)
test_id_leaf = model.apply(x_test)

# Distinct leaf ids in ascending order. np.unique guarantees the ordering (a plain
# set does not), and later cells line their per-leaf lists up with this order.
unique_leaf = np.unique(train_id_leaf).tolist()
unique_leaf

[2, 3, 5, 6]

In [None]:
# Test each split on different model - LR, RF, SVM
model_lr = LogisticRegression(max_iter=100)
# seeded so that the per-leaf forest scores are repeatable
model_rf = RandomForestClassifier(n_estimators=100, max_depth=4, min_samples_leaf=4, random_state=0)
model_svc = SVC(probability=True)

model_list = [model_lr, model_rf, model_svc]

# One row per leaf, one column per candidate model. The column labels are derived
# from the models themselves so they cannot drift apart from model_list.
score_df = pd.DataFrame(index=unique_leaf, columns=[type(m).__name__ for m in model_list])
score_df

Unnamed: 0,LogisticRegression,RandomForestClassifier,SVC
2,,,
3,,,
5,,,
6,,,


In [None]:
# Number of test samples in each leaf, in unique_leaf order. It does not depend on
# the model, so it is computed once instead of being rebuilt for every model.
sample_cnt_list = [int((test_id_leaf == leaf_id).sum()) for leaf_id in unique_leaf]

# Fit every candidate model separately on the training samples of each leaf and score
# it on that leaf's test samples. `leaf_id` is used as the loop variable so the
# builtin `id` is no longer shadowed.
for model in model_list:
    for leaf_id in unique_leaf:
        train_mask = train_id_leaf == leaf_id
        test_mask = test_id_leaf == leaf_id
        x_train_split, y_train_split = x_train[train_mask], y_train[train_mask]
        x_test_split, y_test_split = x_test[test_mask], y_test[test_mask]

        print(model, leaf_id, x_train_split.shape, x_test_split.shape)
        model.fit(x_train_split, y_train_split)

        # A leaf without test samples keeps its empty (NaN) cell in score_df
        if x_test_split.shape[0] > 0:
            y_pred_split = model.predict(x_test_split)
            # .at addresses exactly one cell, which then holds the whole metric list
            score_df.at[leaf_id, model.__class__.__name__] = get_scores(y_test_split, y_pred_split)

Unnamed: 0,LogisticRegression,RandomForestClassifier,SVC
2,"[0.75, 0.5, 1.0, 0.6666666666666666, 0.6666666...","[0.75, 0.5, 1.0, 0.6666666666666666, 0.6666666...","[0.75, 0.5, 1.0, 0.6666666666666666, 0.6666666..."
3,"[0.8095238095238095, 1.0, 0.2, 0.3333333333333...","[0.7619047619047619, 0.0, 0.0, 0.0, 1.0]","[0.7619047619047619, 0.0, 0.0, 0.0, 1.0]"
5,"[0.8333333333333334, 0.8333333333333334, 1.0, ...","[0.8333333333333334, 0.8333333333333334, 1.0, ...","[0.8333333333333334, 0.8333333333333334, 1.0, ..."
6,"[1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 1.0, 1.0, 1.0, 1.0]","[1.0, 1.0, 1.0, 1.0, 1.0]"


In [None]:
# Metric lists stored in row position 2 of score_df for each of the three models.
# Printing in a loop avoids building a tuple of print() results, which the notebook
# would otherwise display as a stray "(None, None, None)".
for col_pos in range(3):
    print(score_df.iloc[2, col_pos])

[0.8333333333333334, 0.8333333333333334, 1.0, 0.9090909090909091, 0.0]
[0.8333333333333334, 0.8333333333333334, 1.0, 0.9090909090909091, 0.0]
[0.8333333333333334, 0.8333333333333334, 1.0, 0.9090909090909091, 0.0]


(None, None, None)

In [None]:
# Per-leaf metrics of the model used inside the leaves, read from score_df instead of
# being copied by hand from the printed output of an earlier run.
leaf_model_name = 'LogisticRegression'
n_metrics = 5  # length of the list returned by get_scores

# A leaf without test samples has no list in score_df; zeros are used there
# (such a leaf has a test-sample count of 0).
leaf_scores = [
    cell if isinstance(cell, (list, tuple)) else [0.0] * n_metrics
    for cell in score_df.loc[unique_leaf, leaf_model_name]
]

# Transpose: one inner list per metric, one entry per leaf (unique_leaf order)
score_list = [list(metric_row) for metric_row in zip(*leaf_scores)]

In [None]:
# Sample-weighted mean of every metric over the leaves: score_list holds one row per
# metric and one column per leaf, sample_cnt_list the number of test samples per leaf.
# Dividing by the actual total weight replaces the hard-coded test-set size (60) and
# the hard-coded (5, 4) reshape, so the cell keeps working if the split or the number
# of leaves changes.
# NOTE(review): for accuracy this equals the overall test accuracy; for precision,
# recall and F1 a weighted mean of per-leaf values is only an approximation of the
# metric computed on the pooled predictions.
score_HRFLM = np.average(np.asarray(score_list, dtype=float), axis=1, weights=sample_cnt_list)

In [None]:
# add scores into the DF
# score_HRFLM is written positionally, so it must hold one value per column of
# ind_model_list, in the same order as the metrics returned by get_scores.
ind_model_list.loc['HRFLM',:]=score_HRFLM
ind_model_list

Unnamed: 0,Acc,Precision,F-score,Sensitivity,Specificity
LogisticRegression,0.8,0.833333,0.714286,0.769231,0.875
RandomForestClassifier,0.833333,0.821429,0.821429,0.821429,0.84375
SVC,0.833333,0.821429,0.821429,0.821429,0.84375
DecisionTreeClassifier,0.716667,0.689655,0.714286,0.701754,0.71875
BernoulliNB,0.816667,0.793103,0.821429,0.807018,0.8125
ANN,0.833333,0.846154,0.785714,0.814815,0.875
GradientBoostingClassifier,0.8,0.785714,0.785714,0.785714,0.8125
VotingClassifier,0.816667,0.84,0.75,0.792453,0.875
ExtraTreesClassifier,0.8,0.766667,0.821429,0.793103,0.78125
HRFLM,0.85,0.883333,0.72,0.694949,0.655556


# **Misc.**