# MACHINE LEARNING PROJECT NOTEBOOK

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the heart-failure clinical records CSV into a DataFrame and display it.
# NOTE(review): this is an absolute path under one user's Windows profile, so
# the cell only runs on a machine that has the file at this exact location.
csv_path = r'C:\Users\dell uer\Downloads\heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [5]:
# Separate the label column from the predictors: `y` is the DEATH_EVENT
# column, `X` is every other column of `df`.
y = df['DEATH_EVENT']
X = df.drop(columns=['DEATH_EVENT'])
print(X.shape)
X.head(2)

(299, 12)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6


In [17]:
from imblearn.over_sampling import SMOTE

In [27]:
def get_data(smote=False, pca_val=False, split=0.2, random_state=1):
    """Split the notebook-level ``X``/``y`` into train/test sets and preprocess them.

    Steps, in order:
      1. ``train_test_split`` on the global ``X`` and ``y``.
      2. Min-max scaling, fitted on the training fold only and then applied
         to the test fold.
      3. Optional SMOTE oversampling of the training fold (never the test fold).
      4. Optional PCA, fitted on the training fold only.

    Parameters
    ----------
    smote : bool, default False
        When truthy, oversample the scaled training fold with ``SMOTE``.
    pca_val : float or int or False, default False
        When truthy, it is passed to ``PCA`` as its first argument
        (``n_components``) and the features are replaced by the principal
        components, named ``col0``, ``col1``, ...
    split : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 1
        Seed forwarded to ``train_test_split`` (and to ``SMOTE`` when used).
        The default reproduces the same split on every call; pass a different
        value per call to evaluate over different splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``; the feature sets are
        DataFrames and every element has a fresh 0..n-1 index.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=split, random_state=random_state
    )

    # MinMax scaling: statistics come from the training fold only so that
    # nothing about the test fold leaks into preprocessing.
    scaler = MinMaxScaler()
    cols = X.columns
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=cols)

    # Oversample the training fold only; the test fold must keep its
    # original class balance so the reported metrics stay honest.
    if smote:
        X_res, Y_res = SMOTE(random_state=random_state).fit_resample(X_train, Y_train)
        # Re-wrap so the result is pandas regardless of what fit_resample returns.
        X_train = pd.DataFrame(X_res, columns=cols)
        Y_train = pd.Series(Y_res, name=Y_train.name)

    # Applying PCA (fitted on the training fold, reused for the test fold)
    if pca_val:
        pca = PCA(pca_val)
        train_components = pca.fit_transform(X_train)
        test_components = pca.transform(X_test)
        columns = ["col" + str(i) for i in range(train_components.shape[1])]
        X_train = pd.DataFrame(train_components, columns=columns)
        X_test = pd.DataFrame(test_components, columns=columns)

    # The feature frames were rebuilt above and already carry a fresh index;
    # only the targets still have the shuffled index from the split.
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)

    return (X_train, X_test, Y_train, Y_test)

In [28]:
def score_pred(Y_test, Y_pred, model_name = "this model"):
    """Print the confusion matrix and four classification metrics, and return them.

    Parameters
    ----------
    Y_test : array-like
        True labels.
    Y_pred : array-like
        Predicted labels.
    model_name : str, default "this model"
        Label inserted into each printed line.

    Returns
    -------
    dict
        Keys ``'confusion_matrix'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'``, so callers can accumulate the values
        instead of recomputing them.
    """
    # confusion matrix
    matrix = confusion_matrix(Y_test, Y_pred)
    print(matrix)

    # (printed prefix, result key, value) in the order they are reported:
    #   accuracy : (tp + tn) / (p + n)
    #   precision: tp / (tp + fp)
    #   recall   : tp / (tp + fn)
    #   f1       : 2 tp / (2 tp + fp + fn)
    report = [
        ('\nAccuracy of ', 'accuracy', accuracy_score(Y_test, Y_pred)),
        ('Precision of ', 'precision', precision_score(Y_test, Y_pred)),
        ('Recall of ', 'recall', recall_score(Y_test, Y_pred)),
        ('F1 score of ', 'f1', f1_score(Y_test, Y_pred)),
    ]

    results = {'confusion_matrix': matrix}
    for prefix, key, value in report:
        print(prefix + model_name + ' is : %f' % value)
        results[key] = value

    return results

In [43]:
from statistics import mean
# Logistic regression classifier: fit and score on the PCA-reduced data
# (pca_val=0.99) 100 times, collecting each metric so the means can be reported.
#
# NOTE(review): get_data() is called without varying any seed, so each pass
# sees the same train/test split, and the captured output is identical on
# every iteration. The means below therefore equal a single run.
a_accuracy=[]
a_precision=[]
a_recall=[]
a_f1=[]
model_name='Logistic regression'
for _ in range(100):
    X_train, X_test, Y_train, Y_test = get_data(pca_val=0.99)
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    Y_pred = lr.predict(X_test)

    # Metrics are computed inline (once per iteration) so they can be
    # appended to the accumulators.

    # confusion matrix
    matrix = confusion_matrix(Y_test, Y_pred)
    print(matrix)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('\nAccuracy of ' + model_name + ' is : %f' % accuracy)
    a_accuracy.append(accuracy)

    # precision tp / (tp + fp)
    precision = precision_score(Y_test, Y_pred)
    print('Precision of ' + model_name + ' is : %f' % precision)
    a_precision.append(precision)

    # recall: tp / (tp + fn)
    recall = recall_score(Y_test, Y_pred)
    print('Recall of ' + model_name + ' is : %f' % recall)
    a_recall.append(recall)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(Y_test, Y_pred)
    print('F1 score of ' + model_name + ' is : %f' % f1)
    a_f1.append(f1)
print()
print("mean Accuracy of logistic regression after 100 iteration is : ",mean(a_accuracy))
print("mean Precision of logistic regression after 100 iteration is : ",mean(a_precision))
print("mean Recall of logistic regression after 100 iteration is : ",mean(a_recall))
print("mean F1 of logistic regression after 100 iteration is : ",mean(a_f1))


[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.7142

[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.7142

Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866

Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866

Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.714286
Recall of Logistic regression is : 0.714286
F1 score of Logistic regression is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Linear Regression  is : 0.866667
Precision of Linear Regression  is : 0.714286
Recall of Linear Regression  is : 0.714286
F1 score of Linear Regression  is : 0.714286
[[42  4]
 [ 4 10]]

Accuracy of Logistic regression is : 0.866667
Precision of Logistic regression is : 0.71

In [30]:
X_train

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10
0,-0.699120,0.576435,-0.486316,0.246639,0.301535,-0.104841,-0.273373,0.064058,-0.002387,-0.075354,-0.034069
1,-0.431487,-0.238616,-0.091236,-0.606576,-0.383893,-0.298759,-0.011300,0.063915,-0.055726,-0.085665,0.003619
2,0.240919,-0.027993,-0.627269,0.489058,-0.533817,-0.194956,0.164152,-0.071751,0.029414,0.072783,-0.056063
3,0.708957,0.811209,0.369532,-0.243322,0.243213,-0.074632,-0.117700,-0.110539,0.195609,-0.074661,0.007900
4,0.631745,-0.741575,-0.170124,-0.163096,0.204221,0.499359,-0.045196,0.024693,0.014526,-0.066447,0.251378
...,...,...,...,...,...,...,...,...,...,...,...
234,-0.766325,0.082100,0.826481,0.105546,0.303792,-0.280297,0.013058,-0.198404,-0.129234,0.069512,0.182480
235,-0.083337,0.231655,0.237343,1.167748,0.213358,-0.553863,-0.038968,-0.023862,0.111647,-0.058190,0.100607
236,-0.994068,-0.124313,0.026073,-0.120036,0.241709,0.445465,0.273391,-0.098813,0.388451,0.475297,-0.261819
237,0.076993,0.830332,0.389348,0.047188,-0.446264,-0.397500,0.315641,0.104759,0.017233,-0.017815,0.011015


In [42]:
from statistics import mean
# Random Forest Classifier: fit and score on the PCA-reduced data
# (pca_val=0.95) 100 times, collecting each metric so the means can be reported.
#
# Each forest is seeded with the iteration number: the 100 fits still differ
# from one another, but re-running the cell reproduces the same numbers.
# NOTE(review): get_data() is called without varying any seed, so the
# train/test split is the same on every pass; only the forest changes.
a_accuracy=[]
a_precision=[]
a_recall=[]
a_f1=[]
model_name='random forest'
for i in range(100):
    X_train, X_test, Y_train, Y_test = get_data(pca_val = 0.95)
    rf = RandomForestClassifier(max_depth=9, n_estimators=50, min_samples_leaf=1,
                                min_samples_split=3, random_state=i)
    rf.fit(X_train, Y_train)
    Y_pred = rf.predict(X_test)

    # confusion matrix
    matrix = confusion_matrix(Y_test, Y_pred)
    print(matrix)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('\nAccuracy of ' + model_name + ' is : %f' % accuracy)
    a_accuracy.append(accuracy)

    # precision tp / (tp + fp)
    precision = precision_score(Y_test, Y_pred)
    print('Precision of ' + model_name + ' is : %f' % precision)
    a_precision.append(precision)

    # recall: tp / (tp + fn)
    recall = recall_score(Y_test, Y_pred)
    print('Recall of ' + model_name + ' is : %f' % recall)
    a_recall.append(recall)

    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(Y_test, Y_pred)
    print('F1 score of ' + model_name + ' is : %f' % f1)
    a_f1.append(f1)
print()
print("mean Accuracy of random forest after 100 iteration is : ",mean(a_accuracy))
print("mean Precision of random forest after 100 iteration is : ",mean(a_precision))
print("mean Recall of random forest after 100 iteration is : ",mean(a_recall))
print("mean F1 of random forest after 100 iteration is : ",mean(a_f1))

[[44  2]
 [ 5  9]]

Accuracy of random forest is : 0.883333
Precision of random forest is : 0.818182
Recall of random forest is : 0.642857
F1 score of random forest is : 0.720000
[[45  1]
 [ 7  7]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.875000
Recall of random forest is : 0.500000
F1 score of random forest is : 0.636364
[[44  2]
 [ 3 11]]

Accuracy of random forest is : 0.916667
Precision of random forest is : 0.846154
Recall of random forest is : 0.785714
F1 score of random forest is : 0.814815
[[44  2]
 [ 5  9]]

Accuracy of random forest is : 0.883333
Precision of random forest is : 0.818182
Recall of random forest is : 0.642857
F1 score of random forest is : 0.720000
[[45  1]
 [ 6  8]]

Accuracy of random forest is : 0.883333
Precision of random forest is : 0.888889
Recall of random forest is : 0.571429
F1 score of random forest is : 0.695652
[[44  2]
 [ 7  7]]

Accuracy of random forest is : 0.850000
Precision of random forest is : 0.777778
Reca

[[42  4]
 [ 5  9]]

Accuracy of random forest is : 0.850000
Precision of random forest is : 0.692308
Recall of random forest is : 0.642857
F1 score of random forest is : 0.666667
[[44  2]
 [ 6  8]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.800000
Recall of random forest is : 0.571429
F1 score of random forest is : 0.666667
[[42  4]
 [ 6  8]]

Accuracy of random forest is : 0.833333
Precision of random forest is : 0.666667
Recall of random forest is : 0.571429
F1 score of random forest is : 0.615385
[[43  3]
 [ 5  9]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.750000
Recall of random forest is : 0.642857
F1 score of random forest is : 0.692308
[[44  2]
 [ 6  8]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.800000
Recall of random forest is : 0.571429
F1 score of random forest is : 0.666667
[[44  2]
 [ 6  8]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.800000
Reca

[[44  2]
 [ 7  7]]

Accuracy of random forest is : 0.850000
Precision of random forest is : 0.777778
Recall of random forest is : 0.500000
F1 score of random forest is : 0.608696
[[44  2]
 [ 6  8]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.800000
Recall of random forest is : 0.571429
F1 score of random forest is : 0.666667
[[43  3]
 [ 6  8]]

Accuracy of random forest is : 0.850000
Precision of random forest is : 0.727273
Recall of random forest is : 0.571429
F1 score of random forest is : 0.640000
[[44  2]
 [ 5  9]]

Accuracy of random forest is : 0.883333
Precision of random forest is : 0.818182
Recall of random forest is : 0.642857
F1 score of random forest is : 0.720000
[[43  3]
 [ 6  8]]

Accuracy of random forest is : 0.850000
Precision of random forest is : 0.727273
Recall of random forest is : 0.571429
F1 score of random forest is : 0.640000
[[43  3]
 [ 5  9]]

Accuracy of random forest is : 0.866667
Precision of random forest is : 0.750000
Reca

In [44]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

In [50]:
from statistics import mean
# Gradient boosting Classifier: fit and score on the PCA-reduced data
# (pca_val=0.95) 100 times, collecting each metric so the means can be reported.
# NOTE(review): the captured output is identical on every iteration, so the
# means below equal the result of a single run.
model_name = 'Gradient boosting'
a_accuracy, a_precision, a_recall, a_f1 = [], [], [], []
for _ in range(100):
    X_train, X_test, Y_train, Y_test = get_data(pca_val=0.95)

    # Depth-1 trees (stumps), 100 boosting stages, learning rate 1.0.
    clf = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
    )
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # Score this iteration's predictions:
    #   accuracy : (tp + tn) / (p + n)
    #   precision: tp / (tp + fp)
    #   recall   : tp / (tp + fn)
    #   f1       : 2 tp / (2 tp + fp + fn)
    matrix = confusion_matrix(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred)

    # Report them in the usual order.
    print(matrix)
    print('\nAccuracy of ' + model_name + ' is : %f' % accuracy)
    print('Precision of ' + model_name + ' is : %f' % precision)
    print('Recall of ' + model_name + ' is : %f' % recall)
    print('F1 score of ' + model_name + ' is : %f' % f1)

    # Keep them for the averages printed after the loop.
    a_accuracy.append(accuracy)
    a_precision.append(precision)
    a_recall.append(recall)
    a_f1.append(f1)
print()
print("mean Accuracy of Gradient boosting after 100 iteration is : ",mean(a_accuracy))
print("mean Precision of Gradient boosting after 100 iteration is : ",mean(a_precision))
print("mean Recall of Gradient boosting after 100 iteration is : ",mean(a_recall))
print("mean F1 of Gradient boosting after 100 iteration is : ",mean(a_f1))

[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accur

[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accur

[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accuracy of Gradient boosting is : 0.900000
Precision of Gradient boosting is : 0.833333
Recall of Gradient boosting is : 0.714286
F1 score of Gradient boosting is : 0.769231
[[44  2]
 [ 4 10]]

Accur