In [1]:
# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
np.random.seed(42)

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import Pipeline


#Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score,roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# import matplotlib.pyplot as plt # Import matplotlib for data visualisation
# import seaborn as sns # Statistical data visualization
import plotly.express as px
import plotly.graph_objects as go
# from plotly.subplots import make_subplots


In [16]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.14.1-py2.py3-none-any.whl (15.3 MB)
     ---------------------------------------- 15.3/15.3 MB 1.9 MB/s eta 0:00:00
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.14.1 tenacity-8.2.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
catboost 0.26.1 requires graphviz, which is not installed.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
np.logspace(-4, 4, 20)

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

In [2]:
new_rfc_feature_df = pd.read_csv('data/new_rfc_removed_feature_ncr.csv')
new_rfc_label_df = pd.read_csv('data/new_rfc_removed_label_ncr.csv')

In [3]:
# Combine label column with rfc feature columns 
after_resampling_df = new_rfc_feature_df.assign(HeartDisease= new_rfc_label_df)
after_resampling_df

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer,HeartDisease
0,28.87,1,0,0,6.0,1,0,77.0,0,0,1,0,0,0,1
1,34.30,1,0,0,30.0,1,1,62.0,2,0,0,1,0,0,1
2,29.18,0,0,0,1.0,0,0,52.0,0,1,3,0,0,0,0
3,26.17,1,0,0,0.0,0,0,47.0,0,1,3,0,0,0,0
4,32.98,1,0,1,10.0,1,1,77.0,2,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118281,23.38,0,0,0,0.0,0,0,62.0,0,1,4,0,0,0,0
118282,22.22,0,0,0,0.0,0,0,21.0,0,1,4,0,0,0,0
118283,27.41,1,0,0,7.0,1,1,62.0,2,0,1,1,0,0,1
118284,29.84,1,0,0,0.0,0,1,37.0,0,1,3,1,0,0,0


### Removing Outliers


In [18]:
fig = px.box(after_resampling_df['BMI'], y="BMI")
fig.show()

In [19]:
# The change outlier values in BMI to median values (26.63)
after_resampling_df.loc[(after_resampling_df.BMI > 40.79), 'BMI'] = 26.57

after_resampling_df.loc[(after_resampling_df.BMI < 12.91), 'BMI'] = 26.57



In [20]:
fig = px.box(after_resampling_df['BMI'], y="BMI")
fig.show()

### Train-Test split dan latih model setelah imbalance data dan menangani outlier

In [4]:
x = after_resampling_df.drop(columns=['HeartDisease'],axis=1)
y = after_resampling_df['HeartDisease']

In [5]:
# train_test split using Ncr technique 50% on resampling data
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
#Scale the feature for Train-Test
np.random.seed(42)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [32]:
# define model
lg2 = LogisticRegression(random_state=42, class_weight={0:0.25, 1:0.75},C=0.08858667904100823, solver='lbfgs')
# fit it
lg2.fit(X_train_sc,y_train)
# test
y_pred = lg2.predict(X_test_sc)
train_pred = lg2.predict(X_train_sc)
y_pred_proba = lg2.predict_proba(X_test_sc)
y_pred_pos_proba = y_pred_proba[:, 1]

# performance
print(f'Training Accuracy Score: {accuracy_score(y_train,train_pred)}')
print(f'Testing Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred_pos_proba)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')
# print(f'Clasification Report : \n {classification_report(y_test,y_pred)}')

Training Accuracy Score: 0.9526355835482099
Testing Accuracy Score: 0.9523628370952744
Confusion Matrix: 
[[17490   754]
 [  373  5041]]
Area Under Curve: 0.9806147713460527
Recall score: 0.9311045437753971


In [6]:
np.random.seed(42)

# Split into train and valid set
X_train_new, X_val, y_train_new, y_val = train_test_split(X_train ,#independent variable
                                                   y_train ,#dependent variable
                                                   test_size=0.2)

### Normalization

In [7]:
#Scale the feature for Train-Test
np.random.seed(42)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = StandardScaler()
X_train_new = sc.fit_transform(X_train_new)
X_val = sc.transform(X_val)

In [27]:
param_grid = [    
    {'penalty' : ['l2', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','sag'],
    'max_iter' : [100, 1000,2500, 5000],
    'class_weight' : [0.3, 0.7]
    }
]

# Setup Random Seed
np.random.seed(42)

gs_log_reg = GridSearchCV(LogisticRegression(), param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
gs_log_reg.fit(X_train_new,y_train_new)

gs_log_reg.best_estimator_

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


LogisticRegression(C=0.08858667904100823, class_weight=0.3)

In [28]:
gs_log_reg.best_params_

{'C': 0.08858667904100823,
 'class_weight': 0.3,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'lbfgs'}

> ### Evaluate Model


In [22]:
# Melatih model dan evaluasi menggunakan train-validation split
lg_val = LogisticRegression(random_state=42, class_weight={0:0.3, 1:0.7},C=0.23357214690901212, solver='sag')
# fit it
lg_val.fit(X_train_new,y_train_new)
# test
y_pred = lg_val.predict(X_val)
train_pred = lg_val.predict(X_train_new)
y_pred_proba = lg_val.predict_proba(X_val)
y_pred_pos_proba = y_pred_proba[:, 1]

# performance
print(f'Training Accuracy Score: {accuracy_score(y_train_new,train_pred)}')
print(f'Testing Accuracy Score: {accuracy_score(y_val,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_val, y_pred_pos_proba)}')
print(f'Recall score: {recall_score(y_val,y_pred)}')

Training Accuracy Score: 0.9577554093683126
Testing Accuracy Score: 0.9594209024622213
Confusion Matrix: 
[[14064   458]
 [  310  4094]]
Area Under Curve: 0.981926447904967
Recall score: 0.9296094459582198


In [25]:
# Melatih model dan evaluasi menggunakan train-test split
lg_test = LogisticRegression(random_state=42, class_weight={0:0.3, 1:0.7},C=0.23357214690901212, solver='sag')
# fit it
lg_test.fit(X_train_sc,y_train)
# test
y_pred = lg_test.predict(X_test_sc)
train_pred = lg_test.predict(X_train_sc)

# performance
print(f'Training Accuracy Score: {accuracy_score(y_train,train_pred)}')
print(f'Testing Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')
# print(f'Clasification Report : \n {classification_report(y_test,y_pred)}')

Training Accuracy Score: 0.9578137549139789
Testing Accuracy Score: 0.9568856200862288
Confusion Matrix: 
[[17647   597]
 [  423  4991]]
Area Under Curve: 0.9445730704426398
Recall score: 0.9218692279275951


## Save the Model

In [None]:
import pickle
data = lg_test
with open('heart_model.pkl', 'wb') as file:
    pickle.dump(data, file)

In [3]:
import pickle
with open('heart_model.pkl', 'rb') as file:
    data = pickle.load(file)

lg_loaded = data

In [32]:
y_pred = lg_loaded.predict(X_test_sc)
train_pred = lg_loaded.predict(X_train_sc)
y_score = lg_loaded.decision_function(X_test_sc)

# performance
print(f'Training Accuracy Score: {accuracy_score(y_train,train_pred)}')
print(f'Testing Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_score)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Training Accuracy Score: 0.9582998689605614
Testing Accuracy Score: 0.957646462084707
Confusion Matrix: 
[[17663   581]
 [  421  4993]]
Area Under Curve: 0.9806135361908965
Recall score: 0.9222386405615072


In [16]:
lg_loaded.predict_proba(X_test_sc)[:,1]

array([0.00189963, 0.02466223, 0.52693096, ..., 0.24609833, 0.0021513 ,
       0.99959355])

In [31]:
lg_loaded.decision_function(X_test_sc)

array([-6.2641938 , -3.67751085,  0.10782818, ..., -1.11953076,
       -6.13952927,  7.80764302])

In [24]:
X_test_sc[:10,:]

array([[ 1.70379997, -0.72350319, -0.31905943, -0.21246122, -0.19544667,
        -0.3401067 , -0.97058913, -1.00101399, -0.33885959,  0.49152097,
         0.13136667, -0.4103725 , -0.2029382 , -0.28077867],
       [ 0.16852609,  1.38216391, -0.31905943, -0.21246122, -0.34490867,
        -0.3401067 ,  1.03030208, -0.72569976, -0.33885959,  0.49152097,
         0.13136667, -0.4103725 , -0.2029382 , -0.28077867],
       [-0.69095004, -0.72350319, -0.31905943, -0.21246122, -0.34490867,
        -0.3401067 , -0.97058913,  1.91731681, -0.33885959,  0.49152097,
         1.07019043, -0.4103725 , -0.2029382 ,  3.56152412],
       [ 1.25846658,  1.38216391, -0.31905943, -0.21246122,  0.40240135,
        -0.3401067 , -0.97058913,  0.65087137,  2.85763201,  0.49152097,
        -1.74628085, -0.4103725 , -0.2029382 , -0.28077867],
       [ 0.4284483 , -0.72350319, -0.31905943, -0.21246122, -0.34490867,
        -0.3401067 , -0.97058913, -0.45038554, -0.33885959,  0.49152097,
         0.13136667,  2.43