<a href="https://colab.research.google.com/github/buczekEngineering/Stroke-Prediction-Application/blob/main/Class_Imbalance_Part3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Data Science Project Workflow

+ Solving Class Imbalance Problem 
+ Re-training the models 
+ Training the model using Ensamble Algorithms 
+ Training the model using Neural Networks 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from google.colab import drive

In [2]:
drive.mount("/content/drive")

Mounted at /content/drive


In [45]:
data = pd.read_csv("/content/drive/MyDrive/Stroke_Prediction/num_selected_data_stroke.csv")
data.columns

Index(['Unnamed: 0', 'age', 'avg_glucose_level', 'bmi', 'Residence_type',
       'hypertension', 'ever_married', 'heart_disease', 'stroke'],
      dtype='object')

In [46]:
data.drop(columns= 'Unnamed: 0', axis=1, inplace=True)
data.head()

Unnamed: 0,age,avg_glucose_level,bmi,Residence_type,hypertension,ever_married,heart_disease,stroke
0,80.0,68.56,26.2,1,1,1,0,1
1,39.0,62.02,23.7,1,0,1,0,0
2,13.0,78.38,38.7,0,0,0,0,0
3,49.0,61.57,37.9,0,0,1,0,0
4,58.0,96.01,33.8,0,0,1,0,0


In [48]:
data_len = len(data)
stroke_len = len(data[data["stroke"]==1])
heathy_len = data_len - stroke_len

In [49]:
import plotly.graph_objects as go

targets = ["stroke", "healthy"]
values = [stroke_len, heathy_len]

fig = go.Figure(data=[go.Pie(labels=targets,values=values, hole=.3)])
fig.show()

# SMOTE: Synthetic Minority Oversampling TEchnique

Algorithm is picking a random point from the under-represented class and computing k-nearest neighbours for its. The synthetic points are added between the choosen point and its neighbours.

In [50]:
features = [col for col in data.columns if col != "stroke"]
features = data[features]
labels = data["stroke"]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)

In [52]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(ratio="minority", random_state=0)
X_sm, y_sm = smote.fit_sample(X_train, y_train)



Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



# SMOTE + TOMEK 

In [54]:
from imblearn.combine import SMOTETomek

tomek = SMOTETomek(random_state=0)
X_tom, y_tom = tomek.fit_resample(X_train, y_train)


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



# SMOTE + ENN 

In [55]:
from imblearn.combine import SMOTEENN 
enn = SMOTEENN(random_state=0)
X_enn, y_enn = enn.fit_resample(X_train, y_train)


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



# ADASYN

In [56]:
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=0)
X_ada, y_ada = ada.fit_resample(X_train, y_train)


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.



# MODEL AND HYPERPARAMETER SEARCH


In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [58]:
data = {
    "ADASYN" : [X_ada, y_ada],
    "SMOTE+ENN" :[X_enn, y_enn],
    "TOMEK" : [X_tom, y_tom],
    "SMOTE" : [X_sm, y_sm],
    "NO RESAMPLING" : [features, labels]
}

In [63]:
param_grid = {
    "logistic_regression" : {
        "model": LogisticRegression(),
        "parameters": {
            "penalty": ["l1", "l2"],
            "C": np.logspace(-4,4,20),
            "solver": ['liblinear']       
                      } 
                            },

    
    "random_forest" : {
        "model": RandomForestClassifier(),
        "parameters" : {
            "n_estimators":list(range(10, 101, 10)),
            "max_depth":list(range(6,32,2)),
            "criterion" : ["gini", "entropy"],
            "max_features": ["auto", "log2"],
            "min_samples_leaf" : [1,2,4],
            "min_samples_split" : [2,5,10,15,20,35,50,75,100]
                        }
                      }

}

In [64]:
def evaluate(model, resamplin_algo, X_test, y_test, best_model): 
  y_pred = best_model.predict(X_test)
  print(confusion_matrix(y_test, y_pred))
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  print("Precision: {}, Recall: {}, F1: {}".format(precision, recall, f1))

  return model, resamplin_algo, precision, recall, f1 

In [None]:
results = []
cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

for model, params in param_grid.items(): 
  for resampling_algo, transformed_data in data.items(): 
    search = GridSearchCV(params["model"], params["parameters"], cv=cv, scoring="roc_auc", n_jobs=-1)
    search.fit(transformed_data[0], transformed_data[1])
    print("*"*60)
    print("Evaluation of {} with resampling strategy: {}".format(model, resampling_algo))
    print(search.best_params_)
    best_model = search.best_estimator_
    results.append(evaluate(model, resampling_algo, X_test, y_test, best_model))

************************************************************
Evaluation of logistic_regression with resampling strategy: ADASYN
{'C': 1.623776739188721, 'penalty': 'l2', 'solver': 'liblinear'}
[[630 225]
 [  3  19]]
Precision: 0.0778688524590164, Recall: 0.8636363636363636, F1: 0.14285714285714285
************************************************************
Evaluation of logistic_regression with resampling strategy: SMOTE+ENN
{'C': 4.281332398719396, 'penalty': 'l2', 'solver': 'liblinear'}
[[602 253]
 [  2  20]]
Precision: 0.07326007326007326, Recall: 0.9090909090909091, F1: 0.13559322033898308
************************************************************
Evaluation of logistic_regression with resampling strategy: TOMEK
{'C': 4.281332398719396, 'penalty': 'l1', 'solver': 'liblinear'}
[[641 214]
 [  4  18]]
Precision: 0.07758620689655173, Recall: 0.8181818181818182, F1: 0.14173228346456695
************************************************************
Evaluation of logistic_regression with


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
final_results = pd.DataFrame(results, labels = ["model", "resamplin_algo", "precision", "recall", "f1"])
final_results.sort_values(by="precision", ascending=False)
print(final_results)

In [None]:
import joblib
def save_model(model):
  model_name = type(model).__name__
  save_file_name = f"{model_name}.pkl"
  joblib.dump(model, save_file_name)