# Brain Stroke Prediction
m13-ml-industrial-final

# Loading and import

In [44]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pandas_profiling import ProfileReport



import pickle


In [45]:
# Load the stroke dataset from the project-relative CSV and preview its first 10 rows.
df = pd.read_csv('data/full_data.csv')
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
5,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
6,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
7,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
8,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
9,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


# Data Exploration

In [46]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4981, 11)

In [47]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


In [49]:
# Pearson correlation of every numeric column with the `stroke` target.
# At this point `df` still holds string columns (gender, work_type, ...).
# Older pandas silently dropped them, but pandas >= 2.0 raises when asked to
# correlate non-numeric data, so restrict to numeric columns explicitly.
df.select_dtypes(include='number').corr()['stroke']

age                  0.246478
hypertension         0.131965
heart_disease        0.134610
avg_glucose_level    0.133227
bmi                  0.056926
stroke               1.000000
Name: stroke, dtype: float64

In [50]:
# Names of the columns grouped as "numeric" features.
# NOTE(review): 'ever_married' still holds 'Yes'/'No' strings when this cell
# runs, and no visible cell reads this list -- confirm it is still needed.
num_features = [
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
]

In [9]:
# Build an exploratory profiling report over the whole frame; it is rendered in a later cell.
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

In [10]:
# Render the profiling report inline as notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [5]:
# Print the distinct labels of each categorical column, one array per line,
# to decide how each column should be encoded.
for column in ('gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married'):
    print(df[column].unique())

['Male' 'Female']
['Private' 'Self-employed' 'Govt_job' 'children']
['Urban' 'Rural']
['formerly smoked' 'never smoked' 'smokes' 'Unknown']
['Yes' 'No']


In [51]:
# Binary-encode the two two-level columns:
#   ever_married: 'Yes'    -> 1, anything else -> 0
#   gender:       'Female' -> 1, anything else -> 0
# Matching on both the original label and the already-encoded value 1 keeps
# this cell idempotent: re-running it leaves an encoded column unchanged,
# whereas a plain `!= 'Yes'` test would turn every row into 0 on a second run.
df['ever_married'] = df['ever_married'].isin(['Yes', 1]).astype('int64')
df['gender'] = df['gender'].isin(['Female', 1]).astype('int64')
df.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked,1
1,0,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked,1
2,1,49.0,0,0,1,Private,Urban,171.23,34.4,smokes,1
3,1,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked,1
4,0,81.0,0,0,1,Private,Urban,186.21,29.0,formerly smoked,1


In [52]:
# One-hot encode the remaining multi-level categorical columns; each becomes
# one indicator column per label, named "<column>_<label>".
df = pd.get_dummies(
    data=df,
    columns=['work_type', 'Residence_type', 'smoking_status'],
)
# Spot-check five random rows of the encoded frame.
df.sample(5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1641,1,78.0,0,1,1,81.99,27.3,0,0,1,0,0,1,0,0,1,0,0
1450,1,33.0,0,0,0,78.34,25.5,0,0,1,0,0,0,1,0,0,1,0
2230,1,56.0,0,0,1,131.63,27.6,0,0,1,0,0,1,0,0,0,1,0
2982,1,61.0,0,0,1,87.52,23.7,0,0,1,0,0,0,1,1,0,0,0
540,0,53.0,0,0,1,79.87,30.9,0,0,1,0,0,1,0,0,0,1,0


In [53]:
# Separate the target from the predictors.
y = df['stroke']
X = df.drop(columns=['stroke'])

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Depth-limited Gini decision tree, fitted on the training split.
clf_gini = DecisionTreeClassifier(max_depth=5, criterion='gini', random_state=0)
clf_gini.fit(X_train, y_train)

In [54]:
# Score the fitted tree on the held-out split and report accuracy to 4 decimals.
y_pred_gini = clf_gini.predict(X_test)
gini_accuracy = accuracy_score(y_test, y_pred_gini)
print('Model accuracy score with criterion gini index: {0:0.4f}'.format(gini_accuracy))

Model accuracy score with criterion gini index: 0.9431


In [41]:
# Serialise the fitted decision tree to disk with pickle.
# NOTE(review): 'moodel.bin' looks like a typo for 'model.bin' -- confirm
# against whatever code loads this file before renaming.
with open('models/moodel.bin', mode='wb') as model_file:
    pickle.dump(clf_gini, model_file)

# MLFlow

In [34]:
import mlflow
import pickle
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [35]:
# Point MLflow at a local SQLite tracking store and select the experiment
# that the following runs are logged under.
# NOTE(review): the experiment name does not mention the stroke dataset -- confirm it is intended.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-data-experiment")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-data-experiment', tags={}>

In [42]:
# Train the depth-limited decision tree inside an MLflow run, recording its
# provenance, hyper-parameter and evaluation metrics.
with mlflow.start_run():
    mlflow.set_tag("workspace", "in_class")
    # Training and validation rows are both split from the same CSV.
    mlflow.log_param("train_data_name", "data/full_data.csv")
    mlflow.log_param("validation_data_name", "data/full_data.csv")

    max_depth = 5
    mlflow.log_param("max_depth", max_depth)

    # NOTE: despite its name, `lr` is a decision tree classifier; the name is
    # kept so that any other cell referring to it keeps working.
    lr = DecisionTreeClassifier(criterion='gini', random_state=0, max_depth=max_depth)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    # Take the square root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases deprecate and then remove.
    # The metric name stays "rmse" because later cells filter runs on it.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    mlflow.log_metric("rmse", rmse)

    # Accuracy is the natural metric for this classifier; log it as well.
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

In [None]:
# Hyper-parameter set kept for reference.
# NOTE(review): presumably DecisionTreeClassifier keyword arguments; no visible
# cell consumes this mapping -- confirm it is still needed.
best_params = dict(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0,
    class_weight=None,
    ccp_alpha=0,
)

In [37]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [43]:


# Select the tracking store and experiment for the tuning runs started by `objective` below
# (same store and experiment as the earlier tracking cell).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-data-experiment")

def objective(params):
    """Hyperopt objective: train one XGBoost booster and return its validation RMSE.

    Each call runs inside its own MLflow run, logging the sampled
    hyper-parameters, the boosting settings, the data provenance and the
    resulting metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled by hyperopt.

    Returns
    -------
    dict
        ``{'loss': <validation rmse>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    with mlflow.start_run():
        num_boost_round = 500
        early_stopping_rounds = 50

        mlflow.log_params(params)
        mlflow.log_param('num_boost_round', num_boost_round)
        mlflow.log_param('early_stopping_rounds', early_stopping_rounds)
        # Both splits come from the stroke CSV used throughout this notebook.
        mlflow.log_param('train_data_name', 'data/full_data.csv')
        mlflow.log_param('validation_data_name', 'data/full_data.csv')
        mlflow.set_tag('model', 'xgboost')

        # xgb.train works on DMatrix objects: fit on the training split and
        # monitor / early-stop on the held-out split.
        train_matrix = xgb.DMatrix(X_train, label=y_train)
        valid_matrix = xgb.DMatrix(X_test, label=y_test)

        booster = xgb.train(
            params=params,
            dtrain=train_matrix,
            evals=[(valid_matrix, "validation")],
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
        )

        # Score the held-out features (not the label vector).
        y_pred = booster.predict(valid_matrix)
        # Explicit square root avoids the `squared=False` keyword that newer
        # scikit-learn releases deprecate and then remove.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mlflow.log_metric('rmse', rmse)
        return {'loss': rmse, 'status': STATUS_OK}


In [22]:
from mlflow.tracking import MlflowClient

In [23]:
# Tracking store location passed to the MLflow client below (local SQLite file).
MLFLOW_URI = "sqlite:///mlflow.db"

In [24]:
# Client used below to list and create experiments and to search runs.
client = MlflowClient(MLFLOW_URI)

In [25]:
# Experiments currently registered in the tracking store.
# NOTE(review): `list_experiments` was dropped in MLflow 2.x in favour of
# `search_experiments` -- confirm the MLflow version this notebook targets.
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-data-experiment', tags={}>]

In [26]:
# Create the experiment only if it is not already in the tracking store.
# The store is a file that outlives the kernel, so an unconditional
# `create_experiment` raises on every re-run of this cell.
# NOTE(review): 'new-experimet' looks like a typo for 'new-experiment'; the
# name is kept as-is because an experiment with this name may already exist.
experiment_name = 'new-experimet'
existing_experiment = client.get_experiment_by_name(experiment_name)
new_experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name=experiment_name)
)
new_experiment_id

'2'

In [27]:
# List the experiments again to check the store after the create call.
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-data-experiment', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='new-experimet', tags={}>]

In [28]:
from mlflow.entities import ViewType

In [29]:
# Fetch at most five active runs of experiment '1' whose logged `rmse` is
# below 7, ordered by ascending rmse.
runs = client.search_runs(
    experiment_ids='1',
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string='metrics.rmse < 7',
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [30]:
# Print the id and the logged rmse of each run returned by the search.
for run in runs:
    run_id = run.info.run_id
    rmse_value = run.data.metrics['rmse']
    print(f"run_id:{run_id}, metrics:{rmse_value}")

run_id:18fae09da277420181e81559cb468160, metrics:0.21538703759407285
