# LIBRERIAS

In [None]:
# CARGAR LIBRERIAS

import warnings
from collections import Counter

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, confusion_matrix

#from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
#from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

# DATOS

URL = https://www.kaggle.com/fedesoriano/stroke-prediction-dataset

**Context**

According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

**Attribute Information**

1) id: unique identifier

2) gender: "Male", "Female" or "Other"

3) age: age of the patient

4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6) ever_married: "No" or "Yes"

7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8) Residence_type: "Rural" or "Urban"

9) avg_glucose_level: average glucose level in blood

10) bmi: body mass index

11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

12) stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [None]:
# Load the stroke dataset into `df` and preview its first rows

DATA_PATH = '../BBDD/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# ANALISIS EXPLORATORIO DE DATOS

In [None]:
# Print a summary of the raw frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Remove the 'id' column

df = df.drop(columns=['id'])

In [None]:
# Missing values per column, first as absolute counts and then as a percentage of rows

missing_counts = df.isnull().sum()
missing_pct = missing_counts / len(df) * 100

print(missing_counts)
print()
print(missing_pct)

In [None]:
# Count rows that are exact duplicates of an earlier row

df.duplicated().sum()

In [None]:
# Summary statistics for every column (numeric and non-numeric), rounded to two decimals

df.describe(include='all').round(2)

In [None]:
# List the column names

df.columns

In [None]:
# Distribution of every variable: one histogram (with KDE overlay) per column on a 4x3 grid

fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 10))
axes = axes.flat
columnas = df.columns

# Colours of the active matplotlib property cycle, doubled so later panels still get one
palette = [entry["color"] for entry in plt.rcParams['axes.prop_cycle']] * 2

for i, col in enumerate(columnas):
    ax = axes[i]

    sns.histplot(data=df, x=col, stat="count", kde=True, color=palette[i],
                 line_kws={'linewidth': 2}, alpha=0.3, ax=ax)

    ax.set_title(col, fontsize=7, fontweight="bold")
    ax.tick_params(labelsize=6)
    ax.set_xlabel("")

fig.delaxes(axes[11])  # remove the panel at position 11 (last slot of the grid)

fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Distribución variables', fontsize=10, fontweight="bold");

In [None]:
# Count of stroke / no-stroke cases within each categorical variable

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flat
categoricas = ['gender', 'hypertension', 'heart_disease', 'ever_married',
               'work_type', 'Residence_type', 'smoking_status']
columnas = df[categoricas]

# Iterating a DataFrame yields its column names
for i, col in enumerate(columnas):
    ax = axes[i]

    sns.countplot(data=df, x=col, hue='stroke', alpha=0.3, ax=ax)

    ax.set_title(col, fontsize=7, fontweight="bold")
    ax.tick_params(labelsize=6)
    ax.set_xlabel("")

fig.delaxes(axes[7])  # remove the panel at position 7 (last slot of the grid)

fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Distribución variables categoricas segun stroke', fontsize=10, fontweight="bold");

In [None]:
# Distribution of each numeric variable, split by stroke outcome

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
axes = axes.flat
columnas = df[['age', 'avg_glucose_level', 'bmi']]

for i, col in enumerate(columnas):

    # `fill=True` is the current spelling of the deprecated `shade=True` flag, and the
    # data vector is passed as `x=` so the call does not depend on positional order.
    sns.kdeplot(x=df.loc[df['stroke'] == 0, col], color='blue', fill=True, ax=axes[i])
    sns.kdeplot(x=df.loc[df['stroke'] == 1, col], color='red', fill=True, ax=axes[i])
    axes[i].set_title(col, fontsize=7, fontweight="bold")
    axes[i].tick_params(labelsize=6)
    axes[i].set_xlabel("")
    # Labels follow plotting order: stroke == 0 first, stroke == 1 second
    axes[i].legend(['No', 'Si'])

fig.tight_layout()
plt.subplots_adjust(top =0.9)
fig.suptitle('Distribución variables numericas segun stroke', fontsize=10, fontweight="bold");

In [None]:
# Relationship between stroke and each numeric variable

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
axes = axes.flat
columnas = df[['age', 'avg_glucose_level', 'bmi']]

for i, col in enumerate(columnas):

    # x and y must be keyword arguments: recent seaborn releases reject them positionally
    sns.scatterplot(x=columnas[col], y=df['stroke'], hue=df['stroke'], ax=axes[i])
    axes[i].set_title(col, fontsize=7, fontweight="bold")
    axes[i].tick_params(labelsize=6)
    axes[i].set_xlabel("")

fig.tight_layout()
plt.subplots_adjust(top =0.9)
fig.suptitle('Distribución variables numericas segun stroke', fontsize=10, fontweight="bold");

In [None]:
# Violin plot of stroke for each categorical variable

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flat
categoricas = ['gender', 'hypertension', 'heart_disease', 'ever_married',
               'work_type', 'Residence_type', 'smoking_status']
columnas = df[categoricas]

# Iterating a DataFrame yields its column names
for i, col in enumerate(columnas):
    ax = axes[i]

    sns.violinplot(data=df, x=col, y='stroke', ax=ax)

    ax.set_title(f"stroke vs {col}", fontsize=7, fontweight="bold")
    ax.tick_params(labelsize=6)
    ax.set(xlabel="", ylabel="")

fig.delaxes(axes[7])  # remove the panel at position 7 (last slot of the grid)

fig.tight_layout()
plt.subplots_adjust(top=0.9)
fig.suptitle('Distribución de stroke por grupo', fontsize=10, fontweight="bold");

In [None]:
# Correlation between the numeric variables

# The frame still holds text columns at this stage (they are encoded further down).
# Restrict to numeric columns explicitly: pandas 2.x raises when `corr` meets
# non-numeric data instead of silently skipping it.
numeric_df = df.select_dtypes(include='number')

plt.figure(figsize=(15, 5))
sns.heatmap(numeric_df.corr(),
           vmin = -1,
           vmax = 1,
           cmap=sns.color_palette("coolwarm", as_cmap=True),
           square = True,
           linewidths = 0.5,
           annot=True)

plt.xticks(rotation=-60);

# PREPROCESADO DE DATOS

In [None]:
# Non-null counts and missing-value counts restricted to the rows with stroke == 1

stroke_rows = df[df['stroke'] == 1]

print(stroke_rows.count())
print()
print(stroke_rows.isnull().sum())

In [None]:
# Encode the two-valued text columns as 0/1:
#   gender:         Female -> 0, Male  -> 1
#   ever_married:   No     -> 0, Yes   -> 1
#   Residence_type: Rural  -> 0, Urban -> 1
# Values not listed here are left untouched.

binary_encodings = {
    'gender': {'Female': 0, 'Male': 1},
    'ever_married': {'No': 0, 'Yes': 1},
    'Residence_type': {'Rural': 0, 'Urban': 1},
}

for column, encoding in binary_encodings.items():
    for label, code in encoding.items():
        df.loc[df[column] == label, column] = code

In [None]:
# Use a boolean mask to check whether any row has gender 'Other'

df[df['gender'] == 'Other']

In [None]:
# Remove every row whose gender is 'Other' and renumber the index from 0.
# The rows are selected by value rather than by a hard-coded index label, so the
# cell still targets the right rows if the data changes, and re-running it after
# the index reset cannot delete an unrelated row.

df = df[df['gender'] != 'Other'].reset_index(drop=True)

In [None]:
# One-hot encode 'work_type' and 'smoking_status': each category becomes its own indicator column

df = pd.get_dummies(data=df, columns=['work_type', 'smoking_status'])

In [None]:
# Cast the encoded binary columns and 'age' to 64-bit integers
# NOTE(review): the int64 cast drops any fractional part of 'age' — confirm that is intended

int_columns = ['gender', 'age', 'ever_married', 'Residence_type']
df = df.astype(dict.fromkeys(int_columns, 'int64'))

In [None]:
# Rename 'Residence_type' to lower case so it matches the other column names

df = df.rename(columns={'Residence_type': 'residence_type'})

In [None]:
# Reorder the columns: features first, the 'stroke' target last

column_order = [
    'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'residence_type', 'avg_glucose_level', 'bmi',
    'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
    'work_type_Self-employed', 'work_type_children',
    'smoking_status_Unknown', 'smoking_status_formerly smoked',
    'smoking_status_never smoked', 'smoking_status_smokes',
    'stroke',
]
df = df.loc[:, column_order]

In [None]:
# Print the summary of the preprocessed frame: column names, non-null counts and dtypes
df.info()

# TRAIN/TEST SPLIT

In [None]:
# Split into train and test sets (80% / 20%)

X = df.drop('stroke', axis=1)
y = df['stroke']

# `stratify=y` keeps the proportion of stroke cases the same in both splits; without
# it a random split of an imbalanced target can leave the test set with too few
# positive cases for its metrics to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Shapes of the four splits and number of stroke == 1 cases in y_train and y_test

print('Tamaños')
print('-------')
for split_name, split in (('X_train', X_train), ('X_test', X_test),
                          ('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name}:', split.shape)
print()
print('Total de stroke en splits')
print('-------------------------')
for split_name, target in (('y_train', y_train), ('y_test', y_test)):
    print(f'{split_name}:', target[target == 1].count())

# FEATURE ENGINEERING

In [None]:
# Number of missing 'bmi' values in the train and test features
bmi_by_split = {'Train BMI missings': X_train.bmi, 'Test BMI missings': X_test.bmi}

# Each section is: title, dashed underline of the same length, missing count
sections = [
    f"{title}\n{'-' * len(title)}\n{bmi.isnull().sum()}"
    for title, bmi in bmi_by_split.items()
]
print('\n\n'.join(sections))

In [None]:
# Impute missing values with KNN (fitted on the training features only)

imputer = KNNImputer(n_neighbors=5)

# X_train already contains exactly the feature columns, so it is passed whole instead
# of repeating the column list by hand.
# NOTE(review): the features are not scaled before the neighbour search, so columns with
# a large numeric range probably dominate the distances — consider scaling first.
imputer.fit(X_train)

# `transform` returns a plain array; rebuild the frame with the original labels
X_train = pd.DataFrame(
    imputer.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Verify that no missing values remain in the training features

X_train.isnull().sum()

In [None]:
# Balance the target with SMOTE (applied to the training split only).
# `Counter` comes from the notebook's import cell rather than a mid-notebook import.

counter = Counter(y_train)
print('Antes de SMOTE', counter)

# fit_resample returns a resampled copy of the features and of the target
smt = SMOTE(random_state=42)
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)

counter = Counter(y_train_smt)
print('Despues de SMOTE', counter)

In [None]:
# Impute missing values in the test features with the imputer fitted on the training data

# X_test holds the same feature columns, in the same order, as the frame the imputer
# was fitted on, so it is passed whole instead of repeating the column list by hand.
# `transform` returns a plain array; rebuild the frame with the original labels.
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# MODELOS

#### LOGISTIC REGRESSION

In [None]:
# Logistic regression: grid search over penalty, regularisation strength and iteration cap

# The solver shuffles the data, so seed it like the split and SMOTE steps
log_reg = LogisticRegression(random_state=42)

# 'saga' is used because 'sag' only supports the l2 penalty: with 'sag' every
# l1 candidate fails to fit and is scored as NaN, wasting half of the grid.
# NOTE(review): sag/saga converge slowly on unscaled features — consider scaling first.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1, 5, 10],
    'max_iter': [50,100,500],
    'solver': ['saga']
}

# NOTE(review): the CV folds are drawn from the SMOTE-resampled data, so best_score_
# is likely optimistic; resampling inside each fold (imblearn Pipeline) would avoid that.
clf = GridSearchCV(estimator= log_reg, param_grid= parameters, n_jobs=-1, cv=10, verbose=2, scoring='roc_auc')

clf.fit(X_train_smt, y_train_smt)

print(clf.best_params_)
print()
print('Best score:', clf.best_score_)

# Keep the estimator refitted with the best parameter combination
log_model = clf.best_estimator_

In [None]:
# Mean accuracy of the tuned logistic regression on the held-out test split
# NOTE(review): the grid search optimises ROC AUC, and accuracy can look high on an
# imbalanced target — consider reporting recall / ROC AUC here as well.
log_model.score(X_test, y_test)

#### RANDOM FOREST

In [None]:
# Random forest: grid search over number of trees, tree depth and features per split

# Seeded so the bootstrap samples and feature draws — and therefore the selected
# parameters and scores — are reproducible, like the split and SMOTE steps
rf = RandomForestClassifier(random_state=42)

parameters = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2,3],
    'max_features': [1, 2, 3]
}

# NOTE(review): the CV folds are drawn from the SMOTE-resampled data, so best_score_
# is likely optimistic; resampling inside each fold (imblearn Pipeline) would avoid that.
clf = GridSearchCV(estimator= rf, param_grid= parameters, n_jobs=-1, cv=10, verbose=2, scoring='roc_auc')

clf.fit(X_train_smt, y_train_smt)

print(clf.best_params_)
print()
print('Best score:', clf.best_score_)

# Keep the estimator refitted with the best parameter combination
rf_model = clf.best_estimator_

In [None]:
# Mean accuracy of the tuned random forest on the held-out test split
# NOTE(review): the grid search optimises ROC AUC, and accuracy can look high on an
# imbalanced target — consider reporting recall / ROC AUC here as well.
rf_model.score(X_test, y_test)

#### XGBOOST

In [None]:
# XGBoost: grid search over number of boosting rounds and learning rate
# (booster and tree depth are fixed to a single value each)

xgb = XGBClassifier()

parameters = dict(
    n_estimators=list(range(90, 96)),
    booster=['gbtree'],
    max_depth=[2],
    learning_rate=[0.05, 0.06, 0.07, 0.08, 0.09, 0.1],
)

# 10-fold cross-validation scored by ROC AUC, using every available core
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so best_score_
# is likely optimistic — confirm whether that is acceptable.
clf = GridSearchCV(xgb, parameters, scoring='roc_auc', cv=10, n_jobs=-1, verbose=2)
clf.fit(X_train_smt, y_train_smt)

print(clf.best_params_, end='\n\n')
print('Best score:', clf.best_score_)

# Keep the estimator refitted with the best parameter combination
xgb_model = clf.best_estimator_

In [None]:
# Mean accuracy of the tuned XGBoost model on the held-out test split
# NOTE(review): the grid search optimises ROC AUC, and accuracy can look high on an
# imbalanced target — consider reporting recall / ROC AUC here as well.
xgb_model.score(X_test, y_test)