# Data Preprocessing

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

from imblearn.over_sampling import SMOTE

df = pd.read_csv("../healthcare-dataset-stroke-data.csv")


### Column Descriptions :

- id: unique identifier
- gender: "Male", "Female" or "Other"
- age: age of the patient
- hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
- heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
- ever_married: "No" or "Yes"
- work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"
- Residence_type: "Rural" or "Urban"
- avg_glucose_level: average glucose level in blood
- bmi: body mass index
- smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
- stroke: 1 if the patient had a stroke or 0 if not

Note: "Unknown" in smoking_status means that the information is unavailable for this patient


In [None]:
# import pandas_profiling as pp
# pp.ProfileReport(df)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df = df.drop("id", axis=1)
df.head()

In [None]:
df.isnull().sum()

In [None]:
plt.title('Missing Value Status')
ax = sns.heatmap(df.isna().sum().to_frame(),annot=True,fmt='d',cmap='YlGnBu')
ax.set_xlabel('Amount Missing')
plt.show()

In [None]:
# fill missing bmi value with linear regression of age and gender
X_bmi = df[['age','gender',"avg_glucose_level", 'bmi']].copy()

le_gender = LabelEncoder()
X_bmi["gender"] =  le_gender.fit_transform(X_bmi["gender"])

Missing = X_bmi[X_bmi["bmi"].isna()]
X_bmi = X_bmi[~X_bmi.bmi.isna()]
Y_bmi = X_bmi.pop('bmi')

linear_reg = LinearRegression()
linear_reg.fit(X_bmi,Y_bmi)

predicted_bmi = pd.Series(linear_reg.predict(Missing[['age','gender',"avg_glucose_level"]]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi
df["bmi"] = df['bmi'].round(decimals = 1)
df.head()

# Exploratory Data Analysis

## Analysis of the whole data set

In [None]:
columns = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']
for i in range(len(columns)):
    print("Unique value of column {column}:".format(column=columns[i]))
    print(df[columns[i]].value_counts(),"\n")

In [None]:
labels =df['stroke'].value_counts(sort = True).index
sizes = df['stroke'].value_counts(sort = True)

colors = ["lightblue","red"]
 
plt.figure(figsize=(8,10))
plt.pie(sizes, explode=(0.05,0), labels=labels, colors=colors, autopct='%1.1f%%', startangle=90,)

plt.title('Number of stroke in the dataset')
plt.show()

In [None]:
def plot_hist(col, bins=30, title="",xlabel="",ax=None):
    sns.histplot(col, bins=bins,ax=ax)
    ax.set_title(f'Histogram of {title}',fontsize=20)
    ax.set_xlabel(xlabel)

fig, axes = plt.subplots(1,3,figsize=(11,7),constrained_layout=True)
plot_hist(df.bmi,
          title='Bmi',
          xlabel="Level of the BMI",
          ax=axes[0])
plot_hist(df.age,
          bins=30,
          title='Age',
          xlabel='Age',
          ax=axes[1])
plot_hist(df.avg_glucose_level,
          title='Serum Creatinine', 
          xlabel='Level of serum creatinine in the blood (mg/dL)',
          ax=axes[2])

plt.show()

In [None]:
sns.catplot(y="work_type", 
            hue="stroke", 
            kind="count",
            palette="pastel", 
            edgecolor=".6",
            data=df)

In [None]:
sns.catplot(y="smoking_status", 
            hue="stroke", 
            kind="count",
            palette="pastel", 
            edgecolor=".6",
            data=df)

In [None]:
plt.figure(figsize=(17,7))
sns.catplot(x="gender", y="stroke", hue="heart_disease", palette="pastel", kind="bar", data=df)
sns.catplot(x="gender", y="stroke", hue="Residence_type", palette="pastel", kind="bar", data=df)
sns.catplot(x="gender", y="stroke", hue="hypertension", palette="pastel", kind="bar", data=df)
plt.show()

In [None]:
len_data = len(df)
len_w = len(df[df["gender"]=="Male"])
len_m = len_data - len_w

men_stroke = len(df.loc[(df["stroke"]==1)&(df['gender']=="Male")])
men_no_stroke = len_m - men_stroke

women_stroke = len(df.loc[(df["stroke"]==1) & (df['gender']=="Female")])
women_no_stroke = len_w - women_stroke

labels = ['Men with stroke','Men healthy','Women with stroke','Women healthy']
values = [men_stroke, men_no_stroke, women_stroke, women_no_stroke]

fig = go.Figure(data=[go.Pie(labels=labels, values=values,textinfo='label+percent',hole=0.4)])
fig.update_layout(
    title_text="Distribution of stroke EVENT according to their gender")
fig.show()

In [None]:
fig = px.parallel_categories(df[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type',
       'smoking_status', 'stroke']], color='stroke', color_continuous_scale=px.colors.sequential.Inferno)
fig.show()

In [None]:
le = LabelEncoder()
en_df = df.apply(le.fit_transform)
en_df.head()

features=['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type',
       'smoking_status']
from matplotlib.offsetbox import AnchoredText
correlation_table = []
for cols in features:
    y = en_df["stroke"]
    x = en_df[cols]
    corr = np.corrcoef(x, y)[1][0]
    dict ={
        'Features': cols,
        'Correlation coefficient' : corr,
        'Feat_type': 'numerical'
    }
    correlation_table.append(dict)
dF1 = pd.DataFrame(correlation_table)
fig = plt.figure(figsize=(10,6))
ax = sns.barplot(x="Correlation coefficient", y="Features", 
                     data=dF1.sort_values("Correlation coefficient", ascending=False),
                     palette='viridis', alpha=0.75)
ax.grid()

title =  'Correlation features with target'
sub_title = 'In comparison with categorical features \
\nnumericals are less correlated with target.'

plt.gcf().text(0.05, 1.02, title, fontsize=24)
#plt.gcf().text(0.05, 0.9, sub_title, fontsize=14)

at1 = AnchoredText(sub_title,
                   loc='lower left', frameon=True,
                   bbox_to_anchor=(-0.1, 1.01),
                   bbox_transform=ax.transAxes,
                   #prop=dict(size=8),
                   )
at1.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at1)

In [None]:
fig, ax = plt.subplots()
corr = en_df.corr()[['stroke']].sort_values(by='stroke', ascending=False)
sns.heatmap(corr, vmin=-1, vmax=1, ax=ax, annot=True, cmap = 'BrBG')
plt.figure(figsize=(8, 12))

In [None]:
plt.figure(figsize=(16,8))
sns.heatmap(en_df.corr(),cmap='YlGnBu')

# Prediction

Prediction using KNN

In [None]:
# from sklearn.utils import shuffle
# df = shuffle(df, random_state=1)
# df

### Encode Categorical Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
le_gender = LabelEncoder()
le_marriage = LabelEncoder()
le_work = LabelEncoder()
le_residence = LabelEncoder()
le_smoke = LabelEncoder()

In [None]:
df["gender"] =  le_gender.fit_transform(df["gender"])
df["ever_married"] = le_marriage.fit_transform(df["ever_married"])
df["work_type"] = le_work.fit_transform(df["work_type"])
df["Residence_type"] = le_residence.fit_transform(df["Residence_type"])
df["smoking_status"] = le_smoke.fit_transform(df["smoking_status"])

In [None]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [None]:
X

In [None]:
y

### Test and training data splitting

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Standard scaling

In [None]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### SMOTE( Synthetic Minority Over-Sampling Technique)

Used to handle imbalance data

ini dipake buat memperbanyak data yang jumlahnya sedikit, makanya row yang stroke (1) jadi 3899

In [None]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

In [None]:
print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of y_train: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier  
best_k = -1
best_test_score = -1

for i in range(1,11):    
    classifier= KNeighborsClassifier(n_neighbors=i, p=1, weights = "uniform")  
#     classifier.fit(X_train, y_train)
    classifier.fit(X_train_res, y_train_res.ravel())
    
    y_pred= classifier.predict(X_test)
    
    print("Neighbour = {k}".format(k = i))
#     print("The training accuracy: {score}".format(score = (classifier.score(X_train,y_train)*100).round(2)))
    print("The training accuracy: {score}".format(score = (classifier.score(X_train_res,y_train_res.ravel())*100).round(2)))
    print('The Test accuracy: {score}'.format(score = (classifier.score(X_test,y_test)*100).round(2)))
    
    if classifier.score(X_test,y_test) > best_test_score:
        best_test_score = classifier.score(X_test, y_test)
        best_k = i

print("\n\nBest k-neighbor= {k}".format(k = best_k))

In [None]:
classifier= KNeighborsClassifier(n_neighbors=best_k, p=2)  
classifier.fit(X_train_res, y_train_res.ravel())
y_pred= classifier.predict(X_test)

In [None]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

ax= plt.subplot()

# Confusion matrix menggunakan seaborn

sns.heatmap(cnf_matrix, annot=True, fmt='g', ax=ax, cmap="YlGnBu");  #annot=True to annotate cells, ftm='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(['Not Stroke(0)', 'Stroke(1)']); 
ax.yaxis.set_ticklabels(['Not Stroke(0)', 'Stroke(1)']);

In [None]:
print(classification_report(y_test, y_pred))
print('Accuracy Score: ',accuracy_score(y_test,y_pred))

In [None]:
report = classification_report(y_test, y_pred)
report

In [None]:
X_test

In [3]:
def knn_comparison(data, k):
    x = data[[‘X’,’Y’]].values
    y = data[‘class’].astype(int).values
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(x, y)# Plotting decision region
    plot_decision_regions(x, y, clf=clf, legend=2)# Adding axes annotations
    plt.xlabel(‘X’)
    plt.ylabel(‘Y’)
    plt.title(‘Knn with K=’+ str(k))
    plt.show()

SyntaxError: invalid character '‘' (U+2018) (1168912092.py, line 2)

In [None]:
lalala

# Preparing to export models for web app

In [None]:
# gender 	age 	hypertension 	heart_disease 	ever_married 	work_type 	Residence_type 	avg_glucose_level 	bmi 	smoking_status 	stroke
# Male 	67.0 	0 	1 	Yes 	Private 	Urban 	228.69 	36.6 	formerly smoked 	1

# array = np.array([["Male", 67.0, 1, 1, "Yes", "Private", "Urban", 228.69, 36.6, "smokes"]])

array = np.array([["Male", 80, 1, 1, "Yes", "Private", "Urban", 228.69, 40, "smokes"]])

array[:, 0] =  le_gender.transform(array[:, 0])
array[:, 4] = le_marriage.transform(array[:, 4])
array[:, 5] = le_work.transform(array[:, 5])
array[:, 6] = le_residence.transform(array[:,6])
array[:, 9] = le_smoke.transform(array[:,9])

index_values = [0]
column_values = ["gender", "age", "hypertension", "heart_disease","ever_married", "work_type", "Residence_type","avg_glucose_level", "bmi", "smoking_status"]
X = pd.DataFrame(data = array, 
                 index = index_values,
                columns = column_values)
# X = pd.DataFrame(data = array, 
#                  index = index_values,
#                 )

X = sc.transform(X)
X

In [None]:
y_pred = classifier.predict(X)
y_pred

In [None]:
import pickle
import joblib

In [None]:
data = {"model": classifier, 
        "le_gender": le_gender, 
        "le_marriage": le_marriage, 
        "le_work": le_work, 
        "le_residence": le_residence,
        "le_smoke": le_smoke
       }

with open("model_knn.pkl", "wb") as file:
    pickle.dump(data,file)

joblib.dump(sc, "scaler.pkl") 

In [None]:
with open("model_knn.pkl", "rb") as file:
    data = pickle.load(file)
    
sc = joblib.load("scaler.pkl")

classifier_loaded = data["model"]
le_gender = data["le_gender"]
le_marriage = data["le_marriage"]
le_work = data["le_work"]
le_residence = data["le_residence"]
le_smoke = data["le_smoke"]

array = np.array([["Male", 50, 1, 1, "Yes", "Private", "Urban", 228.69, 30, "smokes"]])

array[:, 0] =  le_gender.transform(array[:, 0])
array[:, 4] = le_marriage.transform(array[:, 4])
array[:, 5] = le_work.transform(array[:, 5])
array[:, 6] = le_residence.transform(array[:,6])
array[:, 9] = le_smoke.transform(array[:,9])

index_values = [0]
column_values = ["gender", "age", "hypertension", "heart_disease","ever_married", "work_type", "Residence_type","avg_glucose_level", "bmi", "smoking_status"]

X = pd.DataFrame(data = array, 
                 index = index_values,
                columns = column_values)
# X = pd.DataFrame(data = array, 
#                  index = index_values,
#                 )
X = sc.transform(X)
X

In [None]:
y_pred = classifier_loaded.predict(X)
y_pred