In [2]:
import numpy as np
import pandas as pd

from sklearn.impute import KNNImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Load the stroke dataset (path is relative to the notebook's working directory).
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Shared notebook state used by the helper functions defined below:
# mapStore keeps, per column name, the {original label: integer code} mapping.
mapStore = {}
# Scaler instance that imputation() re-fits on every call.
minMaxScaler = MinMaxScaler()

<b>-------------------------------Imputer Function Definitions-----------------------------</b>

In [3]:
def find_category_mappings(df, variable):
    """Build a ``{category: integer code}`` mapping for one column.

    Codes start at 0 and are assigned to the distinct non-missing values of
    ``df[variable]`` in the order ``unique()`` returns them; missing values
    get no code.
    """
    categories = df[variable].dropna().unique()
    return dict(zip(categories, range(len(categories))))

def integer_encode(df, variable, ordinal_mapping):
    """Replace the values of ``df[variable]`` in place with their mapped codes.

    Values that have no entry in ``ordinal_mapping`` (missing values included)
    become NaN. Nothing is returned; ``df`` itself is modified.
    """
    encoded_column = df[variable].map(ordinal_mapping)
    df[variable] = encoded_column

def imputation(df1 , cols):
    """Fill missing values with KNN imputation and return a new DataFrame.

    The categorical columns named in ``cols`` are integer-encoded first; the
    whole frame is then min-max scaled, imputed with ``KNNImputer`` (default
    settings), scaled back, and the ``cols`` columns are rounded to integer
    codes again.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is not modified.
        The scaler is fitted on every column, so all columns other than
        ``cols`` are expected to be numeric already.
    cols : iterable of str
        Categorical columns to integer-encode before imputing.

    Returns
    -------
    pandas.DataFrame
        Same index and column labels as ``df1``. All columns are float,
        except ``cols`` which hold integer codes.

    Side effects
    ------------
    ``mapStore[col]`` is overwritten with the label -> code mapping of each
    encoded column, and the module-level ``minMaxScaler`` is re-fitted.

    NOTE(review): every column of the frame takes part in the scaling and
    therefore in the KNN distance - including identifier or target columns if
    the caller leaves them in. Confirm that this is intended.
    """
    df = df1.copy()

    # Learn the label -> code mapping of every categorical column ...
    for variable in cols:
        mappings = find_category_mappings(df, variable)
        mapStore[variable] = mappings

    # ... then apply it (values without a code, i.e. missing ones, become NaN).
    for variable in cols:
        integer_encode(df, variable, mapStore[variable])

    # Scale to [0, 1] so that no column dominates the neighbour distance,
    # impute, and bring the values back to their original scale.
    scaled = minMaxScaler.fit_transform(df)
    imputed = KNNImputer().fit_transform(scaled)
    restored = minMaxScaler.inverse_transform(imputed)

    # Build a fresh frame instead of `df.iloc[:, :] = restored`: writing floats
    # into the existing integer columns either keeps or upcasts each column
    # depending on the data and the pandas version (recent versions warn about
    # the silent upcast). A new frame gives float columns deterministically.
    df = pd.DataFrame(restored, index=df.index, columns=df.columns)

    # Imputed categorical values are averages of neighbour codes; snap them
    # back to the nearest integer code.
    for col in cols:
        df[col] = df[col].round().astype('int')
    return df

def mapCategories(df, cols):
    """Record the category -> code mapping of each column in ``mapStore``.

    Only the mappings are computed and stored (replacing any previous entry
    for the same column); ``df`` is left untouched and nothing is returned.
    """
    for column in cols:
        mapStore[column] = find_category_mappings(df, column)

def unMapCategories(df, cols):
    """Translate integer codes in ``cols`` back to their original labels.

    For each column the mapping stored in ``mapStore[column]`` is inverted
    (code -> label) and applied. ``df`` is modified in place and the same
    object is returned.

    Values that are not a known code - missing values, or labels that were
    already decoded by an earlier call - are passed through unchanged. This
    makes the function safe to call more than once; a plain ``map`` with the
    inverse dict would turn an already-decoded column entirely into NaN.
    (If a column's labels coincide with its codes, repeated calls are still
    not safe.)

    Raises ``KeyError`` if a column has no entry in ``mapStore``.
    """
    for i in cols:
        inv_map = {v: k for k, v in mapStore[i].items()}
        df[i] = df[i].map(lambda value: inv_map.get(value, value))
    return df

def encodeCategoryFeature(df:pd.DataFrame, label:str):
    """Label-encode the column ``label`` of ``df`` in place.

    Returns a dict mapping each original value to the integer code that
    ``LabelEncoder`` assigned to it. The column in ``df`` is overwritten with
    the codes.
    """
    from sklearn.preprocessing import LabelEncoder

    original_values = df[label]
    codes = LabelEncoder().fit_transform(original_values)
    # Pair every original value with its code before the column is overwritten.
    mapping = {value: code for value, code in zip(original_values, codes)}
    df[label] = codes
    return mapping

def decodeDataFrame(df, cols):
    """Translate integer codes in ``cols`` back to their original labels.

    For each column the mapping stored in ``mapStore[column]`` is inverted
    (code -> label) and applied. ``df`` is modified in place and the same
    object is returned, so the return value is an alias of the argument, not
    a copy.

    Values that are not a known code - missing values, or labels that were
    already decoded by an earlier call - are passed through unchanged. This
    makes the function safe to call more than once; a plain ``map`` with the
    inverse dict would turn an already-decoded column entirely into NaN.
    (If a column's labels coincide with its codes, repeated calls are still
    not safe.)

    Raises ``KeyError`` if a column has no entry in ``mapStore``.
    """
    for i in cols:
        inv_map = {v: k for k, v in mapStore[i].items()}
        df[i] = df[i].map(lambda value: inv_map.get(value, value))
    return df

<b>----------------------Clean & Remove Unwanted Values--------------------------</b>

In [5]:
# ------------------------Gender ----------------------------#
# 1. There was 1 patient categorized as 'Other' in the gender column.
# 2. That row is dropped because a single record is too few to matter for the dataset.
genderRows = df.index[df['gender'] == 'Other']
df.drop(index=genderRows, inplace=True)

<b>----------------Encode & Map Category Features-------------------------</b>

In [6]:
# Label-encode each categorical column in place and remember its
# {original label: code} mapping in mapStore so it can be decoded later.
for column in ['gender', 'ever_married', 'work_type', 'Residence_type']:
    label_map = encodeCategoryFeature(df, label=column)
    mapStore[column] = label_map

# smoking_status is not label-encoded here: imputation() integer-encodes it
# later, when its missing values are filled.

In [7]:
# Preview the first 10 rows; in notebook order this follows the label-encoding cell.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,formerly smoked,1
1,51676,0,61.0,0,0,1,3,0,202.21,,never smoked,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,never smoked,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,smokes,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,never smoked,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,formerly smoked,1
6,53882,1,74.0,1,1,1,2,0,70.09,27.4,never smoked,1
7,10434,0,69.0,0,0,0,2,1,94.39,22.8,never smoked,1
8,27419,0,59.0,0,0,1,2,0,76.15,,Unknown,1
9,60491,0,78.0,0,0,1,2,1,58.57,24.2,Unknown,1


<b>----------------Fix Categorical Feature Values------------------------------</b>

In [8]:
# Treat the placeholder category 'Unknown' in smoking_status as a missing value (NaN).
# The result is assigned back to the column: calling mask(..., inplace=True) on the
# selection df['smoking_status'] is a chained in-place modification, which recent
# pandas warns about and which leaves df unchanged when Copy-on-Write is enabled.
df['smoking_status'] = df['smoking_status'].mask(df['smoking_status'] == 'Unknown', np.nan)

In [58]:
# Preview the first 10 rows to check that 'Unknown' smoking_status entries now show as missing.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,formerly smoked,1
1,51676,0,61.0,0,0,1,3,0,202.21,,never smoked,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,never smoked,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,smokes,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,never smoked,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,formerly smoked,1
6,53882,1,74.0,1,1,1,2,0,70.09,27.4,never smoked,1
7,10434,0,69.0,0,0,0,2,1,94.39,22.8,never smoked,1
8,27419,0,59.0,0,0,1,2,0,76.15,,,1
9,60491,0,78.0,0,0,1,2,1,58.57,24.2,,1


<b>-----------------------Missing Values (BMI)----------------------------------</b>

In [9]:
# Fill missing BMI values with the mean of the observed BMI values.
mean_value = df['bmi'].mean()
# Assign the filled column back instead of df['bmi'].fillna(..., inplace=True):
# the chained in-place call triggers a pandas warning and does not update df
# when Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(mean_value)

In [60]:
# Preview the first 10 rows to check that the missing bmi entries were filled.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,formerly smoked,1
1,51676,0,61.0,0,0,1,3,0,202.21,28.893237,never smoked,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,never smoked,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,smokes,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,never smoked,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,formerly smoked,1
6,53882,1,74.0,1,1,1,2,0,70.09,27.4,never smoked,1
7,10434,0,69.0,0,0,0,2,1,94.39,22.8,never smoked,1
8,27419,0,59.0,0,0,1,2,0,76.15,28.893237,,1
9,60491,0,78.0,0,0,1,2,1,58.57,24.2,,1


<b>-----------------------Missing Values (Smoking Status)--------------------------------------</b>

In [10]:
# Fill the missing smoking_status values with KNN imputation. imputation() works on a
# copy and returns it, so `df` is rebound to the imputed frame; as a side effect
# mapStore['smoking_status'] receives the label -> code mapping.
# NOTE(review): imputation() scales and feeds every column of df to the imputer,
# which here includes `id` and the target `stroke` - confirm this is intended.
df=imputation(df,['smoking_status'])

In [79]:
# Inspect the {original label: code} mappings collected so far, one entry per encoded column.
mapStore

{'gender': {'Male': 1, 'Female': 0, 'Other': 2},
 'ever_married': {'Yes': 1, 'No': 0},
 'work_type': {'Private': 2,
  'Self-employed': 3,
  'Govt_job': 0,
  'children': 4,
  'Never_worked': 1},
 'Residence_type': {'Urban': 1, 'Rural': 0},
 'smoking_status': {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2}}

In [80]:
# Preview the frame returned by imputation(); smoking_status now holds integer codes.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046.0,1.0,67.0,0.0,1.0,1.0,2.0,1.0,228.69,36.6,0,1.0
1,51676.0,0.0,61.0,0.0,0.0,1.0,3.0,0.0,202.21,28.893237,1,1.0
2,31112.0,1.0,80.0,0.0,1.0,1.0,2.0,0.0,105.92,32.5,1,1.0
3,60182.0,0.0,49.0,0.0,0.0,1.0,2.0,1.0,171.23,34.4,2,1.0
4,1665.0,0.0,79.0,1.0,0.0,1.0,3.0,0.0,174.12,24.0,1,1.0
5,56669.0,1.0,81.0,0.0,0.0,1.0,2.0,1.0,186.21,29.0,0,1.0
6,53882.0,1.0,74.0,1.0,1.0,1.0,2.0,0.0,70.09,27.4,1,1.0
7,10434.0,0.0,69.0,0.0,0.0,0.0,2.0,1.0,94.39,22.8,1,1.0
8,27419.0,0.0,59.0,0.0,0.0,1.0,2.0,0.0,76.15,28.893237,1,1.0
9,60491.0,0.0,78.0,0.0,0.0,1.0,2.0,1.0,58.57,24.2,0,1.0


In [81]:
# Count the missing values per column; every count should be 0 after the imputation steps.
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
# NOTE(review): same preview as the earlier df.head(10) cell shown after imputation; candidate for removal.
df.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046.0,1.0,67.0,0.0,1.0,1.0,2.0,1.0,228.69,36.6,0,1.0
1,51676.0,0.0,61.0,0.0,0.0,1.0,3.0,0.0,202.21,28.893237,1,1.0
2,31112.0,1.0,80.0,0.0,1.0,1.0,2.0,0.0,105.92,32.5,1,1.0
3,60182.0,0.0,49.0,0.0,0.0,1.0,2.0,1.0,171.23,34.4,2,1.0
4,1665.0,0.0,79.0,1.0,0.0,1.0,3.0,0.0,174.12,24.0,1,1.0
5,56669.0,1.0,81.0,0.0,0.0,1.0,2.0,1.0,186.21,29.0,0,1.0
6,53882.0,1.0,74.0,1.0,1.0,1.0,2.0,0.0,70.09,27.4,1,1.0
7,10434.0,0.0,69.0,0.0,0.0,0.0,2.0,1.0,94.39,22.8,1,1.0
8,27419.0,0.0,59.0,0.0,0.0,1.0,2.0,0.0,76.15,28.893237,1,1.0
9,60491.0,0.0,78.0,0.0,0.0,1.0,2.0,1.0,58.57,24.2,0,1.0


In [11]:
# Recover the original category labels for the encoded columns.
# decodeDataFrame() decodes `df` in place and returns that same object, so
# `dfEncoded` is an alias of `df` (which now holds the labels), not a separate copy.
# Run this cell only once after the encoding/imputation cells.
dfEncoded = decodeDataFrame(df, ['smoking_status', 'gender', 'ever_married', 'work_type', 'Residence_type'])
dfEncoded.head(20)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046.0,Male,67.0,0.0,1.0,Yes,Private,Urban,228.69,36.6,formerly smoked,1.0
1,51676.0,Female,61.0,0.0,0.0,Yes,Self-employed,Rural,202.21,28.89456,never smoked,1.0
2,31112.0,Male,80.0,0.0,1.0,Yes,Private,Rural,105.92,32.5,never smoked,1.0
3,60182.0,Female,49.0,0.0,0.0,Yes,Private,Urban,171.23,34.4,smokes,1.0
4,1665.0,Female,79.0,1.0,0.0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1.0
5,56669.0,Male,81.0,0.0,0.0,Yes,Private,Urban,186.21,29.0,formerly smoked,1.0
6,53882.0,Male,74.0,1.0,1.0,Yes,Private,Rural,70.09,27.4,never smoked,1.0
7,10434.0,Female,69.0,0.0,0.0,No,Private,Urban,94.39,22.8,never smoked,1.0
8,27419.0,Female,59.0,0.0,0.0,Yes,Private,Rural,76.15,28.89456,never smoked,1.0
9,60491.0,Female,78.0,0.0,0.0,Yes,Private,Urban,58.57,24.2,formerly smoked,1.0


In [12]:
# Did KnnImpute work well? Spot-check smoking_status for a handful of patient ids
# (presumably rows whose status was originally 'Unknown', as for id 27419 - verify).
dfEncoded.loc[df['id'].isin([27419, 32257, 17739, 16934, 54921])]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
8,27419.0,Female,59.0,0.0,0.0,Yes,Private,Rural,76.15,28.89456,never smoked,1.0
208,54921.0,Male,78.0,1.0,0.0,Yes,Self-employed,Rural,134.8,33.6,never smoked,1.0
244,17739.0,Male,57.0,0.0,0.0,Yes,Private,Rural,84.96,36.7,formerly smoked,1.0
254,32257.0,Female,47.0,0.0,0.0,Yes,Private,Urban,210.95,50.1,never smoked,0.0
334,16934.0,Female,51.0,0.0,0.0,Yes,Self-employed,Rural,89.84,29.9,never smoked,0.0


In [14]:
# Distinct gender values currently in `df`; decodeDataFrame() was called on `df` itself
# and modifies its argument in place, so this shows labels rather than codes.
df['gender'].unique()

array(['Male', 'Female'], dtype=object)