In [1]:
from sklearn import preprocessing

import matplotlib as plot
import numpy as np
import pandas as pd


In [2]:
# Load the raw stroke data, drop rows with missing values and the "Other"
# gender category. The index is reset only after *all* filtering, so the row
# labels stay contiguous (0..n-1) in every later cell and in the exported CSV.
datafile = 'StrokeData.csv'
strokeDataSet = (
    pd.read_csv(datafile)
    .dropna()
    .loc[lambda frame: frame["gender"] != "Other"]
    .reset_index(drop=True)
)
strokeDataSet

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4904,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
# Text label -> integer code for every categorical column handled by
# NormalizeCategories.
_CATEGORY_CODES = {
    "gender": {"Male": 1, "Female": 0},
    "ever_married": {"Yes": 1, "No": 0},
    "work_type": {
        "Never_worked": 0,
        "Private": 1,
        "Govt_job": 2,
        "children": 3,
        "Self-employed": 4,
    },
    "Residence_type": {"Urban": 1, "Rural": 0},
    "smoking_status": {
        "never smoked": 0,
        "formerly smoked": 1,
        "smokes": 2,
        "Unknown": 3,
    },
}


def NormalizeCategories(dataset):
    """Replace the text labels of the categorical columns with integer codes.

    The columns ``gender``, ``ever_married``, ``work_type``, ``Residence_type``
    and ``smoking_status`` are rewritten according to ``_CATEGORY_CODES``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the five categorical columns. It is modified in
        place; nothing is returned.

    Raises
    ------
    KeyError
        If one of the categorical columns is missing from ``dataset``.

    Notes
    -----
    Values without an entry in the code table (including missing values) are
    left unchanged. Each column is rebuilt as a whole with ``Series.map``
    instead of being patched cell by cell, so the conversion does not rely on
    the text column accepting integer values (a dedicated string dtype
    rejects such assignments).
    """
    for column, codes in _CATEGORY_CODES.items():
        # `codes=codes` pins the current table for this column's lookup.
        dataset[column] = dataset[column].map(
            lambda label, codes=codes: codes.get(label, label)
        )

    

In [4]:
def NormalizeDatset(dataset):
    """Cast every column to float and min-max scale it, in place.

    Each column ``c`` is replaced by ``(c - min(c)) / (max(c) - min(c))``.
    A column whose values are all identical has a zero range; it is mapped to
    0.0 instead of dividing by zero (which would turn the whole column into
    NaN). A progress line is printed for every column converted.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns can all be converted to float. It is modified in
        place; nothing is returned.

    Raises
    ------
    ValueError
        Propagated from ``astype(float)`` when a column holds values that
        cannot be converted.
    """
    for column in dataset.columns:
        values = dataset[column].astype(float)
        print("\033[1m" + "{}".format(column) + "\033[0m" + " converted to float")

        lowest = values.min()
        spread = values.max() - lowest
        if spread == 0:
            # Constant column: subtracting the minimum yields 0.0 and keeps
            # any missing values missing.
            dataset[column] = values - lowest
        else:
            dataset[column] = (values - lowest) / spread

In [5]:
def FindNaN(dataset):
    """Return the rows of ``dataset`` that have a missing value in any column."""
    row_has_missing = dataset.isna().any(axis="columns")
    return dataset.loc[row_has_missing]

In [6]:
# Model inputs: every column except 'id' and the target column 'stroke'.
features = strokeDataSet.drop(columns=['id', 'stroke'])
features

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked
...,...,...,...,...,...,...,...,...,...,...
4904,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown
4905,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked
4906,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked
4907,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked


In [7]:
# Target labels: a one-column DataFrame holding an independent copy of 'stroke'.
classification = strokeDataSet[['stroke']].copy()
classification.head(15)

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [8]:
# Sanity check: show any feature rows that still contain a missing value.
FindNaN(features)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status


In [9]:
# Replace the text labels with integer codes. NormalizeCategories modifies
# `features` in place, so the preview below shows the encoded values.
NormalizeCategories(features)
features.head(20)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,1,1,228.69,36.6,1
1,1,80.0,0,1,1,1,0,105.92,32.5,0
2,0,49.0,0,0,1,1,1,171.23,34.4,2
3,0,79.0,1,0,1,4,0,174.12,24.0,0
4,1,81.0,0,0,1,1,1,186.21,29.0,1
5,1,74.0,1,1,1,1,0,70.09,27.4,0
6,0,69.0,0,0,0,1,1,94.39,22.8,0
7,0,78.0,0,0,1,1,1,58.57,24.2,3
8,0,81.0,1,0,1,1,0,80.43,29.7,0
9,0,61.0,0,1,1,2,0,120.46,36.8,2


In [10]:
# Cast every column to float and min-max scale it (in place), then confirm
# the resulting dtypes.
NormalizeDatset(features)
features.dtypes

[1mgender[0m converted to float
[1mage[0m converted to float
[1mhypertension[0m converted to float
[1mheart_disease[0m converted to float
[1mever_married[0m converted to float
[1mwork_type[0m converted to float
[1mResidence_type[0m converted to float
[1mavg_glucose_level[0m converted to float
[1mbmi[0m converted to float
[1msmoking_status[0m converted to float


gender               float64
age                  float64
hypertension         float64
heart_disease        float64
ever_married         float64
work_type            float64
Residence_type       float64
avg_glucose_level    float64
bmi                  float64
smoking_status       float64
dtype: object

In [11]:
# Display the scaled feature table.
features

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1.0,0.816895,0.0,1.0,1.0,0.25,1.0,0.801265,0.301260,0.333333
1,1.0,0.975586,0.0,1.0,1.0,0.25,0.0,0.234512,0.254296,0.000000
2,0.0,0.597168,0.0,0.0,1.0,0.25,1.0,0.536008,0.276060,0.666667
3,0.0,0.963379,1.0,0.0,1.0,1.00,0.0,0.549349,0.156930,0.000000
4,1.0,0.987793,0.0,0.0,1.0,0.25,1.0,0.605161,0.214204,0.333333
...,...,...,...,...,...,...,...,...,...,...
4904,0.0,0.157715,0.0,0.0,0.0,0.75,0.0,0.221402,0.095074,1.000000
4905,0.0,0.987793,0.0,0.0,1.0,1.00,1.0,0.323516,0.340206,0.000000
4906,0.0,0.426270,0.0,0.0,1.0,1.00,0.0,0.128658,0.232532,0.000000
4907,1.0,0.621582,0.0,0.0,1.0,0.25,0.0,0.513203,0.175258,0.333333


In [12]:
# NOTE: this binds a second name to the same DataFrame object as `features`;
# it is an alias, not a copy.
strokeNormalize = features
strokeNormalize

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1.0,0.816895,0.0,1.0,1.0,0.25,1.0,0.801265,0.301260,0.333333
1,1.0,0.975586,0.0,1.0,1.0,0.25,0.0,0.234512,0.254296,0.000000
2,0.0,0.597168,0.0,0.0,1.0,0.25,1.0,0.536008,0.276060,0.666667
3,0.0,0.963379,1.0,0.0,1.0,1.00,0.0,0.549349,0.156930,0.000000
4,1.0,0.987793,0.0,0.0,1.0,0.25,1.0,0.605161,0.214204,0.333333
...,...,...,...,...,...,...,...,...,...,...
4904,0.0,0.157715,0.0,0.0,0.0,0.75,0.0,0.221402,0.095074,1.000000
4905,0.0,0.987793,0.0,0.0,1.0,1.00,1.0,0.323516,0.340206,0.000000
4906,0.0,0.426270,0.0,0.0,1.0,1.00,0.0,0.128658,0.232532,0.000000
4907,1.0,0.621582,0.0,0.0,1.0,0.25,0.0,0.513203,0.175258,0.333333


In [13]:
# Attach the target labels as a new 'classification' column. `assign` returns
# a new frame and aligns the labels to the feature rows by index.
stroke_labels = classification["stroke"]
strokeNormalize = strokeNormalize.assign(classification=stroke_labels)
strokeNormalize

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,classification
0,1.0,0.816895,0.0,1.0,1.0,0.25,1.0,0.801265,0.301260,0.333333,1
1,1.0,0.975586,0.0,1.0,1.0,0.25,0.0,0.234512,0.254296,0.000000,1
2,0.0,0.597168,0.0,0.0,1.0,0.25,1.0,0.536008,0.276060,0.666667,1
3,0.0,0.963379,1.0,0.0,1.0,1.00,0.0,0.549349,0.156930,0.000000,1
4,1.0,0.987793,0.0,0.0,1.0,0.25,1.0,0.605161,0.214204,0.333333,1
...,...,...,...,...,...,...,...,...,...,...,...
4904,0.0,0.157715,0.0,0.0,0.0,0.75,0.0,0.221402,0.095074,1.000000,0
4905,0.0,0.987793,0.0,0.0,1.0,1.00,1.0,0.323516,0.340206,0.000000,0
4906,0.0,0.426270,0.0,0.0,1.0,1.00,0.0,0.128658,0.232532,0.000000,0
4907,1.0,0.621582,0.0,0.0,1.0,0.25,0.0,0.513203,0.175258,0.333333,0


In [15]:
# Write the scaled features together with the 'classification' column to disk.
# The DataFrame index is written as well, because index=False is not passed.
strokeNormalize.to_csv('Stroke_Dataset_Normalized.csv')