In [1]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Read the CSV and Perform Basic Data Cleaning

In [2]:
# Create a DataFrame for the healthcare-dataset-stroke-data.csv. 
stroke_df = pd.read_csv('../resources/healthcare-dataset-stroke-data.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
#  Get a list of all columns within the DataFrame.
stroke_df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
# Check the datatypes of your columns.
stroke_df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
# Identify all the possible values in the gender column
stroke_df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [6]:
# Convert these values into numerical ones with the pd.get_dummies() method
stroke_encoded = pd.get_dummies(stroke_df, columns=["gender", "ever_married", "work_type", "Residence_type", "smoking_status"])
stroke_encoded.head()


Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,202.21,,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [7]:
# Check the datatypes of your columns.
stroke_encoded.dtypes

id                                  int64
age                               float64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

## convert age, avg_glucose_level, and bmi columns to int


In [8]:
# convert age columns to int
stroke_encoded['age'] = stroke_encoded['age'].astype('int')

# Check the datatypes of your columns.
stroke_encoded.dtypes

id                                  int64
age                                 int64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [9]:
# convert avg_glucose_level columns to int

stroke_encoded['avg_glucose_level'] = stroke_encoded['avg_glucose_level'].astype('int')

# Check the datatypes of your columns.
stroke_encoded.dtypes

id                                  int64
age                                 int64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                   int64
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [10]:
# Identify incomplete rows
stroke_encoded.count()

id                                5110
age                               5110
hypertension                      5110
heart_disease                     5110
avg_glucose_level                 5110
bmi                               4909
stroke                            5110
gender_Female                     5110
gender_Male                       5110
gender_Other                      5110
ever_married_No                   5110
ever_married_Yes                  5110
work_type_Govt_job                5110
work_type_Never_worked            5110
work_type_Private                 5110
work_type_Self-employed           5110
work_type_children                5110
Residence_type_Rural              5110
Residence_type_Urban              5110
smoking_status_Unknown            5110
smoking_status_formerly smoked    5110
smoking_status_never smoked       5110
smoking_status_smokes             5110
dtype: int64

In [11]:
# Drop all rows with missing information
stroke_encoded = stroke_encoded.dropna(how='any')

In [12]:
# Verify dropped rows
stroke_encoded.count()

id                                4909
age                               4909
hypertension                      4909
heart_disease                     4909
avg_glucose_level                 4909
bmi                               4909
stroke                            4909
gender_Female                     4909
gender_Male                       4909
gender_Other                      4909
ever_married_No                   4909
ever_married_Yes                  4909
work_type_Govt_job                4909
work_type_Never_worked            4909
work_type_Private                 4909
work_type_Self-employed           4909
work_type_children                4909
Residence_type_Rural              4909
Residence_type_Urban              4909
smoking_status_Unknown            4909
smoking_status_formerly smoked    4909
smoking_status_never smoked       4909
smoking_status_smokes             4909
dtype: int64

In [13]:
# convert bmi columns to int
stroke_encoded['bmi'] = stroke_encoded['bmi'].astype('int')


In [14]:
# Check the datatypes of your columns.
stroke_encoded.dtypes

id                                int64
age                               int64
hypertension                      int64
heart_disease                     int64
avg_glucose_level                 int64
bmi                               int64
stroke                            int64
gender_Female                     uint8
gender_Male                       uint8
gender_Other                      uint8
ever_married_No                   uint8
ever_married_Yes                  uint8
work_type_Govt_job                uint8
work_type_Never_worked            uint8
work_type_Private                 uint8
work_type_Self-employed           uint8
work_type_children                uint8
Residence_type_Rural              uint8
Residence_type_Urban              uint8
smoking_status_Unknown            uint8
smoking_status_formerly smoked    uint8
smoking_status_never smoked       uint8
smoking_status_smokes             uint8
dtype: object

In [15]:
# save stroke_encoded as csv file.
stroke_encoded.to_csv('resources/clean_stroke_df.csv')


In [16]:
# Display a statistical overview of the DataFrame
stroke_encoded.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,...,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,42.854145,0.091872,0.049501,104.816052,28.448564,0.042575,0.590141,0.409656,0.000204,...,0.004482,0.572622,0.157873,0.136688,0.492768,0.507232,0.302098,0.170503,0.377266,0.150132
std,20995.098457,22.575986,0.288875,0.216934,44.417384,7.843834,0.201917,0.491858,0.49182,0.014273,...,0.066801,0.494748,0.364659,0.343552,0.499999,0.499999,0.459214,0.376113,0.484752,0.357238
min,77.0,0.0,0.0,0.0,55.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18605.0,25.0,0.0,0.0,77.0,23.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37608.0,44.0,0.0,0.0,91.0,28.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,55220.0,60.0,0.0,0.0,113.0,33.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,72940.0,82.0,1.0,1.0,271.0,97.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
