In [1]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Read the CSV and Perform Basic Data Cleaning

In [2]:
# Create a DataFrame for the healthcare-dataset-stroke-data.csv. 
path = Path('../resources/healthcare-dataset-stroke-data.csv')
stroke_df = pd.read_csv(path)
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
#  Get a list of all columns within the DataFrame.
stroke_df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
# Check the datatypes of your columns and other dataframe features.
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# remove unnecessary columns that would not contribute to machine learning model
stroke_df = stroke_df.drop(labels="id", axis=1)
stroke_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [13]:
# number of unique values in each column
print(stroke_df.nunique())
print('--------------------')
print(stroke_df.dtypes)

gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64
--------------------
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [11]:
# Identify all the unique values in the gender column
stroke_df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [14]:
# Convert these values into numerical ones with the pd.get_dummies() method
stroke_encoded = pd.get_dummies(stroke_df, columns=["gender", "ever_married", "work_type", "Residence_type", "smoking_status"])
stroke_encoded.head()


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0


In [15]:
# Check the datatypes of your columns.
stroke_encoded.dtypes

age                               float64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

## convert age, avg_glucose_level, and bmi columns to int


In [17]:
# convert age columns to int
stroke_encoded['age'] = stroke_encoded['age'].astype('int64')

# Check the datatypes of your columns.
stroke_encoded.dtypes

age                                 int64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [18]:
# convert avg_glucose_level columns to int

stroke_encoded['avg_glucose_level'] = stroke_encoded['avg_glucose_level'].astype('int')

# Check the datatypes of your columns.
stroke_encoded.dtypes

age                                 int64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                   int32
bmi                               float64
stroke                              int64
gender_Female                       uint8
gender_Male                         uint8
gender_Other                        uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Never_worked              uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [19]:
# Identify incomplete rows
stroke_encoded.count()

age                               5110
hypertension                      5110
heart_disease                     5110
avg_glucose_level                 5110
bmi                               4909
stroke                            5110
gender_Female                     5110
gender_Male                       5110
gender_Other                      5110
ever_married_No                   5110
ever_married_Yes                  5110
work_type_Govt_job                5110
work_type_Never_worked            5110
work_type_Private                 5110
work_type_Self-employed           5110
work_type_children                5110
Residence_type_Rural              5110
Residence_type_Urban              5110
smoking_status_Unknown            5110
smoking_status_formerly smoked    5110
smoking_status_never smoked       5110
smoking_status_smokes             5110
dtype: int64

In [20]:
# Drop all rows with missing information
stroke_encoded = stroke_encoded.dropna(how='any')

In [21]:
# Verify dropped rows
stroke_encoded.count()

age                               4909
hypertension                      4909
heart_disease                     4909
avg_glucose_level                 4909
bmi                               4909
stroke                            4909
gender_Female                     4909
gender_Male                       4909
gender_Other                      4909
ever_married_No                   4909
ever_married_Yes                  4909
work_type_Govt_job                4909
work_type_Never_worked            4909
work_type_Private                 4909
work_type_Self-employed           4909
work_type_children                4909
Residence_type_Rural              4909
Residence_type_Urban              4909
smoking_status_Unknown            4909
smoking_status_formerly smoked    4909
smoking_status_never smoked       4909
smoking_status_smokes             4909
dtype: int64

In [22]:
# convert bmi columns to int
stroke_encoded['bmi'] = stroke_encoded['bmi'].astype('int')

In [23]:
# Check the datatypes of your columns.
stroke_encoded.dtypes

age                               int64
hypertension                      int64
heart_disease                     int64
avg_glucose_level                 int32
bmi                               int32
stroke                            int64
gender_Female                     uint8
gender_Male                       uint8
gender_Other                      uint8
ever_married_No                   uint8
ever_married_Yes                  uint8
work_type_Govt_job                uint8
work_type_Never_worked            uint8
work_type_Private                 uint8
work_type_Self-employed           uint8
work_type_children                uint8
Residence_type_Rural              uint8
Residence_type_Urban              uint8
smoking_status_Unknown            uint8
smoking_status_formerly smoked    uint8
smoking_status_never smoked       uint8
smoking_status_smokes             uint8
dtype: object

In [None]:
# save stroke_encoded as csv file.
stroke_encoded.to_csv('resources/clean_stroke_df.csv', index=False)

In [None]:
# Display a statistical overview of the DataFrame
stroke_encoded.describe()