In [1]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

## Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the stroke dataset from the resources folder into a DataFrame.
stroke_df = pd.read_csv(filepath_or_buffer='../resources/healthcare-dataset-stroke-data.csv')

# Preview the first five rows.
stroke_df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Show the column labels of the DataFrame.
stroke_df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
# Count the non-null entries in each column; a column whose count is lower
# than the others contains missing values.
stroke_df.count()

id                   5110
gender               5110
age                  5110
hypertension         5110
heart_disease        5110
ever_married         5110
work_type            5110
Residence_type       5110
avg_glucose_level    5110
bmi                  4909
smoking_status       5110
stroke               5110
dtype: int64

## Remove all Null Values

In [5]:
# Discard every row that has at least one missing value.
stroke_df = stroke_df.dropna(axis='index', how='any')

In [6]:
# Re-count the non-null entries per column to verify the drop: every column
# should now report the same count.
stroke_df.count()

id                   4909
gender               4909
age                  4909
hypertension         4909
heart_disease        4909
ever_married         4909
work_type            4909
Residence_type       4909
avg_glucose_level    4909
bmi                  4909
smoking_status       4909
stroke               4909
dtype: int64

In [7]:
# Inspect the dtype of each column before any type conversion.
stroke_df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [8]:
# List the distinct values that occur in the gender column.
stroke_df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

## Convert the age, avg_glucose_level, and bmi Columns to int


In [9]:
# Convert the age column to whole numbers.
# A bare astype('int') truncates toward zero (e.g. 0.9 -> 0), which biases the
# values downward, so round to the nearest integer first. Series.round() sends
# exact halves to the nearest even value. 'int64' is spelled out because plain
# 'int' maps to a platform-dependent width.
stroke_df['age'] = stroke_df['age'].round().astype('int64')

# Confirm that age is now stored as an integer dtype.
stroke_df.dtypes

id                     int64
gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [10]:
# Convert the avg_glucose_level column to whole numbers.
# A bare astype('int') truncates toward zero (e.g. 228.69 -> 228), so round to
# the nearest integer first. Series.round() sends exact halves to the nearest
# even value. 'int64' is spelled out because plain 'int' maps to a
# platform-dependent width.
stroke_df['avg_glucose_level'] = stroke_df['avg_glucose_level'].round().astype('int64')

In [11]:
# Confirm that avg_glucose_level is now stored as an integer dtype.
stroke_df.dtypes

id                     int64
gender                object
age                    int64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level      int64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [12]:
# Convert the bmi column to whole numbers.
# A bare astype('int') truncates toward zero (e.g. 36.6 -> 36), so round to the
# nearest integer first. Series.round() sends exact halves to the nearest even
# value. 'int64' is spelled out because plain 'int' maps to a platform-dependent
# width.
stroke_df['bmi'] = stroke_df['bmi'].round().astype('int64')


In [13]:
# Confirm that bmi is now stored as an integer dtype.
stroke_df.dtypes

id                    int64
gender               object
age                   int64
hypertension          int64
heart_disease         int64
ever_married         object
work_type            object
Residence_type       object
avg_glucose_level     int64
bmi                   int64
smoking_status       object
stroke                int64
dtype: object

In [14]:
# Persist the cleaned DataFrame to disk as a CSV, leaving out the row index.
stroke_df.to_csv(path_or_buf='../resources/clean_strings_stroke_df.csv', index=False)
