# Cleaning "Stroke Prediction Dataset" (Parquet) with Pandas

## Import Dependencies

In [1]:
%load_ext autotime
import time
start = time.time()
#* BEGINS TIMER ^^

import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

time: 2.91 s (started: 2022-03-15 23:43:24 -05:00)


## Load and Read Parquet Dataset into Pandas DataFrame

In [2]:
%%time
#? Load File
Path = "1_parquet_conversion/stroke.parquet.gzip"

#? Read the CSVs into a dataframe
stroke_df = pd.read_parquet(Path)

Wall time: 112 ms
time: 109 ms (started: 2022-03-15 23:43:27 -05:00)


## Show DataFrame

In [3]:
stroke_df.sample(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
4577,12668,Male,68.0,0,0,Yes,Self-employed,Urban,195.43,28.9,never smoked,0
4353,35999,Female,52.0,0,0,Yes,Private,Urban,86.85,23.8,formerly smoked,0
2978,2218,Male,42.0,0,0,Yes,Private,Rural,107.83,35.3,smokes,0
3545,71097,Female,23.0,0,0,No,Private,Urban,64.94,18.8,never smoked,0
3735,27176,Female,69.0,0,0,Yes,Private,Rural,103.73,34.7,never smoked,0


time: 32 ms (started: 2022-03-15 23:43:28 -05:00)


## Show Data Types

In [4]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
time: 16 ms (started: 2022-03-15 23:43:28 -05:00)


## Convert to the smallest datatype possible for each numeric column

In [5]:
#? FLOATS
# float_cols = stroke_df.select_dtypes(include=['float'])

# for cols in float_cols.columns:
#     stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'float')

#! I commented out the above because it was causing the decimal places to extend and I couldn't use "np.round" to round back to 1 or 2 decimal places

time: 0 ns (started: 2022-03-15 23:43:28 -05:00)


In [6]:
#? INTEGERS
int_cols = stroke_df.select_dtypes(include=['int'])

for cols in int_cols.columns:
    stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'integer')

time: 15 ms (started: 2022-03-15 23:43:28 -05:00)


## Show Changed Data Types

In [7]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int32  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int8   
 4   heart_disease      5110 non-null   int8   
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int8   
dtypes: float64(3), int32(1), int8(3), object(5)
memory usage: 354.4+ KB
time: 15 ms (started: 2022-03-15 23:43:29 -05:00)


## Manully Convert 'age', 'avg_glucose_level', & 'bmi' columns to smallest datatype

In [8]:
# INTEGERS
stroke_df['age'] = stroke_df['age'].astype('int8')

# FLOATS
# stroke_df['avg_glucose_level'] = stroke_df['avg_glucose_level'].astype('float32')
# stroke_df['bmi'] = stroke_df['bmi'].astype('float32')

#! It doesn't work this way either!!

time: 0 ns (started: 2022-03-15 23:43:29 -05:00)


In [9]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int32  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   int8   
 3   hypertension       5110 non-null   int8   
 4   heart_disease      5110 non-null   int8   
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int8   
dtypes: float64(2), int32(1), int8(4), object(5)
memory usage: 319.5+ KB
time: 31 ms (started: 2022-03-15 23:43:29 -05:00)


## What is the Shape of the DataFrame?

In [10]:
stroke_df.shape

(5110, 12)

time: 0 ns (started: 2022-03-15 23:43:30 -05:00)


## Find Missing Values

### Count Method

In [11]:
stroke_df.count()

id                   5110
gender               5110
age                  5110
hypertension         5110
heart_disease        5110
ever_married         5110
work_type            5110
Residence_type       5110
avg_glucose_level    5110
bmi                  4909
smoking_status       5110
stroke               5110
dtype: int64

time: 16 ms (started: 2022-03-15 23:43:30 -05:00)


### isnull Method for 'bmi' column

In [12]:
stroke_isnull_df = stroke_df['bmi'].isnull().sum()
stroke_isnull_df

201

time: 0 ns (started: 2022-03-15 23:43:30 -05:00)


## If I dropped all these rows, what percentage of the data would be lost?

In [13]:
# take the average of the DataFrame isna(){bool}
# Round to 4 numbers
# Multiply by 100 to get %
column_missing_percent = stroke_df['bmi'].isna().mean().round(4) * 100
column_missing_percent
print('------------------------------------------------------')
print(f'The # of missing values in the "bmi" column is {stroke_isnull_df}')
print('--------------------------AND-------------------------')
print(f'{column_missing_percent}% of the column is missing values')

------------------------------------------------------
The # of missing values in the "bmi" column is 201
--------------------------AND-------------------------
3.93% of the column is missing values
time: 0 ns (started: 2022-03-15 23:43:30 -05:00)


# Is this an acceptable percentage of ROWS to delete?

In [14]:
#! ASK THE GROUP!!! or USE YOUR STATS SKILLS!!!

time: 0 ns (started: 2022-03-15 23:43:30 -05:00)


# We decided to Have two datasets... One with BMI and One without... BMI values are removed right before export

## Delete Missing Rows?

In [15]:
#Delete rows where Gender is "Other"

stroke_df = stroke_df[stroke_df.gender != "Other"]

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


## Do we want to delete any Columns? (I don't think so)

In [16]:
#! I think we can delete the ID column if we reset the index to be the new ID column and start it at "1"

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


In [17]:
# stroke_df

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


## Rename columns before resetting index to new "ID" column

In [18]:
# Rename the index to be the "ID" column before resetting the index
stroke_df.columns = ['ID',
                    'Gender',
                    'Age',
                    'Hypertension',
                    'Heart_Disease',
                    'Ever_Married',
                    'Work_Type',
                    'Residence_Type',
                    'Avg_Glucose_Lvl',
                    'BMI',
                    'Smoker',
                    'Stroke',
                    ]

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


## Set the index to start at '1' and set as the new 'ID' column

In [19]:
stroke_df['ID'] = stroke_df.index + 1

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


In [20]:
# Reset Index
stroke_df = stroke_df.set_index('ID')

time: 0 ns (started: 2022-03-15 23:43:31 -05:00)


In [21]:
stroke_df['ID#'] = stroke_df.index 
stroke_df

Unnamed: 0_level_0,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Avg_Glucose_Lvl,BMI,Smoker,Stroke,ID#
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,1
2,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,2
3,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,3
4,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,4
5,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
5106,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0,5106
5107,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0,5107
5108,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0,5108
5109,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0,5109


time: 47 ms (started: 2022-03-15 23:43:32 -05:00)


# Reorder Columns???

In [22]:
# List column names
stroke_df.columns

Index(['Gender', 'Age', 'Hypertension', 'Heart_Disease', 'Ever_Married',
       'Work_Type', 'Residence_Type', 'Avg_Glucose_Lvl', 'BMI', 'Smoker',
       'Stroke', 'ID#'],
      dtype='object')

time: 0 ns (started: 2022-03-15 23:43:32 -05:00)


In [23]:
new_column_order = ['ID#',
                    'Gender',
                    'Age',
                    'Hypertension',
                    'Heart_Disease',
                    'Ever_Married',
                    'Work_Type',
                    'Residence_Type',
                    'Avg_Glucose_Lvl',
                    'BMI',
                    'Smoker',
                    'Stroke',
                    ]

stroke_df = stroke_df[new_column_order]

time: 16 ms (started: 2022-03-15 23:43:32 -05:00)


## Change responses for "Work Type" & "Smoker" 

### Smoker

In [24]:
stroke_df['Smoker'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

time: 0 ns (started: 2022-03-15 23:43:32 -05:00)


In [25]:
stroke_df= stroke_df.replace({"Smoker":'formerly smoked'}, "Former")
stroke_df= stroke_df.replace({"Smoker": 'never smoked'}, "Never")
stroke_df= stroke_df.replace({"Smoker":'smokes'}, "Current")          

time: 0 ns (started: 2022-03-15 23:43:32 -05:00)


### Work Type

In [26]:
# stroke_df['Work Type'].unique()

time: 0 ns (started: 2022-03-15 23:43:32 -05:00)


In [27]:
stroke_df= stroke_df.replace({"Work Type":'Govt_job'}, "Government")
stroke_df= stroke_df.replace({"Work Type": 'children'}, "Child")
stroke_df= stroke_df.replace({"Work Type":'Self-employed'}, "Self-Employed")
stroke_df= stroke_df.replace({"Work Type":'Never_worked'}, "Never Worked")

time: 0 ns (started: 2022-03-15 23:43:33 -05:00)


# Drop the BMI Rows...

In [28]:
stroke_no_bmi = stroke_df.dropna(subset=['BMI'])

# Export Parquet File
stroke_no_bmi.to_parquet('..\..\Resources\Cleaned_Dataset\Parquet\clean_stroke_bmi_removed.parquet.gzip', compression='gzip', index=False)

# Export CSV File
stroke_no_bmi.to_csv('..\..\Resources\Cleaned_Dataset\clean_stroke_bmi_removed.csv', index=False)

time: 125 ms (started: 2022-03-15 23:43:33 -05:00)


# Display Cleaned DataFrame

In [29]:
stroke_df

Unnamed: 0_level_0,ID#,Gender,Age,Hypertension,Heart_Disease,Ever_Married,Work_Type,Residence_Type,Avg_Glucose_Lvl,BMI,Smoker,Stroke
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,Male,67,0,1,Yes,Private,Urban,228.69,36.6,Former,1
2,2,Female,61,0,0,Yes,Self-employed,Rural,202.21,,Never,1
3,3,Male,80,0,1,Yes,Private,Rural,105.92,32.5,Never,1
4,4,Female,49,0,0,Yes,Private,Urban,171.23,34.4,Current,1
5,5,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,Never,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5106,5106,Female,80,1,0,Yes,Private,Urban,83.75,,Never,0
5107,5107,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,Never,0
5108,5108,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,Never,0
5109,5109,Male,51,0,0,Yes,Private,Rural,166.29,25.6,Former,0


time: 47 ms (started: 2022-03-15 23:43:33 -05:00)


# Huzzah!

# Export Parquet File to Resources Folder

In [30]:
# Export Parquet File
stroke_df.to_parquet('..\..\Resources\Cleaned_Dataset\Parquet\clean_stroke.parquet.gzip', compression='gzip', index=False)

# Export CSV File
stroke_df.to_csv('..\..\Resources\Cleaned_Dataset\clean_stroke.csv', index=False)


#* ENDS TIMER
end = time.time()
print(f'{end - start:.2f} seconds')

9.42 seconds
time: 296 ms (started: 2022-03-15 23:43:33 -05:00)


## Create Bins based on "Age"

In [31]:
# 0 - 20
gen_z_stroke = stroke_df[(stroke_df['Age'] <= 20)]
# 20 - 40
gen_y_stroke = stroke_df[(stroke_df['Age'] > 20) & (stroke_df['Age'] <= 40)]
# 40 - 60
gen_x_stroke = stroke_df[(stroke_df['Age'] > 40) & (stroke_df['Age'] <= 60)]
# 60 - 80
boomer_stroke = stroke_df[(stroke_df['Age'] > 60) & (stroke_df['Age'] <= 80)]
# 80 - 100
greatest_gen_stroke = stroke_df[(stroke_df['Age'] > 80) & (stroke_df['Age'] <= 100)]


time: 15 ms (started: 2022-03-15 23:43:35 -05:00)


In [32]:
# gen_z_stroke
# gen_y_stroke
# gen_x_stroke
# boomer_stroke
# greatest_gen_stroke

time: 0 ns (started: 2022-03-15 23:43:35 -05:00)


In [33]:
# Export Parquet File
gen_z_stroke.to_parquet('..\..\Resources\Age_Datasets\Parquet\gen_Z_stroke.parquet.gzip', compression='gzip', index=False)
gen_y_stroke.to_parquet('..\..\Resources\Age_Datasets\Parquet\gen_Y_stroke.parquet.gzip', compression='gzip', index=False)
gen_x_stroke.to_parquet('..\..\Resources\Age_Datasets\Parquet\gen_X_stroke.parquet.gzip', compression='gzip', index=False)
boomer_stroke.to_parquet('..\..\Resources\Age_Datasets\Parquet\Boomer_stroke.parquet.gzip', compression='gzip', index=False)
greatest_gen_stroke.to_parquet('..\..\Resources\Age_Datasets\Parquet\greatest_gen_stroke.parquet.gzip', compression='gzip', index=False)

# Export CSV File
gen_z_stroke.to_csv('..\..\Resources\Age_Datasets\gen_Z_stroke.csv', index=False)
gen_y_stroke.to_csv('..\..\Resources\Age_Datasets\gen_Y_stroke.csv', index=False)
gen_x_stroke.to_csv('..\..\Resources\Age_Datasets\gen_X_stroke.csv', index=False)
boomer_stroke.to_csv('..\..\Resources\Age_Datasets\Boomer_stroke.csv', index=False)
greatest_gen_stroke.to_csv('..\..\Resources\Age_Datasets\greatest_gen_stroke.csv', index=False)

time: 718 ms (started: 2022-03-15 23:43:36 -05:00)


## Create Bins based on "BMI"

In [34]:
# 0 - 18.5
underweight = stroke_df[(stroke_df['BMI'] < 18.5)]
# 18.5 - 25
healthy_weight = stroke_df[(stroke_df['BMI'] >= 18.5) & (stroke_df['BMI'] < 25)]
# 25 - 30
overweight = stroke_df[(stroke_df['BMI'] >= 25) & (stroke_df['BMI'] < 30)]
# 30 - 35
obese_low_risk = stroke_df[(stroke_df['BMI'] >= 30) & (stroke_df['BMI'] < 35)]
# 35 - 40
obese_medium_risk = stroke_df[(stroke_df['BMI'] >= 35) & (stroke_df['BMI'] < 40)]
# 40 +
obese_high_risk = stroke_df[(stroke_df['BMI'] >= 40) & (stroke_df['BMI'] < 100)]

time: 15 ms (started: 2022-03-15 23:43:37 -05:00)


In [35]:
# Export Parquet File
underweight.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\underweight.parquet.gzip', compression='gzip', index=False)
healthy_weight.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\healthy_weight.parquet.gzip', compression='gzip', index=False)
overweight.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\overweight.parquet.gzip', compression='gzip', index=False)
obese_low_risk.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\obese_low_risk.parquet.gzip', compression='gzip', index=False)
obese_medium_risk.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\obese_medium_risk.parquet.gzip', compression='gzip', index=False)
obese_high_risk.to_parquet(r'..\..\Resources\BMI_Datasets\Parquet\obese_high_risk.parquet.gzip', compression='gzip', index=False)

# Export CSV File
underweight.to_csv(r'..\..\Resources\BMI_Datasets\underweight.csv', index=False)
healthy_weight.to_csv(r'..\..\Resources\BMI_Datasets\healthy_weight.csv', index=False)
overweight.to_csv(r'..\..\Resources\BMI_Datasets\overweight.csv', index=False)
obese_low_risk.to_csv(r'..\..\Resources\BMI_Datasets\obese_low_risk.csv', index=False)
obese_medium_risk.to_csv(r'..\..\Resources\BMI_Datasets\obese_medium_risk.csv', index=False)
obese_high_risk.to_csv(r'..\..\Resources\BMI_Datasets\obese_high_risk.csv', index=False)

time: 781 ms (started: 2022-03-15 23:43:37 -05:00)


## Create bins based on Glucose Levels

In [36]:
# 0 - 100
normal_glucose = stroke_df[(stroke_df['Avg_Glucose_Lvl'] <= 100)]
# 100 - 125
prediabetic_glucose = stroke_df[(stroke_df['Avg_Glucose_Lvl'] > 100) & (stroke_df['Avg_Glucose_Lvl'] <= 125)]
# 125 +
diabetic_glucose = stroke_df[(stroke_df['Avg_Glucose_Lvl'] > 125)]

time: 16 ms (started: 2022-03-15 23:43:38 -05:00)


In [37]:
# Export Parquet File
normal_glucose.to_parquet(r'..\..\Resources\Glucose_Datasets\Parquet\normal_glucose.parquet.gzip', compression='gzip', index=False)
prediabetic_glucose.to_parquet(r'..\..\Resources\Glucose_Datasets\Parquet\prediabetic_glucose.parquet.gzip', compression='gzip', index=False)
diabetic_glucose.to_parquet(r'..\..\Resources\Glucose_Datasets\Parquet\diabetic_glucose.parquet.gzip', compression='gzip', index=False)

# Export CSV File
normal_glucose.to_csv(r'..\..\Resources\Glucose_Datasets\normal_glucose.csv', index=False)
prediabetic_glucose.to_csv(r'..\..\Resources\Glucose_Datasets\prediabetic_glucose.csv', index=False)
diabetic_glucose.to_csv(r'..\..\Resources\Glucose_Datasets\diabetic_glucose.csv', index=False)

time: 359 ms (started: 2022-03-15 23:43:38 -05:00)
