# Cleaning "Stroke Prediction Dataset" (Parquet) with Pandas

## Import Dependencies

In [182]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import time

## Load and Read Parquet Dataset into Pandas DataFrame

In [183]:
%%time
#? Relative path to the Parquet file (gzip-compressed, judging by the file name)
Path = "1_parquet_conversion/stroke.parquet.gzip"

#? Read the Parquet file into a DataFrame
stroke_df = pd.read_parquet(Path)

Wall time: 15 ms


## Show DataFrame

In [184]:
# Peek at five random rows; random_state pins the draw so a rerun shows the same rows
stroke_df.sample(5, random_state=42)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1151,43268,Female,52.0,1,0,No,Private,Urban,73.0,25.2,smokes,0
1177,4683,Male,23.0,0,0,No,Private,Urban,115.98,22.3,never smoked,0
2747,63144,Male,17.0,0,0,No,Govt_job,Urban,123.04,29.6,never smoked,0
3537,45112,Male,40.0,0,0,No,Govt_job,Urban,197.11,23.9,never smoked,0
626,5353,Male,52.0,0,1,No,Private,Rural,101.5,31.2,smokes,0


## Show Data Types

In [185]:
# Column dtypes, non-null counts and memory usage of the freshly loaded frame
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Convert to the smallest datatype possible for each numeric column

In [186]:
# %%time 
#? FLOATS
# float_cols = stroke_df.select_dtypes(include=['float'])

# for cols in float_cols.columns:
#     stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'float')

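#! Disabled: after downcasting to float32 the values display with extra decimal places (most likely because float32 cannot store values such as 228.69 exactly), and np.round cannot bring them back to 1 or 2 decimals since the rounded result is still stored as float32.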

In [187]:
%%time
#? INTEGERS
int_cols = stroke_df.select_dtypes(include=['int'])

for cols in int_cols.columns:
    stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'integer')

Wall time: 6.98 ms


## Show Changed Data Types

In [188]:
# Re-inspect dtypes and memory usage after the integer downcast
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int32  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int8   
 4   heart_disease      5110 non-null   int8   
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int8   
dtypes: float64(3), int32(1), int8(3), object(5)
memory usage: 354.4+ KB


## Manually Convert 'age', 'avg_glucose_level', & 'bmi' columns to smallest datatype

In [189]:
%%time
# INTEGERS
stroke_df['age'] = stroke_df['age'].astype('int8')

# FLOATS
# stroke_df['avg_glucose_level'] = stroke_df['avg_glucose_level'].astype('float32')
# stroke_df['bmi'] = stroke_df['bmi'].astype('float32')

#! It doesn't work this way either!!

Wall time: 2 ms


In [190]:
# Re-inspect dtypes and memory usage after the 'age' conversion
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int32  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   int8   
 3   hypertension       5110 non-null   int8   
 4   heart_disease      5110 non-null   int8   
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int8   
dtypes: float64(2), int32(1), int8(4), object(5)
memory usage: 319.5+ KB


## What is the Shape of the DataFrame?

In [191]:
%%time
# (number of rows, number of columns)
stroke_df.shape

Wall time: 0 ns


(5110, 12)

## Find Missing Values

### Count Method

In [192]:
%%time
# Non-null count per column; a count below the row total marks missing values
stroke_df.count()

Wall time: 7.98 ms


id                   5110
gender               5110
age                  5110
hypertension         5110
heart_disease        5110
ever_married         5110
work_type            5110
Residence_type       5110
avg_glucose_level    5110
bmi                  4909
smoking_status       5110
stroke               5110
dtype: int64

### isnull Method for 'bmi' column

In [193]:
%%time
stroke_isnull_df = stroke_df['bmi'].isnull().sum()
stroke_isnull_df

Wall time: 996 µs


201

## If I dropped all these rows, what percentage of the data would be lost?

In [194]:
%%time
# take the average of the DataFrame isna(){bool}
# Round to 4 numbers
# Multiply by 100 to get %
column_missing_percent = stroke_df['bmi'].isna().mean().round(4) * 100
column_missing_percent
print('------------------------------------------------------')
print(f'The # of missing values in the "bmi" column is {stroke_isnull_df}')
print('--------------------------AND-------------------------')
print(f'{column_missing_percent}% of the column is missing values')

------------------------------------------------------
The # of missing values in the "bmi" column is 201
--------------------------AND-------------------------
3.93% of the column is missing values
Wall time: 1.99 ms


# Is this an acceptable percentage of ROWS to delete?

In [195]:
%%time
#! TODO: decide with the group (or justify statistically) whether losing about 3.93% of the rows is acceptable.

Wall time: 0 ns


## Delete Missing Rows?

In [196]:
#stroke_df = stroke_df.dropna(subset=['bmi'])

## Do we want to delete any Columns? (I don't think so)

In [197]:
%% time
#! I think we can delete the ID column if we reset the index to be the new ID column and start it at "1"

UsageError: Cell magic `%%` not found.


In [198]:
# Display the frame (pandas abbreviates a long frame to its first and last rows)
stroke_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


## Rename columns before resetting index to new "ID" column

In [199]:
%%time
# Rename the index to be the "ID" column before resetting the index
stroke_df.columns = ['ID',
                    'Gender',
                    'Age',
                    'Hypertension',
                    'Heart Disease',
                    'Ever Married',
                    'Work Type',
                    'Residence Type',
                    'Avg Glucose Lvl',
                    'BMI',
                    'Smoking Status',
                    'Stroke',
                    ]

Wall time: 0 ns


## Set the index to start at '1' and set as the new 'ID' column

In [200]:
%%time
stroke_df['ID'] = stroke_df.index + 1

Wall time: 998 µs


In [201]:
%%time
# Reset Index
stroke_df = stroke_df.set_index('ID')

Wall time: 2.99 ms


In [202]:
# Display the frame, now indexed by 'ID' (pandas abbreviates it to the first and last rows)
stroke_df

Unnamed: 0_level_0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Avg Glucose Lvl,BMI,Smoking Status,Stroke
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
3,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
4,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
5,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5106,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0
5107,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5108,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5109,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Reorder Columns???

In [203]:
%%time
# List column names
stroke_df.columns

Wall time: 0 ns


Index(['Gender', 'Age', 'Hypertension', 'Heart Disease', 'Ever Married',
       'Work Type', 'Residence Type', 'Avg Glucose Lvl', 'BMI',
       'Smoking Status', 'Stroke'],
      dtype='object')

In [204]:
# %%time
# new_column_order = ['Gender',
#                     'Age',
#                     'Hypertension',
#                     'Heart Disease',
#                     'Ever Married',
#                     'Work Type',
#                     'Residence Type',
#                     'Avg Glucose Lvl',
#                     'BMI',
#                     'Smoking Status',
#                     'Stroke',
#                     ]

# stroke_df = stroke_df[new_column_order]