# Cleaning "Stroke Prediction Dataset" (Parquet) with Pandas

## Import Dependencies

In [None]:
# Load the "autotime" IPython extension (presumably reports each cell's run time — confirm it is installed)
%load_ext autotime
import time
#* BEGINS TIMER: record the start time; the final cell subtracts it to print the total runtime
start = time.time()

import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

## Load and Read Parquet Dataset into Pandas DataFrame

In [None]:
%%time
#? Path of the source Parquet file (relative to the notebook's working directory)
Path = "1_parquet_conversion/stroke.parquet.gzip"

#? Read the Parquet file into a DataFrame
stroke_df = pd.read_parquet(Path)

## Show DataFrame

In [None]:
# Preview 5 randomly selected rows (no random_state is set, so the rows differ on every run)
stroke_df.sample(5)

## Show Data Types

In [None]:
# Summarise column dtypes, non-null counts and memory usage before any conversion
stroke_df.info()

## Convert to the smallest datatype possible for each numeric column

In [None]:
#? FLOATS
# float_cols = stroke_df.select_dtypes(include=['float'])

# for cols in float_cols.columns:
#     stroke_df[cols] = pd.to_numeric(stroke_df[cols], downcast = 'float')

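#! I commented out the above because downcasting the float columns made their values display with extra decimal places (likely because float32 cannot represent those decimals exactly), and I couldn't use "np.round" to round them back to 1 or 2 decimal places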

In [None]:
#? INTEGERS
# Select the integer-typed columns, then write each one back to the main
# DataFrame using the narrowest integer dtype that can hold its values.
int_cols = stroke_df.select_dtypes(include=['int'])

# Iterating a DataFrame yields its column labels.
for cols in int_cols:
    stroke_df[cols] = pd.to_numeric(int_cols[cols], downcast='integer')

## Show Changed Data Types

In [None]:
# Re-check dtypes and memory usage after downcasting the integer columns
stroke_df.info()

## Manually Convert 'age', 'avg_glucose_level', & 'bmi' columns to smallest datatype

In [None]:
# INTEGERS
# NOTE(review): astype('int8') discards any fractional part and int8 only holds -128..127 —
# confirm 'age' contains only whole numbers in that range before relying on this cast
stroke_df['age'] = stroke_df['age'].astype('int8')

# FLOATS
# stroke_df['avg_glucose_level'] = stroke_df['avg_glucose_level'].astype('float32')
# stroke_df['bmi'] = stroke_df['bmi'].astype('float32')

#! The float32 casts above are left disabled: it doesn't work this way either!!

In [None]:
# Confirm that 'age' is now stored as int8
stroke_df.info()

## What is the Shape of the DataFrame?

In [None]:
# (number of rows, number of columns)
stroke_df.shape

## Find Missing Values

### Count Method

In [None]:
# Non-null count per column; a column whose count is below the row count has missing values
stroke_df.count()

### isnull Method for 'bmi' column

In [None]:
# Number of missing 'bmi' values (despite the "_df" suffix this is a single count, not a DataFrame);
# it is reused by the summary printed in the next code cell
stroke_isnull_df = stroke_df['bmi'].isnull().sum()
stroke_isnull_df

## If I dropped all these rows, what percentage of the data would be lost?

In [None]:
# Percentage of rows whose 'bmi' is missing.
# The mean of the boolean isna() mask is the missing fraction. Scale it to a
# percentage first and round last: rounding the fraction and then multiplying
# by 100 can reintroduce floating-point noise into the printed value.
column_missing_percent = round(stroke_df['bmi'].isna().mean() * 100, 2)

# stroke_isnull_df is the missing-value count computed in an earlier cell.
print('------------------------------------------------------')
print(f'The # of missing values in the "bmi" column is {stroke_isnull_df}')
print('--------------------------AND-------------------------')
print(f'{column_missing_percent}% of the column is missing values')

# Is this an acceptable percentage of ROWS to delete?

In [None]:
#! ASK THE GROUP!!! or USE YOUR STATS SKILLS!!!

## Delete Missing Rows?

In [None]:
#stroke_df = stroke_df.dropna(subset=['bmi'])

## Do we want to delete any Columns? (I don't think so)

In [None]:
#! I think we can delete the ID column if we reset the index to be the new ID column and start it at "1"

In [None]:
# stroke_df

## Rename columns before resetting index to new "ID" column

In [None]:
# Replace every column label, by position, with a presentation-friendly name.
# The list needs exactly one entry per existing column, in the existing column order.
renamed_columns = [
    'ID',
    'Gender',
    'Age',
    'Hypertension',
    'Heart Disease',
    'Ever Married',
    'Work Type',
    'Residence Type',
    'Avg Glucose Lvl',
    'BMI',
    'Smoker',
    'Stroke',
]
stroke_df.columns = renamed_columns

## Set the index to start at '1' and set as the new 'ID' column

In [None]:
# Overwrite the 'ID' column with a 1-based counter derived from the current index
# (assumes the index is still the default 0..n-1 range — TODO confirm)
stroke_df['ID'] = stroke_df.index + 1

In [None]:
# Make 'ID' the index (this moves it out of the regular columns)
stroke_df = stroke_df.set_index('ID')

In [None]:
# stroke_df

# Reorder Columns???

In [None]:
# List column names ('ID' is now the index, so it is not among them)
stroke_df.columns

In [None]:
# %%time
# new_column_order = ['Gender',
#                     'Age',
#                     'Hypertension',
#                     'Heart Disease',
#                     'Ever Married',
#                     'Work Type',
#                     'Residence Type',
#                     'Avg Glucose Lvl',
#                     'BMI',
#                     'Smoker',
#                     'Stroke',
#                     ]

# stroke_df = stroke_df[new_column_order]

## Change responses for "Work Type" & "Smoker" 

### Smoker

In [None]:
# Distinct 'Smoker' values, to see which labels need replacing
stroke_df['Smoker'].unique()

In [None]:
# Shorten the 'Smoker' labels in a single pass; only the 'Smoker' column is
# searched, and values without an entry in the mapping are left unchanged.
smoker_labels = {
    'formerly smoked': "Former",
    'never smoked': "Never",
    'smokes': "Current",
}
stroke_df = stroke_df.replace({"Smoker": smoker_labels})

### Work Type

In [None]:
# Distinct 'Work Type' values, to see which labels need replacing
stroke_df['Work Type'].unique()

In [None]:
# Tidy the 'Work Type' labels in a single pass; only the 'Work Type' column is
# searched, and values without an entry in the mapping are left unchanged.
work_type_labels = {
    'Govt_job': "Government",
    'children': "Child",
    'Self-employed': "Self-Employed",
    'Never_worked': "Never Worked",
}
stroke_df = stroke_df.replace({"Work Type": work_type_labels})

# Display Cleaned DataFrame

In [None]:
# Show the cleaned DataFrame (rendered because it is the last expression in the cell)
stroke_df

# Huzzah!

# Export Parquet and CSV Files to Resources Folder

In [None]:
# Write the cleaned DataFrame to the shared Resources folder as Parquet and CSV.
# The directory is built with os.path.join instead of a hard-coded backslash
# string: the old literal contained invalid escape sequences ("\.", "\R", "\C",
# "\c") and only resolved to the intended folder on Windows. On Windows the
# joined path is identical to the old one.
output_dir = os.path.join('..', '..', 'Resources', 'Cleaned_Dataset')

# NOTE(review): index=False means the 'ID' index built earlier in this notebook
# is not written to either file — confirm that dropping it is intended.

# Export Parquet File
stroke_df.to_parquet(os.path.join(output_dir, 'clean_stroke.parquet.gzip'), compression='gzip', index=False)

# Export CSV File
stroke_df.to_csv(os.path.join(output_dir, 'clean_stroke.csv'), index=False)


#* ENDS TIMER
# Print the total wall-clock time elapsed since `start` was recorded in the first cell
end = time.time()
print(f'{end - start:.2f} seconds')