# Acquire

In [1]:
# imports
import pandas as pd
import numpy as np

import opendatasets as od
import os

from imblearn.over_sampling import SMOTE

In [2]:
od.download('https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?select=healthcare-dataset-stroke-data.csv')

Skipping, found downloaded files in "./stroke-prediction-dataset" (use force=True to force download)


In [3]:
# gettind data
df = pd.read_csv('stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [4]:
# getting the head
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
# function
def get_data_csv():
    '''
    Argument: No arguments required
    Note: kaggle api username and key are required to download the data
    Actions: 
        1. Checks for the existence of the csv
            a. if present:
                i. reads the csv from the current working directory
            b. if not present:
                i. downloads csv from kaggle api
    Return: dataframe
    Modules:
        1. import pandas as pd
        2. import os
        3. import opendatasets as od
    '''
    # a variable to hold the xpected or future file name
    filename = 'stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
    
    # if the file is present in the directory 
    if os.path.isfile(filename):
      
        # read the csv and assign it to the variable df
        df = pd.read_csv(filename)
        
        # return the dataframe and exit the funtion
        return df
    
    # if the file is not in the current working directory,
    else:
        
        # url needed to read from a csv
        url = 'https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset?select=healthcare-dataset-stroke-data.csv'
        
        # downloads the csv from kaggle after api key is entered
        od.download(url)
        
        # reads csv from url using pandas function
        df = pd.read_csv(filename)
        
        # returns the dataframe
        return df

#### Test the function

In [6]:
from wrangle import get_data_csv

In [7]:
df = get_data_csv()

# Prepare

In [8]:
from prepare_module import summarize

In [9]:
df.shape

(5110, 12)

In [10]:
# doing the initial inspection

summarize(df)

                    SUMMARY REPORT


Dataframe head: 


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1




Dataframe info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB




Dataframe Description: 


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5110.0,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
age,5110.0,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,4909.0,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6
stroke,5110.0,0.048728,0.21532,0.0,0.0,0.0,0.0,1.0


DataFrame value counts: 


Unnamed: 0,id
"(-5.8740000000000006, 7354.3]",539
"(7354.3, 14641.6]",518
"(14641.6, 21928.9]",472
"(21928.9, 29216.2]",501
"(29216.2, 36503.5]",487
"(36503.5, 43790.8]",516
"(43790.8, 51078.1]",535
"(51078.1, 58365.4]",535
"(58365.4, 65652.7]",495
"(65652.7, 72940.0]",512


Unnamed: 0,gender
Female,2994
Male,2115
Other,1


Unnamed: 0,age
"(-0.00292, 8.272]",434
"(8.272, 16.464]",362
"(16.464, 24.656]",440
"(24.656, 32.848]",484
"(32.848, 41.04]",597
"(41.04, 49.232]",583
"(49.232, 57.424]",686
"(57.424, 65.616]",559
"(65.616, 73.808]",407
"(73.808, 82.0]",558


Unnamed: 0,hypertension
"(-0.002, 0.1]",4612
"(0.1, 0.2]",0
"(0.2, 0.3]",0
"(0.3, 0.4]",0
"(0.4, 0.5]",0
"(0.5, 0.6]",0
"(0.6, 0.7]",0
"(0.7, 0.8]",0
"(0.8, 0.9]",0
"(0.9, 1.0]",498


Unnamed: 0,heart_disease
"(-0.002, 0.1]",4834
"(0.1, 0.2]",0
"(0.2, 0.3]",0
"(0.3, 0.4]",0
"(0.4, 0.5]",0
"(0.5, 0.6]",0
"(0.6, 0.7]",0
"(0.7, 0.8]",0
"(0.8, 0.9]",0
"(0.9, 1.0]",276


Unnamed: 0,ever_married
Yes,3353
No,1757


Unnamed: 0,work_type
Private,2925
Self-employed,819
children,687
Govt_job,657
Never_worked,22


Unnamed: 0,Residence_type
Urban,2596
Rural,2514


Unnamed: 0,avg_glucose_level
"(54.902, 76.782]",1250
"(76.782, 98.444]",1790
"(98.444, 120.106]",956
"(120.106, 141.768]",310
"(141.768, 163.43]",154
"(163.43, 185.092]",91
"(185.092, 206.754]",209
"(206.754, 228.416]",217
"(228.416, 250.078]",108
"(250.078, 271.74]",25


Unnamed: 0,bmi
"(10.212, 19.03]",411
"(19.03, 27.76]",1964
"(27.76, 36.49]",1809
"(36.49, 45.22]",575
"(45.22, 53.95]",105
"(53.95, 62.68]",37
"(62.68, 71.41]",4
"(71.41, 80.14]",2
"(80.14, 88.87]",0
"(88.87, 97.6]",2


Unnamed: 0,smoking_status
never smoked,1892
Unknown,1544
formerly smoked,885
smokes,789


Unnamed: 0,stroke
"(-0.002, 0.1]",4861
"(0.1, 0.2]",0
"(0.2, 0.3]",0
"(0.3, 0.4]",0
"(0.4, 0.5]",0
"(0.5, 0.6]",0
"(0.6, 0.7]",0
"(0.7, 0.8]",0
"(0.8, 0.9]",0
"(0.9, 1.0]",249


nulls in dataframe by column: 


Unnamed: 0,num_rows_missing,percent_rows_missing
bmi,201,3.933464
id,0,0.0
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0


nulls in dataframe by row: 


Unnamed: 0,num_cols_missing,percent_cols_missing
680,1,8.333333
183,1,8.333333
1235,1,8.333333
170,1,8.333333
171,1,8.333333
...,...,...
1754,0,0.000000
1752,0,0.000000
1751,0,0.000000
1750,0,0.000000




#### Takeaways:
> * **bmi** has roughly 200 null values, or 3% 
    * look into binning as well based on doctor reccommendations (bmi is problematic though)
> * **age** is a float - inspect the values there
    * Age minimum is 0.08?? We will need to check that value
    * Bin ages
> * Gender has more female than male, 1 other
> * Hypertension: most dont have
> * Heart disease: most dont have
> * Most are married
> * work_type: majority private
> * **Resdience_type** change capital, good split
> * **avg_glucose** - look into whats considered healthy or not - look into binning
    > 
> * **smoking_status** - many unknowns, we will have to look into these values as missing most likely -  convert unknowns to nan - 30% of smoking status is unknown/null
    * Reasons for unkowns: people dont like to look bad on surveys
    * social smoker was not an option
    * 

#### Actions:
> * Change Residence_type to residence_type
> * Look into the age category
> * Look into bmi categories 
> * Look into glucose categories
> * Drop smoking_status - 30% of the data is missing, during the mvp we can drop this. in later iterations, we can run through the know smoking status users and also run through the unknown smpking status users process

In [11]:
# 16 and under is considered children
df[df.work_type == 'children'].age.max()

16.0

In [12]:
(df == 'Unknown').sum()

id                      0
gender                  0
age                     0
hypertension            0
heart_disease           0
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                     0
smoking_status       1544
stroke                  0
dtype: int64

In [13]:
    num_missing = (df == 'Unknown').sum()
    
    # assigne the number of rows to a variable
    rows = df.shape[0]
    
    # calculates the percentage of the column that's missing
    percent_missing = num_missing / rows * 100
    
    # creates a dataframe using the actual number ofmissing values and the percetage of the column that is missing
    cols_missing = pd.DataFrame({'num_rows_missing': num_missing, 'percent_rows_missing': percent_missing})
    
    # returns the dataframe with the largest numbers first
    cols_missing.sort_values(by='num_rows_missing', ascending=False)

Unnamed: 0,num_rows_missing,percent_rows_missing
smoking_status,1544,30.215264
id,0,0.0
gender,0,0.0
age,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
work_type,0,0.0
Residence_type,0,0.0
avg_glucose_level,0,0.0


In [14]:
# roughly half of the people who have unknown smoking statuses are under 16 the other half are 
df[df.smoking_status == 'Unknown'].age.value_counts(bins=5)

(-0.00292, 16.464]    657
(32.848, 49.232]      246
(49.232, 65.616]      231
(16.464, 32.848]      230
(65.616, 82.0]        180
Name: age, dtype: int64

#### Future iteration:
> * Look into separating the dataset - working age vs not working age (16 and under/over 16)

In [15]:
## FUNCTION
# creating lower case column names
df.columns = df.columns.str.lower()

In [16]:
## FUNCTION
# droping smoking status
df.drop(['smoking_status', 'id'], axis=1, inplace=True)

In [17]:
## FUNCTION
# adding age bins
df['age_bins'] = pd.cut(df['age'], bins= [0, 9, 19, 29, 39, 49, 59, 69, 79, 89], 
      labels = ['under 10', 'teens', '20s', '30s', '40s', '50s', '60s', '70s', '80s']
      )

In [18]:
# FUNCTION
# add bmi bins
df['bmi_bins'] = pd.cut(df['bmi'], bins= 5)

https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html
https://onlinelibrary.wiley.com/doi/abs/10.1046/j.1467-789x.2001.00031.x

In [19]:
df.avg_glucose_level.describe()

count    5110.000000
mean      106.147677
std        45.283560
min        55.120000
25%        77.245000
50%        91.885000
75%       114.090000
max       271.740000
Name: avg_glucose_level, dtype: float64

In [20]:
## FUNCTION
df['glucose_bins'] = pd.cut(df['avg_glucose_level'], bins = [0, 70, 125, 300], labels=['low', 'average', 'high'])

https://my.clevelandclinic.org/health/diagnostics/12363-blood-glucose-test

In [21]:
## FUNCTION
df.dropna(inplace=True)

In [22]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,stroke,age_bins,bmi_bins,glucose_bins
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,1,60s,"(27.76, 45.22]",high
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,1,80s,"(27.76, 45.22]",average
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,1,40s,"(27.76, 45.22]",high
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,1,70s,"(10.213, 27.76]",high
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,1,80s,"(27.76, 45.22]",high


In [1]:
def clean_data():
    '''
    Arguments: none
    Action:
        1. Gets data
        2. Changes colmn names
        3. Drops 2 columns
        4. Drops nulls
    Returns: Clean df
    Modules:
        1. import pandas as pd
        2. from wrangle import get_data_csv
    '''
    # get data
    df = get_data_csv()
    
    # changing column names
    df.columns = df.columns.str.lower().str.strip()
    
    # dropping unneeded columns
    df.drop(['id'], axis=1, inplace=True)
    
    # dropping null values
    df.dropna(inplace=True)
    
    # exit df with returned values
    return df

In [24]:
def prepare_data(df, base_explore=True):
    '''
    Arguments: cleaned df, base_explore retains variables in a non-encoded format, useful for visualizations and exploration
    Actions:
        1. Creates a dataframe with only dummy variables, numerical variables, and the target
        2. Formats all the column titles for python usability
        3. Splits data into train validate, and test with straitification on target
    Return: train, validate, test
    Modules: pandas as pd
    '''
    
    # creating age bins based on decade
    df['age_bins'] = pd.cut(df['age'], bins= [0, 9, 19, 29, 39, 49, 59, 69, 79, 89], 
      labels = ['under 10', 'teens', '20s', '30s', '40s', '50s', '60s', '70s', '80s']
      )
    
    # creating glucose bins based on medical reccommendation
    df['glucose_bins'] = pd.cut(df['avg_glucose_level'], bins = [0, 70, 125, 300], labels=['low', 'average', 'high'])
    
    # creating bmi bins 
    df['bmi_bins'] = pd.cut(df['bmi'], bins= 5)
    
    # assigning a target
    target = 'stroke'
    
    # default argument fo base_explore is True
    if base_explore == True:

        # skip the encoding of the variables
        pass
    
    else:
        # Create list of object type/categorical columns
        df_objects = [col for col in df if df[col].dtype == 'O' and col != target]
        
        # Create dummy variables and add them to the df
        df = pd.concat([df, pd.get_dummies(df[df_objects], drop_first=True)], axis=1)
    
        # Create a list of all non-object variables and including the target
        num_cols = [col for col in df if df[col].dtype != 'O' or col == target]

        # creating a df with only the variables needed for exploring and modeling
        df = df[num_cols]
   
    return df

In [25]:
prepare_data(clean_data())

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,stroke,age_bins,glucose_bins,bmi_bins
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,1,60s,high,"(27.76, 45.22]"
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,1,80s,average,"(27.76, 45.22]"
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,1,40s,high,"(27.76, 45.22]"
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,1,70s,high,"(10.213, 27.76]"
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,1,80s,high,"(27.76, 45.22]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,0,teens,average,"(10.213, 27.76]"
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,0,80s,high,"(27.76, 45.22]"
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,0,30s,average,"(27.76, 45.22]"
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,0,50s,high,"(10.213, 27.76]"


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
def split_data(df, stratify_on='stroke'):
    '''
    Arguments: prepared dataframe, optional target - must be a string literal that is a column title
    Actions: 
        1. Splits the dataframe with 80% of the data assigned to tv and 20% assigned to test
        2. Splits the tv dataset with 70% of tv assigned to train and 30% assigned to validate
    Returns: 3 variables, each containing a portion
    Modules: 
        1. from sklearn.model_selection import train_test_split
        2. pandas as pd
    Note: Order matters with variable assignment
    '''
    
    # when the target is a string that is a column title
    if stratify_on in df.columns.to_list():
        # the data is split 80/20 with the target used for stratification
        train_validate, test = train_test_split(df, train_size=.8, random_state = 1017,
                stratify = df[stratify_on])
        
         # splitting train_validate 70/30 with the target used for stratification
        train, validate = train_test_split(train_validate, train_size=.7, stratify=train_validate[stratify_on])
    # for all other targets
    else:
        # inform user that there is no stratification
        print('No stratification applied during the split')
        
        # split that data 80/20
        train_validate, test = train_test_split(df, train_size=.8, random_state = 1017)
        
        # splitting train_validate 70/30
        train, validate = train_test_split(train_validate, train_size=.7)
    
    return train, validate, test

In [28]:
train, validate, test = split_data(df)

In [29]:
def wrangle_data():
    '''
    Arguments: none
    Actions: uses all other modules created to wrangle data in one function
    Returns: train, validate, test
    '''
    # split the data and assign it to variables
    train, validate, test = split_data(
        
        # prep the data
        prepare_data(
            
            # get clean data
            clean_data()))
    
    # exit function and return train, validate, and test ds
    return train, validate, test

In [30]:
train, validate, test = wrangle_data()

In [31]:
train.shape, validate.shape, test.shape

((2748, 13), (1179, 13), (982, 13))

# balance

In [33]:
X = train.drop('stroke', axis=1)
y = train['stroke']

In [57]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

In [44]:
# counting stroke results
counter = Counter(y)
counter

Counter({0: 2631, 1: 117})

In [45]:
counter.items()

dict_items([(0, 2631), (1, 117)])

In [60]:
for label, _ in counter.items():
    row_ix = np.where(y == label)[0]
    plt.scatter(X[row_ix, 0], X[row_is, 1], label=str(label))
plt.legend()
plt.show()

InvalidIndexError: (array([   0,    1,    3, ..., 2745, 2746, 2747]), 0)

In [32]:
oversample = SMOTE()
