In [507]:
import pandas as pd

In [508]:
stroke_data = pd.read_csv('healthcare-dataset-stroke-data.csv')
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [509]:
##Drop unecessary ID column
column_mapper = 'id'
stroke_data = stroke_data.drop(column_mapper, axis=1)

In [510]:
##Renaming columns: heart_disease, ever_married, work_type, residence_type, avg_glucose_level, smoking_status
col_mapper={'heart_disease':'heart disease', 'ever_married':'ever married', 'work_type':'occupation', 'Residence_type':'residence', 'avg_glucose_level':'avg glucose', 'smoking_status':'smoking status', }
stroke_data = stroke_data.rename(mapper=col_mapper, axis=1)

In [512]:
#Renaming row values so the information is more readable
stroke_data = stroke_data.replace({'hypertension': {1: 'Yes', 0: 'No'}})
stroke_data = stroke_data.replace({'heart disease': {1: 'Yes', 0: 'No'}})
stroke_data = stroke_data.replace({'stroke': {1: 'Yes', 0: 'No'}})

stroke_data

Unnamed: 0,gender,age,hypertension,heart disease,ever married,occupation,residence,avg glucose,bmi,smoking status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61.0,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,Yes,No,Yes,Private,Urban,83.75,,never smoked,No
5106,Female,81.0,No,No,Yes,Self-employed,Urban,125.20,40.0,never smoked,No
5107,Female,35.0,No,No,Yes,Self-employed,Rural,82.99,30.6,never smoked,No
5108,Male,51.0,No,No,Yes,Private,Rural,166.29,25.6,formerly smoked,No


In [513]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          5110 non-null   object 
 1   age             5110 non-null   float64
 2   hypertension    5110 non-null   object 
 3   heart disease   5110 non-null   object 
 4   ever married    5110 non-null   object 
 5   occupation      5110 non-null   object 
 6   residence       5110 non-null   object 
 7   avg glucose     5110 non-null   float64
 8   bmi             4909 non-null   float64
 9   smoking status  5110 non-null   object 
 10  stroke          5110 non-null   object 
dtypes: float64(3), object(8)
memory usage: 439.3+ KB


In [514]:
##Changing data types
stroke_data['gender'] = stroke_data['gender'].astype('category')
stroke_data['hypertension'] = stroke_data['hypertension'].astype('string')
stroke_data['heart disease'] = stroke_data['heart disease'].astype('string')
stroke_data['ever married'] = stroke_data['ever married'].astype('category')
stroke_data['occupation'] = stroke_data['occupation'].astype('category')
stroke_data['residence'] = stroke_data['residence'].astype('category')
stroke_data['smoking status'] = stroke_data['smoking status'].astype('category')
stroke_data['stroke'] = stroke_data['stroke'].astype('string')
##Changeing data type for age and rounding number
stroke_data['age'] = stroke_data['age'].round().astype('int64')

In [515]:
##Checking work in previous step
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   gender          5110 non-null   category
 1   age             5110 non-null   int64   
 2   hypertension    5110 non-null   string  
 3   heart disease   5110 non-null   string  
 4   ever married    5110 non-null   category
 5   occupation      5110 non-null   category
 6   residence       5110 non-null   category
 7   avg glucose     5110 non-null   float64 
 8   bmi             4909 non-null   float64 
 9   smoking status  5110 non-null   category
 10  stroke          5110 non-null   string  
dtypes: category(5), float64(2), int64(1), string(3)
memory usage: 265.4 KB


In [516]:
stroke_data.head()

Unnamed: 0,gender,age,hypertension,heart disease,ever married,occupation,residence,avg glucose,bmi,smoking status,stroke
0,Male,67,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


In [517]:
##Grouping data by gender and presence of stroke
has_stroke = stroke_data['stroke'] == 'Yes'

##Male
is_male = stroke_data['gender'] == 'Male'
male_stroke = stroke_data[has_stroke & is_male]

##Female
is_female = stroke_data['gender'] == 'Female'
female_stroke = stroke_data[has_stroke & is_female]

In [518]:
##Calculate the mean age of males who have stroked from dataset
male_stroke_age = male_stroke.agg({'age':['mean']})
print(male_stroke_age)

       age
mean  68.5


In [519]:
##Calculate the mean age of females who have stroked from dataset
female_stroke_age = female_stroke.agg({'age':['mean']})
print(female_stroke_age)

            age
mean  67.134752


In [591]:
#Calculating percentage of data population that has had a stroke
total_stroke_patients = sum(has_stroke)
total_patients = 5110

percentage_of_stroke = round((total_stroke_patients / total_patients) * 100, 1)
percentage_of_stroke

4.9

In [524]:
stroke_data.head()

Unnamed: 0,gender,age,hypertension,heart disease,ever married,occupation,residence,avg glucose,bmi,smoking status,stroke
0,Male,67,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Female,61,No,No,Yes,Self-employed,Rural,202.21,,never smoked,Yes
2,Male,80,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes


In [561]:
#What is the biggest risk factor for having a stroke between HTN and heart disease?
has_htn = stroke_data['hypertension'] == 'Yes'
has_heart_disease = stroke_data['heart disease'] == 'Yes'
has_smoked = (stroke_data['smoking status'] == 'formerly smoked') | (stroke_data['smoking status'] == 'smokes')

htn_stroke = sum(has_stroke & has_htn)
heart_disease_stroke = sum(has_stroke & has_heart_disease)
smoking_stroke = sum(has_stroke & has_smoked)
trifecta_stroke = sum(has_htn & has_heart_disease & has_smoked)

# Variable taken from step above
# total_stroke_patients = sum(has_stroke)

In [566]:
# Percentage of individuals with a stroke and HTN
htn_stroke_percentage = round((htn_stroke / total_stroke_patients) * 100, 1)
htn_stroke_percentage

26.5

In [569]:
# Percentage of individuals with a stroke and Heart Disease
heart_disease_stroke_percentage = round((heart_disease_stroke / total_stroke_patients) * 100, 1)
heart_disease_stroke_percentage

18.9

In [570]:
# Percentage of individuals with a stroke and smoking history
smoking_stroke_percentage = round((smoking_stroke / total_stroke_patients) * 100, 1)
smoking_stroke_percentage

45.0

In [571]:
# Percentage of individuals with all 3: HTN, heart disease, and smoking history
trifecta_stroke_percentage = round((trifecta_stroke / total_stroke_patients) * 100, 1)
trifecta_stroke_percentage

14.5

In [592]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   gender          5110 non-null   category
 1   age             5110 non-null   int64   
 2   hypertension    5110 non-null   string  
 3   heart disease   5110 non-null   string  
 4   ever married    5110 non-null   category
 5   occupation      5110 non-null   category
 6   residence       5110 non-null   category
 7   avg glucose     5110 non-null   float64 
 8   bmi             4909 non-null   float64 
 9   smoking status  5110 non-null   category
 10  stroke          5110 non-null   string  
dtypes: category(5), float64(2), int64(1), string(3)
memory usage: 265.4 KB


In [593]:
stroke_data['residence'].value_counts()

Urban    2596
Rural    2514
Name: residence, dtype: int64

In [598]:
#What is the difference in average stroke rate between individuals over 65 
#who live in urban areas vs those who live in rural areas? 
over_65 = stroke_data['age'] > 65
is_rural = stroke_data['residence'] == 'Rural'
is_urban = stroke_data['residence'] == 'Urban'

rural_65 = sum(has_stroke & over_65 & is_rural)
urban_65 = sum(has_stroke & over_65 & is_urban)

rural_65_percentage = round((rural_65 / total_stroke_patients) * 100, 1)
urban_65_percentage = round((urban_65 / total_stroke_patients) * 100, 1)

In [602]:
#Percentage of stroke patients living in rural areas
rural_65_percentage

28.5

In [601]:
#Percentage of stroke patients living in urban areas
urban_65_percentage

34.1

In [615]:
round((urban_65_percentage - rural_65_percentage), 2)

5.6