# Use .describe() to compute a variety of statistics on the whole data set at once.

In [2]:
import pandas as pd

stroke_data = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0



# Filter .describe() to only compute statistics on factors with floating point number values.


In [3]:
stroke_data.describe(include=float)

Unnamed: 0,age,avg_glucose_level,bmi
count,5110.0,5110.0,4909.0
mean,43.226614,106.147677,28.893237
std,22.612647,45.28356,7.854067
min,0.08,55.12,10.3
25%,25.0,77.245,23.5
50%,45.0,91.885,28.1
75%,61.0,114.09,33.1
max,82.0,271.74,97.6



# Use .groupby() to create a data frame grouping by the "stroke" factor.

In [4]:
grouped = stroke_data.groupby("stroke")
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x105946b80>


# Use the "stroke" grouping to get only group where "stroke" is 1.


In [5]:
stroke_group = grouped.get_group(1) # alternately can filter data e.g. stroke_data[stroke_data["stroke"] == 1]
stroke_group

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
244,17739,Male,57.0,0,0,Yes,Private,Rural,84.96,36.7,Unknown,1
245,49669,Female,14.0,0,0,No,children,Rural,57.93,30.9,Unknown,1
246,27153,Female,75.0,0,0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked,1
247,34060,Male,71.0,1,0,Yes,Self-employed,Rural,87.80,,Unknown,1



# Use .describe() to compute statistics on factors with floating point values for the data where "stroke" is 1.


In [6]:
stroke_group.describe(include=float)

Unnamed: 0,age,avg_glucose_level,bmi
count,249.0,249.0,209.0
mean,67.728193,132.544739,30.471292
std,12.727419,61.921056,6.329452
min,1.32,56.11,16.9
25%,59.0,79.79,26.4
50%,71.0,105.22,29.7
75%,78.0,196.71,33.7
max,82.0,271.74,56.6



# Filter .describe() to only compute statistics on factors with integer values, removing as much percentile data as possible.


In [7]:
stroke_group.describe(include=int,percentiles=[])

Unnamed: 0,id,hypertension,heart_disease,stroke
count,249.0,249.0,249.0,249.0
mean,37115.068273,0.26506,0.188755,1.0
std,21993.344872,0.442254,0.392102,0.0
min,210.0,0.0,0.0,1.0
50%,36706.0,0.0,0.0,1.0
max,72918.0,1.0,1.0,1.0



# Create a data frame grouping by both the "hypertension" and "heart_disease" factors.


In [8]:
hyper_heart = stroke_data.groupby(["hypertension", "heart_disease"])
hyper_heart

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x118a430d0>


# Get the group where both "hypertension" and "heart_disease" are 1.


In [9]:
hyper_heart.get_group((1,1))

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
35,712,Female,82.0,1,1,No,Private,Rural,84.03,26.5,formerly smoked,1
115,53401,Male,71.0,1,1,No,Govt_job,Rural,216.94,30.9,never smoked,1
143,37651,Female,69.0,1,1,No,Self-employed,Urban,72.17,36.8,never smoked,1
171,60739,Female,79.0,1,1,No,Self-employed,Rural,60.94,,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4590,19271,Female,82.0,1,1,Yes,Self-employed,Urban,101.56,31.5,never smoked,0
4638,72160,Male,72.0,1,1,Yes,Private,Rural,60.98,34.9,formerly smoked,0
4684,49894,Female,78.0,1,1,Yes,Private,Rural,206.53,,never smoked,0
4784,70497,Female,81.0,1,1,Yes,Private,Rural,126.34,27.4,smokes,0



# Count the number of "id"s per group.


In [25]:
hyper_heart.get_group((1,1)).count().loc["id"]

np.int64(64)


# Aggregate both the mean and standard deviation of "stroke" per group.

In [34]:
stroke_mean = hyper_heart.get_group((1,1))["stroke"].mean()
stroke_std = hyper_heart.get_group((1,1))["stroke"].std()

print(f"The mean is {stroke_mean} and the standard deviation is {stroke_std}.")

The mean is 0.203125 and the standard deviation is 0.4055052697678833.
