#### Note for data fields:
- Age: Age in years when a woman is pregnant.
- SystolicBP: Upper value of Blood Pressure in mmHg, another significant attribute during pregnancy.
- DiastolicBP: Lower value of Blood Pressure in mmHg, another significant attribute during pregnancy.
- BS: Blood glucose level, expressed as a molar concentration in mmol/L.
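- BodyTemp: Body temperature (the values in this data set range from 98 to 103, which is consistent with degrees Fahrenheit).
- HeartRate: A normal resting heart rate in beats per minute.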
- RiskLevel: Predicted risk intensity level during pregnancy, considering the previous attributes.

 Load library

In [191]:
import pandas as pd

Read the data

In [192]:
df=pd.read_csv('Maternal Health Risk Data Set.csv')

Explore the data set

In [193]:
# rows and columns
df.shape

(1014, 7)

In [194]:
# first 5 rows
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [195]:
# data structure
df.dtypes

Age              int64
SystolicBP       int64
DiastolicBP      int64
BS             float64
BodyTemp       float64
HeartRate        int64
RiskLevel       object
dtype: object

In [196]:
# check missing data
df.isnull().sum()

Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64

In [197]:
# descriptive summary
df.describe()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
count,1014.0,1014.0,1014.0,1014.0,1014.0,1014.0
mean,29.871795,113.198225,76.460552,8.725986,98.665089,74.301775
std,13.474386,18.403913,13.885796,3.293532,1.371384,8.088702
min,10.0,70.0,49.0,6.0,98.0,7.0
25%,19.0,100.0,65.0,6.9,98.0,70.0
50%,26.0,120.0,80.0,7.5,98.0,76.0
75%,39.0,120.0,90.0,8.0,98.0,80.0
max,70.0,160.0,100.0,19.0,103.0,90.0


In [198]:
# turn RiskLevel into a categorical variable with an explicit category order
df['RiskLevel'] = pd.Categorical(
    df['RiskLevel'],
    categories=['high risk', 'mid risk', 'low risk'],
)

In [199]:
# count records by risk level
df['RiskLevel'].value_counts()

low risk     406
mid risk     336
high risk    272
Name: RiskLevel, dtype: int64

In [200]:
# Average value of health indicators by risk level
df.groupby('RiskLevel').mean()

Unnamed: 0_level_0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
RiskLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high risk,36.216912,124.194853,85.073529,12.12261,98.899265,76.742647
mid risk,28.363095,113.154762,74.232143,7.795744,98.833333,74.175595
low risk,26.869458,105.866995,72.534483,7.220271,98.368966,72.770936


In [201]:
# return without index of risk level
df.groupby('RiskLevel',as_index=False).mean()

Unnamed: 0,RiskLevel,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,high risk,36.216912,124.194853,85.073529,12.12261,98.899265,76.742647
1,mid risk,28.363095,113.154762,74.232143,7.795744,98.833333,74.175595
2,low risk,26.869458,105.866995,72.534483,7.220271,98.368966,72.770936


In [202]:
# reorder the risk level
# df.groupby('RiskLevel').mean().reindex(index=['high risk','mid risk','low risk'])

In [203]:
# define peak to peak function
def peak_to_peak(df):
    """Return the spread of ``df``: its largest value minus its smallest.

    ``df`` can be any object that provides ``max()`` and ``min()``; the
    notebook passes this function to ``groupby(...).agg``.
    """
    highest = df.max()
    lowest = df.min()
    return highest - lowest

In [204]:
# min, max and peak to peak values by risk level
df.groupby('RiskLevel').agg(['min','max',peak_to_peak])

Unnamed: 0_level_0,Age,Age,Age,SystolicBP,SystolicBP,SystolicBP,DiastolicBP,DiastolicBP,DiastolicBP,BS,BS,BS,BodyTemp,BodyTemp,BodyTemp,HeartRate,HeartRate,HeartRate
Unnamed: 0_level_1,min,max,peak_to_peak,min,max,peak_to_peak,min,max,peak_to_peak,min,max,peak_to_peak,min,max,peak_to_peak,min,max,peak_to_peak
RiskLevel,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
high risk,12,65,53,83,160,77,60,100,40,6.1,19.0,12.9,98.0,103.0,5.0,60,90,30
mid risk,10,60,50,70,140,70,50,100,50,6.0,18.0,12.0,98.0,103.0,5.0,60,88,28
low risk,10,70,60,70,129,59,49,100,51,6.0,11.0,5.0,98.0,103.0,5.0,7,88,81


In [206]:
# rename the column of peak_to_peak
df.groupby('RiskLevel').agg(['min','max',('max-min',peak_to_peak)])

Unnamed: 0_level_0,Age,Age,Age,SystolicBP,SystolicBP,SystolicBP,DiastolicBP,DiastolicBP,DiastolicBP,BS,BS,BS,BodyTemp,BodyTemp,BodyTemp,HeartRate,HeartRate,HeartRate
Unnamed: 0_level_1,min,max,max-min,min,max,max-min,min,max,max-min,min,max,max-min,min,max,max-min,min,max,max-min
RiskLevel,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
high risk,12,65,53,83,160,77,60,100,40,6.1,19.0,12.9,98.0,103.0,5.0,60,90,30
mid risk,10,60,50,70,140,70,50,100,50,6.0,18.0,12.0,98.0,103.0,5.0,60,88,28
low risk,10,70,60,70,129,59,49,100,51,6.0,11.0,5.0,98.0,103.0,5.0,7,88,81


In [208]:
# number of patients recorded at each age
df.groupby('Age').size().to_frame('No. of Patients')

Unnamed: 0_level_0,No. of Patients
Age,Unnamed: 1_level_1
10,4
12,35
13,12
14,3
15,60
16,16
17,63
18,19
19,67
20,22


In [209]:
# Add a new column 'age group'
def age_group(row):
    """Return the ten-year age band label for one record.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Age'`` (for example a DataFrame row).

    Returns
    -------
    str or None
        One of ``'<10'``, ``'10-20'``, ``'20-30'``, ``'30-40'``, ``'40-50'``
        or ``'50+'``. Each band includes its lower bound and excludes its
        upper bound. ``None`` is returned when the age is missing (NaN).
    """
    age = row['Age']
    # Ages below 10 get their own label instead of falling through to '50+'.
    if age < 10:
        return '<10'
    if age < 20:
        return '10-20'
    if age < 30:
        return '20-30'
    if age < 40:
        return '30-40'
    if age < 50:
        return '40-50'
    if age >= 50:
        return '50+'
    # Only NaN reaches this point: it fails every comparison above.
    return None

df['Age_Group']=df.apply(age_group,axis=1)

In [210]:
# check the unique values of age group in data frame
df['Age_Group'].unique()

array(['20-30', '30-40', '40-50', '10-20', '50+'], dtype=object)

In [211]:
# group by risk level and age group
df.loc[:,df.columns!='Age'].groupby(['Age_Group','RiskLevel']).mean().unstack()

Unnamed: 0_level_0,SystolicBP,SystolicBP,SystolicBP,DiastolicBP,DiastolicBP,DiastolicBP,BS,BS,BS,BodyTemp,BodyTemp,BodyTemp,HeartRate,HeartRate,HeartRate
RiskLevel,high risk,mid risk,low risk,high risk,mid risk,low risk,high risk,mid risk,low risk,high risk,mid risk,low risk,high risk,mid risk,low risk
Age_Group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
10-20,95.128205,106.039604,98.992806,66.410256,71.50495,66.136691,8.694872,7.19703,7.228129,101.666667,99.376238,98.825899,73.564103,72.326733,72.23741
20-30,122.73913,113.541667,106.156028,83.152174,73.25,74.58156,9.209783,7.238917,7.16539,98.991304,98.666667,98.078014,78.0,76.05,74.06383
30-40,127.089552,117.181818,114.074074,86.567164,72.545455,76.018519,11.271642,6.769091,7.025926,98.597015,98.854545,98.111111,77.58209,74.181818,70.166667
40-50,130.983607,122.105263,118.235294,89.918033,85.263158,83.676471,14.103279,12.684211,7.747059,98.163934,98.0,98.147059,80.065574,79.052632,73.117647
50+,134.237288,120.0,107.210526,92.20339,80.97561,73.421053,15.577966,10.012195,7.2,98.101695,98.341463,98.342105,73.474576,70.97561,73.315789


In [212]:
# look at top 20 records of SystolicBP
def top(df, n=5, column='SystolicBP'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to return; ``0`` returns an empty frame.
    column : str, default 'SystolicBP'
        Column used for ranking.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of ``column`` (largest value last).
    """
    # .tail(n) instead of [-n:]: with n == 0 the slice [-0:] is [0:] and
    # would return every row rather than none.
    return df.sort_values(by=column).tail(n)
top(df,n=20)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel,Age_Group
426,40,140,100,13.0,101.0,66,high risk,40-50
467,50,140,90,15.0,98.0,90,high risk,50+
428,36,140,100,6.8,102.0,76,high risk,30-40
432,40,140,100,13.0,101.0,66,high risk,40-50
436,35,140,80,13.0,98.0,70,high risk,30-40
440,23,140,90,6.8,98.0,70,high risk,20-30
456,23,140,90,6.8,98.0,70,high risk,20-30
422,36,140,100,6.8,102.0,76,high risk,30-40
452,25,140,100,6.8,98.0,80,high risk,20-30
451,48,140,90,15.0,98.0,90,high risk,40-50


In [213]:
# correlation between age and health indicators
def age_corr(x):
    """Correlate every numeric column of ``x`` with its 'Age' column.

    Non-numeric columns (such as text or categorical labels) are removed
    before calling ``corrwith``, so the result does not depend on how the
    installed pandas version treats non-numeric columns.

    Returns a Series indexed by the numeric column names.
    """
    numeric_columns = x.select_dtypes(include='number')
    return numeric_columns.corrwith(x['Age'])
df.groupby('RiskLevel').apply(age_corr)

Unnamed: 0_level_0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
RiskLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
high risk,1.0,0.471108,0.448638,0.558726,-0.593412,0.057333
mid risk,1.0,0.325287,0.292474,0.475955,-0.243922,0.012249
low risk,1.0,0.265506,0.261877,0.086769,-0.122487,0.012658


In [214]:
# correlation between SystolicBP and DiastolicBP by RiskLevel
df.groupby('RiskLevel').apply(lambda g: g['SystolicBP'].corr(g['DiastolicBP']))

RiskLevel
high risk    0.859587
mid risk     0.558299
low risk     0.804000
dtype: float64

In [215]:
# Viz - Histograms for each variable

In [216]:
# Viz - scatter plots/boxplots for two or three variables

In [217]:
# Extract the numeric variables (every integer and float dtype, not only int64/float64)
df.select_dtypes(include='number')

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,25,130,80,15.0,98.0,86
1,35,140,90,13.0,98.0,70
2,29,90,70,8.0,100.0,80
3,30,140,85,7.0,98.0,70
4,35,120,60,6.1,98.0,76
...,...,...,...,...,...,...
1009,22,120,60,15.0,98.0,80
1010,55,120,90,18.0,98.0,60
1011,35,85,60,19.0,98.0,86
1012,43,120,90,18.0,98.0,70


In [None]:
# Correlation matrix