In [None]:
# python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import seaborn as sns

In [None]:
# Load the diabetes dataset from a local CSV file and preview the first ten rows.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
csv_path = r'D:\Downloads\diabetes.csv'
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

This dataset contains 768 rows (measurements) and 9 columns: 8 predictor features plus the `Outcome` label.

In [None]:
# Average age across all rows
age = df['Age']
age.mean()

In [None]:
# Number of rows in each Outcome class
outcome = df['Outcome']
outcome.value_counts()

In [None]:
# Pairwise scatter plots and per-column histograms of the raw data
sns.pairplot(data=df)

Pairplot analysis: the histograms for glucose, blood pressure, skin thickness, 
insulin, and BMI appear to have bimodal distributions.

In [None]:
# Per column: is at least one value missing?
df.isna().any()

In [None]:
# Treat a recorded 0 in these measurement columns as a missing value and
# replace it with NaN so it is excluded from plots and summary statistics.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in zero_as_missing_cols:
    df[col] = df[col].replace(0, np.nan)

In [None]:
# Redraw the pairplot now that zeros have been replaced with NaN
sns.pairplot(data=df)

Executing the first line of the given code replaced every zero value with null (NaN). Null values are skipped when plotting and when calculating statistics such as the mean or median, so those data points are now ignored. As a result, the histograms now look approximately normal.

In [None]:
# Per column: is at least one value missing after the zero replacement?
df.isna().any()

In [None]:
# Count of missing values in each column
missing_mask = df.isna()
missing_mask.sum()

Executing the second line of the given code shows that there are null values in glucose, blood pressure, skin thickness, insulin, and BMI, which need to be replaced.

In [None]:
# Fill the missing values in each column with that column's median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning on recent pandas and has no effect under Copy-on-Write,
# which would silently leave the NaNs in place.
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df[col] = df[col].fillna(df[col].median())

Executing the third line of code replaced all null values with the median of the individual feature. 

In my opinion, replacing null values with either the mean or the median is valid in this case because we have a small number of measurements. However, if we had a large number of measurements, we could simply have dropped the rows that contain null values.

In [None]:
# Pairwise Pearson correlation between all columns
df.corr(method='pearson')

Pregnancy has highest correlation with age (0.544341). 

Glucose has highest correlation with outcome (0.492782).

In [None]:
# Mean of every column, split by Outcome class
by_outcome = df.groupby('Outcome')
by_outcome.mean()

Every feature has a lower average value for Outcome 0 (no diabetes) than for Outcome 1 (diabetes).

In [None]:
# Pairplot of the cleaned data, coloured by Outcome class
sns.pairplot(data=df, hue='Outcome')

Part 2 – Hypothesis Generation and Testing

In [None]:
# Probability (Q-Q) plot of blood pressure against a normal distribution
blood_pressure = df['BloodPressure']
stats.probplot(blood_pressure, dist='norm', plot=plt)

In [None]:
# Normality test on blood pressure
#   H0: blood pressure is normally distributed
#   H1: blood pressure is not normally distributed
#   Significance level: alpha = 0.01

blood_pressure = df['BloodPressure']
stats.normaltest(blood_pressure)

Since the p-value is smaller than alpha, we reject H0 and conclude H1, i.e., blood pressure isn't normally distributed.

In [None]:
# Sample mean of blood pressure
blood_pressure = df['BloodPressure']
blood_pressure.mean()

In [None]:
# 95% confidence interval for the MEAN blood pressure (t-based):
# mean +/- t_critical * standard error of the mean.
# Percentiles of the raw column only describe where individual readings
# fall; they are not a confidence interval for the mean.
bp = df['BloodPressure']
# Positional args: confidence level, then degrees of freedom (n - 1).
stats.t.interval(0.95, len(bp) - 1, loc=bp.mean(), scale=stats.sem(bp))

A 95% confidence interval for the mean is a range constructed so that, if the sampling were repeated many times, about 95% of such intervals would contain the true population mean.

In [None]:
# 99% confidence interval for the MEAN blood pressure (t-based):
# mean +/- t_critical * standard error of the mean.
# Percentiles of the raw column only describe where individual readings
# fall; they are not a confidence interval for the mean.
bp = df['BloodPressure']
# Positional args: confidence level, then degrees of freedom (n - 1).
stats.t.interval(0.99, len(bp) - 1, loc=bp.mean(), scale=stats.sem(bp))

A 99% confidence interval for the mean is a range constructed so that, if the sampling were repeated many times, about 99% of such intervals would contain the true population mean.

Yes, the mean falls within both the 95% and the 99% confidence intervals.

In [None]:
# Preview the first five rows of the cleaned data
df.head(5)

In [None]:
# 3(a) One-sample t-test: blood pressure of people without diabetes (Outcome == 0)
#   H0: their blood pressure is not significantly different from the population mean BP
#   H1: their blood pressure is significantly different from the population mean BP
#   Significance level: alpha = 0.01

bp_without_diabetes = df.loc[df['Outcome'] == 0, 'BloodPressure']
stats.ttest_1samp(bp_without_diabetes, popmean=71)

Since p-value is higher than alpha, we fail to reject H0. i.e. we conclude the blood pressure of people without diabetes is not significantly different than the mean BP of population. 

In [None]:
# 3(b) One-sample t-test: blood pressure of people with diabetes (Outcome == 1)
#   H0: their blood pressure is not significantly different from the population mean BP
#   H1: their blood pressure is significantly different from the population mean BP
#   Significance level: alpha = 0.01

bp_with_diabetes = df.loc[df['Outcome'] == 1, 'BloodPressure']
stats.ttest_1samp(bp_with_diabetes, popmean=71)

Since p-value is less than alpha, we reject H0. i.e. we conclude the blood pressure of people with diabetes is significantly different than the mean BP of population. 

In [None]:
# 3(c) Two-sample t-test: blood pressure, diabetes vs. no diabetes
#   H0: mean blood pressure of the two groups in this sample is not significantly different
#   H1: mean blood pressure of the two groups in this sample is significantly different
#   Significance level: alpha = 0.01

bp_with_diabetes = df.loc[df['Outcome'] == 1, 'BloodPressure']
bp_without_diabetes = df.loc[df['Outcome'] == 0, 'BloodPressure']
stats.ttest_ind(bp_with_diabetes, bp_without_diabetes)

Since p-value is less than alpha, we reject H0. i.e. we conclude the mean blood pressure of people with and without diabetes in this sample is significantly different.

In [None]:
# 4. One-sample t-test: glucose level of people without diabetes (Outcome == 0)
#   H0: their glucose level is not significantly different from the population mean glucose level
#   H1: their glucose level is significantly different from the population mean glucose level
#   Significance level: alpha = 0.01

glucose_without_diabetes = df.loc[df['Outcome'] == 0, 'Glucose']
stats.ttest_1samp(glucose_without_diabetes, popmean=110)

Since p-value is higher than alpha, we fail to reject H0. i.e. we conclude the glucose level of people without diabetes is not significantly different than the mean glucose level of population.

In [None]:
# One-sample t-test: glucose level of people with diabetes (Outcome == 1)
#   H0: their glucose level is not significantly different from the population mean glucose level
#   H1: their glucose level is significantly different from the population mean glucose level
#   Significance level: alpha = 0.01

glucose_with_diabetes = df.loc[df['Outcome'] == 1, 'Glucose']
stats.ttest_1samp(glucose_with_diabetes, popmean=110)

Since p-value is less than alpha, we reject H0. i.e. we conclude the glucose level of people with diabetes is significantly different than the mean glucose level of population.

In [None]:
# Two-sample t-test: glucose level, diabetes vs. no diabetes
#   H0: mean glucose level of the two groups in this sample is not significantly different
#   H1: mean glucose level of the two groups in this sample is significantly different
#   Significance level: alpha = 0.01

glucose_with_diabetes = df.loc[df['Outcome'] == 1, 'Glucose']
glucose_without_diabetes = df.loc[df['Outcome'] == 0, 'Glucose']
stats.ttest_ind(glucose_with_diabetes, glucose_without_diabetes)

Since p-value is less than alpha, we reject H0. i.e. we conclude the mean glucose level of people with and without diabetes in this sample is significantly different.

In [None]:
# 5. Feature of choice: insulin
# Two-sample t-test: insulin level, diabetes vs. no diabetes
#   H0: mean insulin level of the two groups in this sample is not significantly different
#   H1: mean insulin level of the two groups in this sample is significantly different
#   Significance level: alpha = 0.01

insulin_with_diabetes = df.loc[df['Outcome'] == 1, 'Insulin']
insulin_without_diabetes = df.loc[df['Outcome'] == 0, 'Insulin']
stats.ttest_ind(insulin_with_diabetes, insulin_without_diabetes)

Since p-value is less than alpha, we reject H0. i.e. we conclude the mean insulin level of people with and without diabetes in this sample is significantly different.