# Question 2 Solution

In [None]:
# Install the third-party packages this notebook imports. %pip (rather than
# !pip) installs into the environment of the running kernel.
# NOTE(review): versions are not pinned, so results may differ across installs.
%pip install pandas numpy matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from the CSV file kept in the local data/ folder
csv_path = 'data/diabetes.csv'
df = pd.read_csv(csv_path)

# Display the first rows as a quick check of what was loaded
df.head()

## a) Set a seed and take a random sample of 25 observations. Find the mean and the highest Glucose values of this sample and compare them with the corresponding population statistics using charts.

In [None]:
# Seed used for the random draw in this part of the exercise
seed = 42
np.random.seed(seed)

# Draw 25 observations (without replacement) from the full dataset.
# The seed is also passed explicitly as random_state so the selected rows do
# not depend on how much of the global NumPy random stream has been consumed
# before this line runs.
sample_25 = df.sample(n=25, random_state=seed)

# Glucose statistics of the sample ...
sample_mean_glucose = sample_25['Glucose'].mean()
sample_max_glucose = sample_25['Glucose'].max()

# ... and the same statistics over every row of the dataset (the "population")
pop_mean_glucose = df['Glucose'].mean()
pop_max_glucose = df['Glucose'].max()

print(f"Sample Mean Glucose: {sample_mean_glucose}")
print(f"Population Mean Glucose: {pop_mean_glucose}")
print(f"Sample Max Glucose: {sample_max_glucose}")
print(f"Population Max Glucose: {pop_max_glucose}")

In [None]:
# Long-format table for the grouped bar chart: one row per
# (statistic, group) pair, in the order the bars are drawn.
stats_data = {
    'Statistic': ['Mean Glucose'] * 2 + ['Max Glucose'] * 2,
    'Value': [
        sample_mean_glucose,
        pop_mean_glucose,
        sample_max_glucose,
        pop_max_glucose,
    ],
    'Group': ['Sample', 'Population'] * 2,
}
stats_df = pd.DataFrame(stats_data)

# Grouped bars: sample next to population for each statistic
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=stats_df, x='Statistic', y='Value', hue='Group', ax=ax)
ax.set_title('Comparison of Glucose Statistics: Sample vs Population')
ax.set_ylabel('Glucose Value')
plt.show()

## b) Find the 98th percentile of BMI of your sample and the population and compare the results using charts.

In [None]:
# 98th percentile of BMI, computed identically for the 25-row sample and
# for the full dataset
bmi_q = 98
sample_98_bmi, pop_98_bmi = (
    np.percentile(frame['BMI'], bmi_q) for frame in (sample_25, df)
)

print(f"Sample 98th Percentile BMI: {sample_98_bmi}")
print(f"Population 98th Percentile BMI: {pop_98_bmi}")

# Two bars side by side: sample vs population
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    ['Sample', 'Population'],
    [sample_98_bmi, pop_98_bmi],
    color=['skyblue', 'orange'],
)
ax.set_title('98th Percentile of BMI')
ax.set_ylabel('BMI')
plt.show()

## c) Using bootstrap (replace=True), create 500 samples (of 150 observations each) from the population and find the average mean, standard deviation and percentile for BloodPressure, then compare these with the same statistics from the population for the same variable.

In [None]:
# Bootstrap: repeatedly resample the dataset with replacement and record the
# mean, standard deviation and percentile of one variable for every resample.
n_samples = 500             # number of bootstrap resamples
sample_size = 150           # observations drawn per resample
variable = 'BloodPressure'
# Percentile level, kept at 98 to be consistent with the previous parts.
# Defined once so the bootstrap and population values cannot drift apart;
# the printed labels below still say "98th" and must be edited if it changes.
percentile_q = 98

# Only one column is needed, so look it up once instead of resampling the
# whole DataFrame on every iteration.
values = df[variable]

# Dedicated random generator: the resamples are reproducible without relying
# on (or modifying) the global NumPy random state.
rng = np.random.RandomState(42)

bootstrap_means = np.empty(n_samples)
bootstrap_stds = np.empty(n_samples)
bootstrap_percentiles = np.empty(n_samples)

for i in range(n_samples):
    # Sample with replacement
    resample = values.sample(n=sample_size, replace=True, random_state=rng)
    bootstrap_means[i] = resample.mean()
    bootstrap_stds[i] = resample.std()
    bootstrap_percentiles[i] = np.percentile(resample, percentile_q)

# Average each statistic over all resamples
avg_mean = np.mean(bootstrap_means)
avg_std = np.mean(bootstrap_stds)
avg_percentile = np.mean(bootstrap_percentiles)

# Same statistics computed once over every row of the dataset
pop_mean = values.mean()
pop_std = values.std()
pop_percentile = np.percentile(values, percentile_q)

print(f"Bootstrap Average Mean: {avg_mean:.2f} vs Population Mean: {pop_mean:.2f}")
print(f"Bootstrap Average Std: {avg_std:.2f} vs Population Std: {pop_std:.2f}")
print(f"Bootstrap Average 98th Percentile: {avg_percentile:.2f} vs Population 98th Percentile: {pop_percentile:.2f}")

In [None]:
# One histogram per bootstrap statistic, with the corresponding population
# value drawn on top as a dashed red reference line.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (bootstrap values, population value, statistic name, bar color)
# A color of None keeps matplotlib's default color for the first panel.
panels = [
    (bootstrap_means, pop_mean, 'Mean', None),
    (bootstrap_stds, pop_std, 'Std', 'orange'),
    (bootstrap_percentiles, pop_percentile, '98th Percentile', 'green'),
]

for ax, (boot_values, pop_value, stat_name, bar_color) in zip(axes, panels):
    ax.hist(boot_values, bins=30, color=bar_color, alpha=0.7,
            label=f'Bootstrap {stat_name}s')
    ax.axvline(pop_value, color='red', linestyle='--',
               label=f'Population {stat_name}')
    ax.set_title(f'Distribution of Bootstrap {stat_name}s')
    # Axis labels so each panel can be read on its own
    ax.set_xlabel(f'{variable} {stat_name.lower()}')
    ax.set_ylabel('Number of bootstrap samples')
    ax.legend()

plt.tight_layout()
plt.show()