In [None]:
import math
import warnings
from statistics import mean, pstdev

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Silence every warning for the rest of the session. With no category given,
# this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Location of the heights/weights CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/heights-and-weights-dataset/SOCR-HeightWeight.csv'
# Load the whole file into a DataFrame; it is treated as the population below.
df = pd.read_csv(DATA_PATH)

In [None]:
# Replace the raw column headers with short lowercase names.
# NOTE(review): assumes the CSV has exactly three columns in the order
# index, height, weight — confirm against the raw file.
df.columns = ['index','height','weight']

In [None]:
# Preview the first rows to confirm the load and the renamed columns.
df.head()

In [None]:
# (rows, columns) of the full dataset.
df.shape

In [None]:
# Check the distribution of height: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(df['height'], kde=True, stat='density')
ax.set(title='Distribution of height', xlabel='height');

<b>Height looks approximately normally distributed</b><br>
Now let's check the distribution of weights

In [None]:
# Distribution of weight: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(df['weight'], kde=True, stat='density')
ax.set(title='Distribution of weight', xlabel='weight');

<b>Weight also looks approximately normally distributed, as expected</b>

# Central Limit Theorem
<b>A consequence of the CLT that we check here: the mean of the sampling distribution of the sample mean is approximately equal to the population mean</b>
<br>OR<br>
The central limit theorem states that if you have a population with mean μ and standard deviation σ and take sufficiently large random samples from the population with replacement, then the distribution of the sample means will be approximately normally distributed. This will hold true regardless of whether the source population is normal or skewed, provided the sample size is sufficiently large (usually n > 30). If the population is normal, then the theorem holds true even for samples smaller than 30. In fact, this also holds true even if the population is binomial, provided that min(np, n(1-p)) > 5, where n is the sample size and p is the probability of success in the population. This means that we can use the normal probability model to quantify uncertainty when making inferences about a population mean based on the sample mean.
<br><br><br>
We treat this dataset as the population.
Let's draw a random sample from it and demonstrate the CLT empirically.

In [None]:
# Report the population mean of each measurement; sep="" keeps the label and
# the value adjacent exactly as string concatenation would.
print("POPULATION MEAN(Height)= ", df['height'].mean(), sep="")
print("POPULATION MEAN(Weight)= ", df['weight'].mean(), sep="")

In [None]:
# Take 30% of the rows as a random sample. The fixed random_state makes the
# draw identical on every run, so the results below are reproducible.
sampledf = df.sample(frac=0.3, random_state=42)

In [None]:
# Build the sampling distribution of the mean: draw `no_of_samples` random
# samples of `size` rows each from `sampledf` and record every sample's mean
# height and mean weight.
size = 20            # rows per sample
no_of_samples = 50   # number of samples drawn
lst_mean_heights, lst_mean_weights = [], []
for i in range(no_of_samples):
    # Seed each draw from the sample size and loop index: every iteration gets
    # a different sample, yet re-running the cell reproduces the same results.
    tempdf = sampledf.sample(size, random_state=size * 1000 + i)
    lst_mean_heights.append(tempdf['height'].mean())
    lst_mean_weights.append(tempdf['weight'].mean())
    
    

In [None]:
# Sampling distribution of the mean height: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(lst_mean_heights, kde=True, stat='density')
ax.set(title='Sampling distribution of mean height', xlabel='sample mean height');

In [None]:
# Sampling distribution of the mean weight: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(lst_mean_weights, kde=True, stat='density')
ax.set(title='Sampling distribution of mean weight', xlabel='sample mean weight');


<b>We can see that the above distributions are approximately normal and that the sample means are approximately equal to the population means</b>
<br>
Now we can increase the sample size and plot the distributions again

In [None]:
# Rebuild the sampling distribution with a larger sample size (40 rows) and
# many more samples (500): record each sample's mean height and mean weight.
size = 40             # rows per sample
no_of_samples = 500   # number of samples drawn
lst_mean_heights, lst_mean_weights = [], []
for i in range(no_of_samples):
    # Seed each draw from the sample size and loop index: every iteration gets
    # a different sample, yet re-running the cell reproduces the same results.
    tempdf = sampledf.sample(size, random_state=size * 1000 + i)
    lst_mean_heights.append(tempdf['height'].mean())
    lst_mean_weights.append(tempdf['weight'].mean())
    
    

In [None]:
# Sampling distribution of the mean height: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(lst_mean_heights, kde=True, stat='density')
ax.set(title='Sampling distribution of mean height', xlabel='sample mean height');

In [None]:
# Sampling distribution of the mean weight: density histogram with a KDE overlay.
# sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
ax = sns.histplot(lst_mean_weights, kde=True, stat='density')
ax.set(title='Sampling distribution of mean weight', xlabel='sample mean weight');

#### We clearly get a much better bell shape than in the earlier example.

In [None]:
# Compare the population means with the means of the sampling distributions.
# `mean` is statistics.mean, imported in the notebook's import cell.
print("POPULATION MEAN (Height)= "+str(df['height'].mean()))
print("POPULATION MEAN (Weight)= "+str(df['weight'].mean()))
print("SAMPLING DIST MEAN(Height)= "+str(mean(lst_mean_heights)))
print("SAMPLING DIST MEAN(Weight)= "+str(mean(lst_mean_weights)))

## Pretty Close, Right!

# Confidence Intervals

Let's estimate the population mean with confidence levels of 95% and 99%

In [None]:
# Draw another, much smaller sample (50 rows) from the population. The fixed
# random_state makes the draw identical on every run.
# NOTE: this rebinds `sampledf`, so the 30% sample used earlier is no longer
# available after this cell runs.
sampledf = df.sample(50, random_state=42)

In [None]:
# Report the mean of each measurement within the small sample; sep="" keeps
# the label and the value adjacent exactly as string concatenation would.
print("SAMPLE MEAN (Height)= ", sampledf['height'].mean(), sep="")
print("SAMPLE MEAN (Weight)= ", sampledf['weight'].mean(), sep="")

<b>The sample mean can differ noticeably from the actual population mean. Now we will infer the population mean from the sample mean, assuming we don't already know the population means</b>
<br><br>
For a 95% confidence level, alpha is 5%.<br>
For a 99% confidence level, alpha is 1%.


#### Calculating the standard error, assuming the population variance is equal to the sample variance, which should be reasonable for this distribution. We will use z-statistics to calculate the confidence intervals.

In [None]:
# z-based confidence intervals for the population mean, estimated from `sampledf`:
#   [sample_mean - z_stat * standard_error, sample_mean + z_stat * standard_error]
# Critical z values: 1.96 for 95% confidence and 2.58 for 99% confidence.
Z_95, Z_99 = 1.96, 2.58


def z_confidence_interval(values, z_stat):
    """Return the ``[lower, upper]`` bounds of a z-based confidence interval for the mean.

    Parameters
    ----------
    values : pandas.Series
        Sample observations.
    z_stat : float
        Critical z value for the desired confidence level.

    Returns
    -------
    list of float
        ``[mean - z_stat * sem, mean + z_stat * sem]``.
    """
    centre = values.mean()
    # Series.sem() is the sample standard deviation (ddof=1) divided by
    # sqrt(n). Taking n from the data avoids a hard-coded sample size, and
    # ddof=1 is the appropriate estimator when sigma is estimated from a sample.
    margin = z_stat * values.sem()
    return [float(centre - margin), float(centre + margin)]


conf_int_height_95 = z_confidence_interval(sampledf['height'], Z_95)
conf_int_height_99 = z_confidence_interval(sampledf['height'], Z_99)

conf_int_weight_95 = z_confidence_interval(sampledf['weight'], Z_95)
conf_int_weight_99 = z_confidence_interval(sampledf['weight'], Z_99)

In [None]:
# Show the 95% and 99% height intervals, one per line, followed by the true
# population mean for comparison.
print(conf_int_height_95, conf_int_height_99, sep="\n")
print("\nPopulation Mean (height) is ", df['height'].mean(), sep="")

In [None]:
# Show the 95% and 99% weight intervals, one per line, followed by the true
# population mean for comparison.
print(conf_int_weight_95, conf_int_weight_99, sep="\n")
print("\nPopulation Mean (weight) is ", df['weight'].mean(), sep="")