In [153]:
import pandas as pd
import numpy as np
from scipy.stats import norm, t

In [154]:
df = pd.read_excel('../height.xls', usecols="A,D", dtype={'Leningrad': float, 'Moscow': float}, names=["Leningrad", "Moscow"])
df

Unnamed: 0,Leningrad,Moscow
0,184.0,165.0
1,183.0,170.0
2,183.0,174.0
3,181.0,179.0
4,180.0,166.0
5,180.0,180.0
6,179.0,172.0
7,179.0,176.0
8,179.0,177.0
9,178.0,181.0


## Sigma known

In [155]:
def get_confidence_interval_known(variance, sample_mean, sample_size, significance_level):
    # norm.ppf - quantile function for normal distribution (inverse of cumulative distribution function)
    margin_of_error = norm.ppf(1 - significance_level/2) * variance / np.sqrt(sample_size)
    return sample_mean - margin_of_error, sample_mean + margin_of_error

In [156]:
data1 = df['Leningrad'].dropna() ## dropna() - drop NaN values
sample_size = data1.size
sample_mean = np.average(data1)
sigma = np.sqrt(30)
data1

0     184.0
1     183.0
2     183.0
3     181.0
4     180.0
5     180.0
6     179.0
7     179.0
8     179.0
9     178.0
10    177.0
11    176.0
12    176.0
13    175.0
14    175.0
15    175.0
16    175.0
17    175.0
18    175.0
19    173.0
20    172.0
21    171.0
22    170.0
23    172.0
24    169.0
25    168.0
26    171.0
27    167.0
28    169.0
Name: Leningrad, dtype: float64

In [157]:
significance_level_known1 = 0.9
confidence_interval_known1 = get_confidence_interval_known(sigma, sample_mean, sample_size, significance_level_known1)

confidence_interval_known1

(175.2859835437709, 175.54160266312567)

In [158]:
significance_level_known2 = 0.7
confidence_interval_known2 = get_confidence_interval_known(sigma, np.average(data1), data1.size, significance_level_known2)

confidence_interval_known2

(175.0218854852446, 175.80570072165196)

## Sigma unknown

In [159]:
def get_confidence_interval_unknown(sample_variance, sample_mean, sample_size, significance_level):
    #t.ppf - quantile function for student distribution (inverse of cumulative distribution function)
    degrees_of_freedom = sample_size - 1
    margin_of_error = t.ppf(1 - (significance_level / 2), degrees_of_freedom) * sample_variance / np.sqrt(sample_size - 1)
    return sample_mean - margin_of_error, sample_mean + margin_of_error

In [160]:
data2 = df['Moscow'].dropna() ## dropna() - drop NaN values
sample_size = data2.size
sample_mean = np.average(data2)
sample_variance = sum([(x_i - sample_mean) ** 2 for x_i in data2]) / sample_size
data2

0     165.0
1     170.0
2     174.0
3     179.0
4     166.0
5     180.0
6     172.0
7     176.0
8     177.0
9     181.0
10    165.0
11    167.0
12    175.0
13    178.0
14    172.0
15    172.0
16    179.0
17    162.0
18    168.0
19    183.0
20    165.0
21    164.0
22    171.0
23    176.0
24    182.0
25    179.0
26    187.0
27    173.0
28    181.0
29    179.0
30    195.0
31    170.0
32    188.0
33    181.0
Name: Moscow, dtype: float64

In [161]:
significance_level_unknown1 = 0.9
confidence_interval_unknown1 = get_confidence_interval_unknown(sample_variance, sample_mean, sample_size, significance_level_unknown1)

confidence_interval_unknown1

(173.81407202598018, 176.30357503284336)

In [162]:
significance_level_unknown2 = 0.7
confidence_interval_unknown2 = get_confidence_interval_unknown(sample_variance, sample_mean, sample_size, significance_level_unknown2)

confidence_interval_unknown2

(171.23810298699837, 178.87954407182517)