<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Confidence Intervals for Statistical Measurements


---

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [3]:
from scipy import stats as stats
from sklearn.datasets import load_boston

### Load the Boston housing data

In [4]:
data = load_boston()

### Determine 95 % confidence intervals for the mean of all the variables (target and predictors)

Calculate with both formulas using either normal or t-distribution.

**Bonus:** Calculate 90 % and 99 % confidence intervals.

#### Target variable

In [5]:
data.target.mean() + stats.norm.ppf([0.05, 0.95])*stats.sem(data.target)

array([21.86028958, 23.20532307])

In [6]:
for level in [0.9, 0.95, 0.99]:
    print(level, np.around(stats.norm.interval(
        level, data.target.mean(axis=0), stats.sem(data.target, axis=0)), 4))

0.9 [21.8603 23.2053]
0.95 [21.7315 23.3342]
0.99 [21.4796 23.586 ]


In [12]:
list(zip(data.feature_names,
         *np.around(stats.t(df=len(data.data)-1,
                            loc=data.data.mean(axis=0),
                            scale=stats.sem(data.data, axis=0)).interval(0.9), 4)))

[('CRIM', array([2.983400e+00, 9.655100e+00, 1.063420e+01, 5.060000e-02,
         5.462000e-01, 6.233200e+00, 6.651280e+01, 3.640800e+00,
         8.911500e+00, 3.958906e+02, 1.829690e+01, 3.499860e+02,
         1.212990e+01])),
 ('ZN', array([4.243600e+00, 1.307220e+01, 1.163930e+01, 8.780000e-02,
         5.632000e-01, 6.336100e+00, 7.063700e+01, 3.949300e+00,
         1.018730e+01, 4.205837e+02, 1.861410e+01, 3.633620e+02,
         1.317620e+01]))]

In [8]:
list(zip(data.feature_names,
         *np.around(stats.t(df=len(data.data)-1,
                            loc=data.data.mean(axis=0),
                            scale=stats.sem(data.data, axis=0)).interval(0.95), 4)))

[('CRIM', 2.8623, 4.3648),
 ('ZN', 9.3266, 13.4006),
 ('INDUS', 10.5376, 11.736),
 ('CHAS', 0.047, 0.0914),
 ('NOX', 0.5446, 0.5648),
 ('RM', 6.2233, 6.346),
 ('AGE', 66.1164, 71.0334),
 ('DIS', 3.6111, 3.979),
 ('RAD', 8.7889, 10.3099),
 ('TAX', 393.5171, 422.9572),
 ('PTRATIO', 18.2664, 18.6446),
 ('B', 348.7003, 364.6478),
 ('LSTAT', 12.0294, 13.2768)]

In [9]:
list(zip(data.feature_names,
         *np.around(stats.t(df=len(data.data)-1,
                            loc=data.data.mean(axis=0),
                            scale=stats.sem(data.data, axis=0)).interval(0.99), 4)))

[('CRIM', 2.6248, 4.6022),
 ('ZN', 8.6829, 14.0444),
 ('INDUS', 10.3482, 11.9253),
 ('CHAS', 0.04, 0.0984),
 ('NOX', 0.5414, 0.568),
 ('RM', 6.2039, 6.3654),
 ('AGE', 65.3394, 71.8104),
 ('DIS', 3.553, 4.0371),
 ('RAD', 8.5486, 10.5503),
 ('TAX', 388.8648, 427.6095),
 ('PTRATIO', 18.2067, 18.7044),
 ('B', 346.1802, 367.1678),
 ('LSTAT', 11.8322, 13.4739)]

### Bonus: How many more observations would we need to make the confidence interval 10 times narrower assuming that the sample mean and standard deviation remain the same?

In [10]:
confint = data.target.mean() + \
    stats.norm.ppf([0.05, 0.95])*data.target.std(ddof=1)/len(data.target)**0.5

confint[1]-confint[0]

1.3450334827617212

In [11]:
confint_narrow = data.target.mean() \
    + stats.norm.ppf([0.05, 0.95])*data.target.std(ddof=1) / \
    (len(data.target)*100)**0.5

confint_narrow[1]-confint_narrow[0]

0.13450334827616928