In [49]:
import math
import pandas as pd
from scipy.stats import norm
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

In [2]:
data = []
with open('english-water.txt') as fp:
    print(fp.readline())
    for line in fp:
        parts = line.strip().split('\t')
        if not parts:
            continue

        data.append(dict(
            location=parts[0],
            town=parts[1],
            mortality=int(parts[2]),
            hardness=int(parts[3]),
        ))

data = pd.DataFrame(data)

location	town	mortality	hardness



In [3]:
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [4]:
data.describe()

Unnamed: 0,mortality,hardness
count,61.0,61.0
mean,1524.147541,47.180328
std,187.668754,38.093966
min,1096.0,5.0
25%,1379.0,14.0
50%,1555.0,39.0
75%,1668.0,75.0
max,1987.0,138.0


In [20]:
norm_low = norm.ppf(0.025)
norm_hi = norm.ppf(0.975)

In [25]:
norm_low

-1.9599639845400545

In [26]:
norm_hi

1.959963984540054

In [50]:
_tconfint_generic?

[0;31mSignature:[0m [0m_tconfint_generic[0m[0;34m([0m[0mmean[0m[0;34m,[0m [0mstd_mean[0m[0;34m,[0m [0mdof[0m[0;34m,[0m [0malpha[0m[0;34m,[0m [0malternative[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
generic t-confint based on summary statistic

Parameters
----------
mean : float or ndarray
    Value, for example mean, of the first sample.
std_mean : float or ndarray
    Standard error of the difference value1 - value2
dof : int or float
    Degrees of freedom
alpha : float
    Significance level for the confidence interval, coverage is
    ``1-alpha``.
alternative : str
    The alternative hypothesis, H1, has to be one of the following

       * 'two-sided' : H1: ``value1 - value2 - diff`` not equal to 0.
       * 'larger' :   H1: ``value1 - value2 - diff > 0``
       * 'smaller' :  H1: ``value1 - value2 - diff < 0``

Returns
-------
lower : float or ndarray
    Lower confidence limit. This is -inf for the one-sided alternative
    "smaller".

In [21]:
mort_mean = data.mortality.mean()
mort_std = data.mortality.std()
mort_cnt = data.mortality.count()

In [51]:
_tconfint_generic(mort_mean, mort_std / math.sqrt(mort_cnt), mort_cnt - 1, 0.05, 'two-sided')

(1476.0833413552848, 1572.2117406119285)

In [28]:
mort_south_mean = data[data.location == 'South'].mortality.mean()
mort_south_std = data[data.location == 'South'].mortality.std()
mort_south_cnt = data[data.location == 'South'].mortality.count()

In [29]:
mort_south_cnt

26

In [52]:
_tconfint_generic(mort_south_mean, mort_south_std / math.sqrt(mort_south_cnt), mort_south_cnt - 1, 0.05, 'two-sided')

(1320.1517462936238, 1433.463638321761)

In [33]:
mort_north_mean = data[data.location == 'North'].mortality.mean()
mort_north_std = data[data.location == 'North'].mortality.std()
mort_north_cnt = data[data.location == 'North'].mortality.count()

In [34]:
mort_north_cnt

35

In [35]:
round(mort_north_mean + norm_low * mort_north_std / math.sqrt(mort_north_cnt), 4)

1588.2336

In [36]:
round(mort_north_mean + norm_hi * mort_north_std / math.sqrt(mort_north_cnt), 4)

1678.9664

In [37]:
hardness_stats = data.groupby('location')['hardness'].agg(['mean', 'std', 'count'])

In [38]:
hardness_stats

Unnamed: 0_level_0,mean,std,count
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
North,30.4,26.134494,35
South,69.769231,40.360682,26


In [46]:
for loc, row in hardness_stats.iterrows():
    print('Loc:', loc)
    print('Low:', row['mean'] + norm_low * row['std'] / math.sqrt(row['count']))
    print('Hi:', row['mean'] + norm_hi * row['std'] / math.sqrt(row['count']))

Loc: North
Low: 21.741789085882075
Hi: 39.058210914117915
Loc: South
Low: 54.25536932516893
Hi: 85.2830922132926
