In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
water_data = pd.read_csv('english-water.txt', sep='\t')
water_data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [4]:
water_data.corr('pearson')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


In [5]:
water_data.corr('spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


In [8]:
water_data[water_data.location == 'South'].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [9]:
water_data[water_data.location == 'North'].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


In [11]:
bar_obs = np.array([
    # often, seldom
    [239, 515],  # male
    [203, 718],  # female
])

In [13]:
mcc = (bar_obs[0][0] * bar_obs[1][1] - bar_obs[0][1] * bar_obs[1][0]) / np.sqrt((bar_obs[0][0] + bar_obs[0][1]) * (bar_obs[0][0] + bar_obs[1][0]) * (bar_obs[0][1] + bar_obs[1][1]) * (bar_obs[1][0] + bar_obs[1][1]))
round(mcc, 4)

0.109

In [16]:
bar_g, bar_p, _, __ = stats.chi2_contingency(bar_obs)

In [17]:
bar_p

1.0558987006638725e-05

In [24]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [20]:
bar_male_count = sum(bar_obs[0])
bar_female_count = sum(bar_obs[1])

bar_male_sample = np.zeros(bar_male_count)
bar_female_sample = np.zeros(bar_female_count)

for i in range(bar_obs[0][0]):
    bar_male_sample[i] = 1

for i in range(bar_obs[1][0]):
    bar_female_sample[i] = 1

In [27]:
interval = proportions_diff_confint_ind(bar_male_sample, bar_female_sample)
interval

(0.053905233215813156, 0.13922183141523897)

In [28]:
round(interval[0], 4)

0.0539

In [31]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))


def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - stats.norm.cdf(z_stat)

In [32]:
proportions_diff_z_test(proportions_diff_z_stat_ind(bar_male_sample, bar_female_sample))

8.153453089576601e-06

In [33]:
social_survey = np.array([
    # Not OK, OK, Good
    [197, 111, 33],  # Happy
    [382, 685, 331],  # OK
    [110, 342, 333],  # Not happy
])

In [37]:
survey_g, survey_p, _, __ = stats.chi2_contingency(social_survey)

In [38]:
round(survey_g, 4)

293.6831

In [39]:
survey_p

2.4964299580093467e-62

In [41]:
survey_corr = np.sqrt(survey_g / (np.sum(social_survey)) * (3 - 1))
round(survey_corr, 4)

0.4824