In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy import stats

In [3]:
data = pd.read_csv('water.txt', delimiter = '\t')

In [4]:
data

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18
...,...,...,...,...
56,South,Walsall,1527,60
57,South,West Bromwich,1627,53
58,South,West Ham,1486,122
59,South,Wolverhampton,1485,81


In [16]:
data[data['location'] == 'South'].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [5]:
data[data['location'] == 'North'].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


In [17]:
(203 * 515 - 718 * 239) / np.sqrt((203 + 718) * (203 + 239) * (239 + 515) * (718 + 515))

-0.10900237458678963

In [18]:

obs = np.array([[203, 239], [718, 515]])
stats.chi2_contingency(obs)

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[243.03402985, 198.96597015],
        [677.96597015, 555.03402985]]))

In [19]:
1.0558987006638725e-05

1.0558987006638725e-05

In [22]:
# 203 женщины, 239 мужчин
women = 203 / (203 + 718)
men = 239 / (515 + 239)

In [24]:
men - women

0.09656353231552606

In [25]:
def proportions_confint_diff_independant(suc1, len1, suc2, len2, alpha = 0.05):    
    z = stats.norm.ppf(1 - alpha / 2.)   
    p1 = suc1 / len1
    p2 = suc2 / len2
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len1 + p2 * (1 - p2)/ len2)
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len1 + p2 * (1 - p2)/ len2)
    
    return (left_boundary, right_boundary)

proportions_confint_diff_independant(239, 239 + 515, 203, 203 + 718)

(0.053905233215813156, 0.13922183141523897)

In [26]:
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import samplesize_confint_proportion
from statsmodels.stats.weightstats import *
from scipy import stats
import scipy

def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)   
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    return (left_boundary, right_boundary)

def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    sample = list(zip(sample1, sample2))
    n = len(sample)
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    left_boundary = float(f - g) / n  - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    right_boundary = float(f - g) / n  + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (left_boundary, right_boundary)

def proportions_diff_z_stat_rel(sample1, sample2):
    sample = list(zip(sample1, sample2))
    n = len(sample)
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)
    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [27]:
p = (203 + 239) / (203 + 239 + 515 + 718)
z_stat = (men - women) / np.sqrt(p * (1 - p) * (1 / (203 + 718)) * 1 / (239 + 515))

In [30]:
a = proportions_diff_z_test(z_stat)

In [38]:
print(f"{a:.40f}")

0.0000000000000000000000000000000000000000


In [40]:
n1 = 203 + 718
n2 = 239 + 515
p1 = 203 / n1
p2 = 239 / n2 
P = float(p1*n1 + p2*n2) / (n1 + n2)
z_stat = (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [41]:
a = proportions_diff_z_test(z_stat)

In [42]:
print(f"{a:.40f}")

0.0000081534530895766010871739126741886139


In [6]:
table = [[197, 111, 33], [382, 685, 331], [110, 342, 333]]

In [7]:
stats.chi2_contingency(table)

(293.68311039689746,
 2.4964299580093467e-62,
 4,
 array([[ 93.08597464, 153.74722662,  94.16679873],
        [381.6251981 , 630.318542  , 386.0562599 ],
        [214.28882726, 353.93423138, 216.77694136]]))

In [8]:
def cramers_v(confusion_matrix):
    if confusion_matrix.shape[0] == 1:
        return 0
    else:
        n = confusion_matrix.sum().sum()
        if n == 0:
            return 0
        chi2 = stats.chi2_contingency(confusion_matrix)[0]
        phi2 = chi2 / n
        r,k = confusion_matrix.shape
        phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
        rcorr = r - ((r-1)**2)/(n-1)
        kcorr = k - ((k-1)**2)/(n-1)
        return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))