In [18]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# independent samples

def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 (or boolean) observations
        Each sample's proportion is estimated as ``sum(sample) / len(sample)``.
    alpha : float, optional
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ZeroDivisionError
        If either sample is empty.
    """
    # Two-sided standard-normal quantile for the requested level.
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # The interval is symmetric around the observed difference, so the
    # half-width is computed once and applied in both directions.
    centre = share1 - share2
    half_width = quantile * np.sqrt(share1 * (1 - share1)/ size1 + share2 * (1 - share2)/ size2)

    return (centre - half_width, centre + half_width)

def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of two proportions (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 (or boolean) observations

    Returns
    -------
    float
        ``(p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))`` where ``P`` is the
        proportion pooled over both samples.

    Raises
    ------
    ZeroDivisionError
        If either sample is empty.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion: both samples share one success rate under the null.
    pooled = float(share1*size1 + share2*size2) / (size1 + size2)
    pooled_std_error = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / pooled_std_error

In [85]:
# related (paired) samples

def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for p1 - p2 on two related (paired) 0/1 samples.

    Observations are matched by position via ``zip``. Only discordant pairs
    enter the estimate: ``f`` counts pairs ``(1, 0)`` and ``g`` counts pairs
    ``(0, 1)``.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 (or boolean) observations
    alpha : float, optional
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n_pairs = len(pairs)

    # Tally the two kinds of discordant pairs in one pass.
    only_first = 0    # pairs (1, 0)
    only_second = 0   # pairs (0, 1)
    for first, second in pairs:
        if first == 1 and second == 0:
            only_first += 1
        elif first == 0 and second == 1:
            only_second += 1

    centre = float(only_first - only_second) / n_pairs
    half_width = quantile * np.sqrt(float((only_first + only_second)) / n_pairs**2
                                    - float((only_first - only_second)**2) / n_pairs**3)

    return (centre - half_width, centre + half_width)

def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions on related (paired) 0/1 samples.

    Observations are matched by position via ``zip``. With ``f`` the number
    of ``(1, 0)`` pairs, ``g`` the number of ``(0, 1)`` pairs and ``n`` the
    number of pairs, the statistic is ``(f - g) / sqrt(f + g - (f - g)**2 / n)``.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 (or boolean) observations

    Returns
    -------
    float
        The z-statistic.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs.
    """
    pairs = list(zip(sample1, sample2))
    n_pairs = len(pairs)

    # Tally the two kinds of discordant pairs in one pass.
    only_first = 0    # pairs (1, 0)
    only_second = 0   # pairs (0, 1)
    for first, second in pairs:
        if first == 1 and second == 0:
            only_first += 1
        elif first == 0 and second == 1:
            only_second += 1

    discordant_diff = only_first - only_second
    denominator = np.sqrt(only_first + only_second - float(discordant_diff**2) / n_pairs )

    return float(discordant_diff) / denominator

In [86]:
# proportion z test

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Observed z-statistic(s).
    alternative : {'two-sided', 'less', 'greater'}, optional
        Direction of the alternative hypothesis.

    Returns
    -------
    float or ndarray
        ``2 * P(Z > |z|)`` for 'two-sided', ``P(Z < z)`` for 'less' and
        ``P(Z > z)`` for 'greater'.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function sf(z) = 1 - cdf(z) directly.
    # Computing `1 - cdf(z)` rounds cdf(z) to 1.0 for large z and would
    # report a p-value of exactly 0 instead of a tiny positive number.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' is left after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [87]:
# 3
# One-sided ('greater') z-test on two independent 0/1 samples:
# is the share of ones in `test` larger than in `control`?
test = [1] * 10 + [0] * 24       # 10 ones out of 34 observations
control = [1] * 4 + [0] * 12     # 4 ones out of 16 observations
z_stat_ind = proportions_diff_z_stat_ind(test, control)
proportions_diff_z_test(z_stat_ind, 'greater')

0.37293045872523534

In [88]:
# 4 
# Compare the test-set error rates of two logistic regressions trained on
# different column subsets of the same data, using the paired-samples z-test.
# `sep` is passed by keyword: pandas 2.0+ accepts only the file path positionally.
df = pd.read_csv('./banknotes.txt', sep='\t')

# The last column is used as the target and all other columns as features;
# 50 rows are held out for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1],
                                                    test_size=50, random_state=1)
clf = LogisticRegression()

# Model 1: first three feature columns.
clf.fit(X_train.iloc[:, :3], y_train)
pred1 = clf.predict(X_test.iloc[:, :3])

# Model 2: the remaining feature columns (index 3 onward). `clf` is refitted
# in place, which is safe because `pred1` has already been computed.
clf.fit(X_train.iloc[:, 3:], y_train)
pred2 = clf.predict(X_test.iloc[:, 3:])

# Both models are scored on the same test rows, so their error indicators
# are paired; the result is the two-sided p-value.
proportions_diff_z_test(proportions_diff_z_stat_rel(pred1 != y_test, pred2 != y_test))

0.0032969384555543435

In [89]:
# 5
# Interval at the default alpha = 0.05 for the difference between the two
# models' error rates, measured on the same (paired) test rows.
errors_model1 = pred1 != y_test
errors_model2 = pred2 != y_test
proportions_diff_confint_rel(errors_model1, errors_model2)

(0.059945206279614305, 0.30005479372038568)

In [90]:
# 6, 7
from scipy.stats import norm

# Both statistics are (value - centre) / scale with the same centre and scale;
# presumably a hypothesised mean and sigma / sqrt(n) — confirm against the task.
centre = 525
scale = 100/10
z6 = (541.4 - centre)/scale
z7 = (541.5 - centre)/scale

# Upper-tail probability of each statistic under the standard normal.
1 - norm.cdf(z6), 1 - norm.cdf(z7)

(0.050502583474103968, 0.049471468033648103)