In [5]:
import math
import numpy as np
import pandas as pd
import scipy
from scipy import stats
from statsmodels.stats.proportion import proportion_confint
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.weightstats import DescrStatsW


In [72]:
# A/B experiment set-up: total size, per-group sizes, and the number of
# triggered observations in each group.
all_count = 50
exp_count = 34
ctrl_count = 16
exp_triggered = 10
ctrl_triggered = 4

# Encode each group as a 0/1 vector: the leading `*_triggered` entries are 1,
# the remaining entries stay 0.
ctrl_sample = np.zeros(ctrl_count)
ctrl_sample[:ctrl_triggered] = 1

exp_sample = np.zeros(exp_count)
exp_sample[:exp_triggered] = 1

In [73]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Confidence interval for p1 - p2 of two independent 0/1 samples.

    Uses the normal approximation with unpooled variances.

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values (must be non-empty)
    alpha : significance level; the interval has coverage 1 - alpha

    Returns
    -------
    (left_boundary, right_boundary) tuple for the difference p1 - p2.
    """
    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Two-sided normal quantile times the standard error of the difference.
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    half_width = quantile * np.sqrt(share1 * (1 - share1)/ size1 + share2 * (1 - share2)/ size2)

    delta = share1 - share2
    return (delta - half_width, delta + half_width)

In [1]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of proportions of two independent 0/1 samples.

    The standard error is built from the pooled proportion of both samples.

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values (must be non-empty)

    Returns
    -------
    (p1 - p2) divided by the pooled standard error.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share1*size1 + share2*size2) / (size1 + size2)
    pooled_se = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / pooled_se

In [2]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal distribution.

    Parameters
    ----------
    z_stat : z-statistic (scalar or array-like)
    alternative : 'two-sided', 'less' or 'greater'

    Returns
    -------
    The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognized values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper-tail probabilities use the survival function rather than
    # `1 - cdf(z)`: for large z the cdf rounds to exactly 1.0 and the
    # subtraction collapses the p-value to 0.0.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [76]:
# z-statistic for the difference in triggered proportions, experiment minus
# control, treating the two groups as independent samples.
z = proportions_diff_z_stat_ind(exp_sample, ctrl_sample)
z

0.32410186177608225

In [77]:
# One-sided p-value: 'greater' is the upper-tail probability of z, i.e. the
# alternative that the experiment proportion exceeds the control proportion.
pv = proportions_diff_z_test(z, 'greater')
round(pv, 4)

0.3729

In [68]:
# Print only the first line of the file to inspect its header row.
with open('banknotes.txt') as fp:
    print(fp.readline())

X1	X2	X3	X4	X5	X6	real



In [69]:
# Parse the tab-separated banknotes file into a DataFrame with six float
# columns (x1..x6) and an integer `real` column.
banknotes_data = []
with open('banknotes.txt') as fp:
    fp.readline()  # skip the header row
    for line in fp:
        line = line.strip()
        # Test the stripped line itself: ''.split('\t') returns [''], which is
        # truthy, so a check on the split result never skips blank lines and
        # float('') would raise ValueError on them.
        if not line:
            continue
        data = line.split('\t')
        banknotes_data.append(dict(
            x1=float(data[0]),
            x2=float(data[1]),
            x3=float(data[2]),
            x4=float(data[3]),
            x5=float(data[4]),
            x6=float(data[5]),
            real=int(data[6]),
        ))
banknotes_data = pd.DataFrame(banknotes_data)

In [70]:
# Preview the first rows of the parsed frame.
banknotes_data.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [71]:
# Hold out 50 rows as the test set; random_state=1 fixes the split so it is
# reproducible across runs.
bank_train_data, bank_test_data = train_test_split(banknotes_data, test_size=50, random_state=1)

In [72]:
# Size of the training split (rows, columns).
bank_train_data.shape

(150, 7)

In [73]:
# Size of the held-out test split (rows, columns).
bank_test_data.shape

(50, 7)

In [74]:
# Two competing feature subsets: the first three columns vs. the last three.
# Each subset is used to fit its own classifier (est1 / est4).
feat_names1 = ['x1', 'x2', 'x3']
feat_names4 = ['x4', 'x5', 'x6']

In [108]:
# Logistic regression without an intercept term, fitted on x1..x3 of the
# training split with `real` as the target.
est1 = LogisticRegression(fit_intercept=False)
est1.fit(bank_train_data[feat_names1], bank_train_data.real)

LogisticRegression(fit_intercept=False)

In [109]:
# Same model configuration (no intercept), fitted on x4..x6 of the training
# split with `real` as the target.
est4 = LogisticRegression(fit_intercept=False)
est4.fit(bank_train_data[feat_names4], bank_train_data.real)

LogisticRegression(fit_intercept=False)

In [110]:
# Class predictions of the x1..x3 model on the held-out test rows.
forecast1 = est1.predict(bank_test_data[feat_names1])
forecast1

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0])

In [111]:
# Class predictions of the x4..x6 model on the held-out test rows.
forecast4 = est4.predict(bank_test_data[feat_names4])
forecast4

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0])

In [112]:
# 0/1 indicator per test row: 1 where the x1..x3 model's prediction differs
# from the true `real` label.
misses1 = (bank_test_data.real.values != forecast1).astype(np.int64)
misses1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0])

In [113]:
# 0/1 indicator per test row: 1 where the x4..x6 model's prediction differs
# from the true `real` label.
misses4 = (bank_test_data.real.values != forecast4).astype(np.int64)
misses4

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [114]:
# Number of test rows misclassified by the x1..x3 model.
sum(misses1)

10

In [115]:
# Attach both models' predictions to the test frame. `assign` builds a new
# DataFrame instead of writing columns into the frame returned by
# train_test_split, which can raise pandas' SettingWithCopyWarning.
bank_test_data = bank_test_data.assign(
    fcst1=est1.predict(bank_test_data[feat_names1]),
    fcst4=est4.predict(bank_test_data[feat_names4]),
)

In [116]:
# Per-row miss flags: 1 where a model's forecast column differs from `real`,
# 0 otherwise.
bank_test_data['misses1'] = (bank_test_data.fcst1 != bank_test_data.real).astype(np.int64)

bank_test_data['misses4'] = (bank_test_data.fcst4 != bank_test_data.real).astype(np.int64)

In [117]:
# Miss count of the x1..x3 model, taken from the DataFrame column.
bank_test_data.misses1.sum()

10

In [118]:
# Miss count of the x4..x6 model, taken from the DataFrame column.
bank_test_data.misses4.sum()

1

In [119]:
# Share of test rows misclassified by the x1..x3 model (mean of the 0/1 flags).
misses1_prop = misses1.mean()
misses1_prop

0.2

In [120]:
# Share of test rows misclassified by the x4..x6 model (mean of the 0/1 flags).
misses4_prop = misses4.mean()
misses4_prop

0.02

In [121]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for p1 - p2 of two related (paired) 0/1 samples.

    Only discordant pairs contribute: `f` counts pairs equal to (1, 0) and
    `g` counts pairs equal to (0, 1).

    Parameters
    ----------
    sample1, sample2 : paired sequences of 0/1 values; pairing is positional
        and stops at the shorter sequence
    alpha : significance level; the interval has coverage 1 - alpha

    Returns
    -------
    (left_boundary, right_boundary) tuple for the difference p1 - p2.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    centre = float(f - g) / n
    half_width = quantile * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (centre - half_width, centre + half_width)

In [122]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions of two related (paired) 0/1 samples.

    Built from the discordant pair counts: `f` is the number of (1, 0) pairs
    and `g` is the number of (0, 1) pairs.

    Parameters
    ----------
    sample1, sample2 : paired sequences of 0/1 values; pairing is positional
        and stops at the shorter sequence

    Returns
    -------
    (f - g) / sqrt(f + g - (f - g)**2 / n), where n is the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    discordant_diff = f - g
    return float(discordant_diff) / np.sqrt(f + g - float(discordant_diff**2) / n )

In [123]:
# Paired z-statistic: both miss vectors come from the same test rows, so the
# related-samples version is used.
z = proportions_diff_z_stat_rel(misses1, misses4)
z

2.9386041680175268

In [124]:
# Two-sided p-value (the function's default alternative).
pv = proportions_diff_z_test(z)
pv

0.0032969384555543435

In [132]:
# Point estimate of the difference in miss rates between the two models.
misses1_prop - misses4_prop

0.18000000000000002

In [133]:
# Confidence interval for the paired difference in miss rates, at the
# function's default alpha of 0.05.
interval = proportions_diff_confint_rel(misses1, misses4)
interval

(0.059945206279614305, 0.3000547937203857)

In [134]:
# Lower bound of the interval, rounded to 4 decimals.
round(interval[0], 4)

0.0599

In [127]:
# Re-derive the paired z-statistic directly from the DataFrame miss columns.
# f: rows missed only by model 1; g: rows missed only by model 4.
only_first_missed = (bank_test_data.misses1 == 1) & (bank_test_data.misses4 == 0)
only_fourth_missed = (bank_test_data.misses1 == 0) & (bank_test_data.misses4 == 1)
f = int(only_first_missed.sum())
g = int(only_fourth_missed.sum())
zs = (f-g) / np.sqrt(f + g - (f - g)**2 / bank_test_data.shape[0])
zs

2.9386041680175268

In [131]:
# Two-sided p-value for the DataFrame-derived statistic.
proportions_diff_z_test(zs)

0.0032969384555543435

In [114]:
# One-sample z-statistic: observed mean 541.4 against the reference mean 525,
# with standard deviation 100 and sample size 100.
standard_m, observed_m = 525, 541.4
sig, N = 100, 100

standard_error = sig / math.sqrt(N)
Z = (observed_m - standard_m) / standard_error
Z

1.6399999999999977

In [63]:
# Upper-tail (one-sided) p-value for Z, rounded to 4 decimals.
round(1 - stats.norm.cdf(Z), 4)

0.0505

In [64]:
# Same one-sample z-statistic with the observed mean changed to 541.5; the
# reference mean, standard deviation and sample size are unchanged.
standard_m, observed_m = 525, 541.5
sig, N = 100, 100

standard_error = sig / math.sqrt(N)
Z = (observed_m - standard_m) / standard_error
Z

1.65

In [65]:
# Upper-tail (one-sided) p-value for the updated Z, rounded to 4 decimals.
round(1 - stats.norm.cdf(Z), 4)

0.0495