In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, linear_model, metrics
import scipy
from scipy import stats
from statsmodels.stats.weightstats import *

In [2]:
def proportions_diff_z_stat_ind(s1, n1, s2, n2):
 
    p1 = s1 / n1
    p2 = s2 / n2
    
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [3]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [4]:
proportions_diff_z_test(proportions_diff_z_stat_ind(10, 34, 4, 16), 'greater') #3

0.37293045872523534

In [6]:
X = df.drop(columns=['real'])
y = df['real']

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=1)

In [9]:
clf_1 = linear_model.LogisticRegression()
clf_2 = linear_model.LogisticRegression()

In [10]:
clf_1.fit(X_train.iloc[:, :3], y_train)
clf_2.fit(X_train.iloc[:, 3:], y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
predict_1 = clf_1.predict(X_test.iloc[:, :3])
predict_2 = clf_2.predict(X_test.iloc[:, 3:])

In [12]:
metrics.accuracy_score(y_test, predict_1)

0.8

In [13]:
metrics.accuracy_score(y_test, predict_2)

0.98

In [19]:
test_labels_1 = 1 - predict_1
test_labels_2 = 1 - predict_2
metrics.accuracy_score(y_test, test_labels_2)

0.02

In [39]:
test_1 = np.abs(predict_1 - y_test)
test_2 = np.abs(predict_2 - y_test)

In [20]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [21]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [22]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [23]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    sample = list(zip(sample1, sample2))
    n = len(sample)
        
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    left_boundary = float(f - g) / n  - z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    right_boundary = float(f - g) / n  + z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (left_boundary, right_boundary)

In [24]:
def proportions_diff_z_stat_rel(sample1, sample2):
    sample = list(zip(sample1, sample2))
    n = len(sample)
    
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [40]:
proportions_diff_confint_rel(test_1, test_2) #5   #4 - 3

(0.059945206279614305, 0.3000547937203857)

In [41]:
var = proportions_diff_z_stat_ind(test_1, test_2)

In [42]:
proportions_diff_z_test(var)

0.004022237272055307

In [45]:
mu = 525
sigma = 100
mu0 = 541.4
n = 100

In [46]:
z = (mu - mu0) / (sigma/(n**0.5)) #6, 7
scipy.stats.norm.cdf(z)

0.05050258347410395