In [3]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
import statsmodels
from statsmodels.stats.proportion import proportion_confint

In [6]:
def proportions_diff_confint_ind(positive_1, all_1, positive_2, all_2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    positive_1, all_1 : number of successes and total size of the first sample.
    positive_2, all_2 : number of successes and total size of the second sample.
    alpha : significance level; the two-sided normal quantile 1 - alpha/2 is used.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval for p1 - p2.
    """
    share_1 = positive_1 / all_1
    share_2 = positive_2 / all_2
    diff = share_1 - share_2

    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    # Unpooled standard error: each sample contributes its own variance term.
    half_width = quantile * np.sqrt(share_1 * (1 - share_1)/ all_1 + share_2 * (1 - share_2)/ all_2)

    return (diff - half_width, diff + half_width)

In [5]:
def proportions_diff_z_stat_ind(positive_1, all_1, positive_2, all_2):
    """Z-statistic for the difference of two proportions from independent samples.

    Parameters
    ----------
    positive_1, all_1 : number of successes and total size of the first sample.
    positive_2, all_2 : number of successes and total size of the second sample.

    Returns
    -------
    (p1 - p2) divided by the standard error built from the pooled proportion.
    """
    share_1 = positive_1 / all_1
    share_2 = positive_2 / all_2

    # Pooled success share across both samples.
    pooled = float(share_1*all_1 + share_2*all_2) / (all_1 + all_2)
    std_error = np.sqrt(pooled * (1 - pooled) * (1. / all_1 + 1. / all_2))

    return (share_1 - share_2) / std_error

In [7]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-test under the standard normal distribution.

    Parameters
    ----------
    z_stat : observed z-statistic.
    alternative : 'two-sided', 'less' or 'greater'.

    Returns
    -------
    The p-value for the chosen alternative.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognised strings.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # The survival function is used instead of `1 - cdf`: for large |z| the cdf
    # rounds to exactly 1.0 and the subtraction would return a p-value of 0.0.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [8]:
# Interval for p1 - p2 with 10 successes out of 34 in the first sample
# and 4 out of 16 in the second (default alpha = 0.05).
print("95%% confidence interval for a difference between proportions: [%f, %f]" %\
      proportions_diff_confint_ind(10,34,4,16))

95% confidence interval for a difference between proportions: [-0.217558, 0.305793]


In [9]:
# Two-sided z-test (the default alternative) for the same counts: 10/34 vs 4/16.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(10,34,4,16)))

p-value: 0.745861


In [155]:
# One-sided z-test with the 'greater' alternative for the same counts: 10/34 vs 4/16.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(10,34,4,16), 'greater'))

p-value: 0.372930


In [151]:
# answer_3
# NOTE(review): 0.6271 does not match the two-sided p-value printed by this
# notebook (0.745861) -- confirm where this value comes from.
ans_3=0.6271
# Matches the 'greater' p-value printed by this notebook (0.372930), rounded to four decimals.
ans_3_2=0.3729

In [15]:
# Load the tab-separated banknotes table; the path is relative to the working directory.
banknot=pd.read_csv("banknotes.txt",sep="\t")

In [16]:
banknot

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1
...,...,...,...,...,...,...,...
195,215.0,130.4,130.3,9.9,12.1,139.6,0
196,215.1,130.3,129.9,10.3,11.5,139.7,0
197,214.8,130.3,130.4,10.6,11.1,140.0,0
198,214.7,130.7,130.8,11.2,11.2,139.4,0


In [41]:
# Features are the first six columns, the target is the seventh
# (presumably X1..X6 and `real` -- confirm against the file's column order).
X=banknot.iloc[:,0:6]
y=banknot.iloc[:,6]

In [43]:
#X

In [24]:
import sklearn
from sklearn.model_selection import train_test_split

In [25]:
#help(model_selection.train_test_split)

In [44]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [35]:
#X_train.shape

In [36]:
from sklearn import datasets, linear_model, metrics

In [49]:
# Two identically configured logistic regressions; they are later fit on
# feature columns 0-2 and 3-5 respectively.
log_regressor_123 = linear_model.LogisticRegression(random_state = 1)
log_regressor_456 = linear_model.LogisticRegression(random_state = 1)

In [50]:
# Split the training features into the first three and the last three columns.
train_123=X_train.iloc[:,0:3]
train_456=X_train.iloc[:,3:6]

In [51]:
#train_456
# Fit each classifier on its own feature subset against the same training labels.
log_regressor_123.fit(train_123, y_train)
log_regressor_456.fit(train_456, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
# Predict on the test rows, using the same column subsets each model was trained on.
predictions_123 = log_regressor_123.predict(X_test.iloc[:,0:3])
predictions_456 = log_regressor_456.predict(X_test.iloc[:,3:6])

In [57]:
# Test-set accuracy of the model trained on feature columns 0-2.
print (metrics.accuracy_score(y_test, predictions_123))

0.8


In [58]:
# Test-set accuracy of the model trained on feature columns 3-5.
print (metrics.accuracy_score(y_test, predictions_456))

0.98


In [165]:
# Per-observation correctness indicator for each model: the expression is 1 where
# the prediction equals the label and 0 otherwise (assumes labels and predictions
# are coded 0/1 -- confirm).
good_answer_1=y_test*predictions_123+(1-y_test)*(1-predictions_123)
good_answer_2=y_test*predictions_456+(1-y_test)*(1-predictions_456)

In [167]:
#good_answer_2

In [162]:
#predictions_123

In [59]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for the difference of proportions in paired (related) samples.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 outcomes, paired element by element.
    alpha : significance level; the two-sided normal quantile 1 - alpha/2 is used.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval.

    Raises
    ------
    ValueError
        If the two samples differ in length (pairing would silently drop the tail).
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("sample1 and sample2 must have the same length")
    n = len(first)

    # Only discordant pairs carry information about the difference.
    f = sum(1 for a, b in zip(first, second) if a == 1 and b == 0)
    g = sum(1 for a, b in zip(first, second) if a == 0 and b == 1)

    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (center - half_width, center + half_width)

In [60]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions in paired (related) samples.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 outcomes, paired element by element.

    Returns
    -------
    (f - g) / sqrt(f + g - (f - g)**2 / n), where f counts (1, 0) pairs,
    g counts (0, 1) pairs and n is the number of pairs.

    Raises
    ------
    ValueError
        If the two samples differ in length (pairing would silently drop the tail).
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("sample1 and sample2 must have the same length")
    n = len(first)

    # Only discordant pairs carry information about the difference.
    f = sum(1 for a, b in zip(first, second) if a == 1 and b == 0)
    g = sum(1 for a, b in zip(first, second) if a == 0 and b == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [168]:
# Paired-sample interval for the difference between the two correctness indicators
# (first argument minus second).
print("95%% confidence interval for a difference between proportions: [%f, %f]" \
      % proportions_diff_confint_rel(good_answer_1, good_answer_2))

95% confidence interval for a difference between proportions: [-0.300055, -0.059945]


In [169]:
# Two-sided paired z-test (default alternative) on the two correctness indicators.
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_rel(good_answer_1, good_answer_2)))

p-value: 0.003297


In [63]:
# answer_4
# NOTE(review): 0.122823 is not printed anywhere in the visible notebook -- confirm its origin.
ans_4=1 # (0.122823 => 1)
# 0.003297 is the paired-test p-value printed by this notebook.
ans_4_2=3 # (0.003297 => 3)
# answer_5
# NOTE(review): 0.0270 does not match the left boundary printed by this notebook (-0.300055) -- confirm.
ans_5=0.0270
# Matches the printed right boundary (-0.059945), rounded to four decimals.
ans_5_2=-0.0599

In [69]:
#help(stats.ttest_ind)

Допустим, нам откуда-то известно, что дисперсия auc_scores $\sigma^2=0.25$. Построим доверительные интервалы для средних вида $$\bar{X}_n \pm z_{1-\frac{\alpha}{2}} \frac{\sigma}{\sqrt{n}}$$

In [72]:
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

In [83]:
# One-sided z-interval: _zconfint_generic is called with 525, 100/sqrt(100), alpha 0.05
# and the 'smaller' alternative. Accepted alternatives: 'two-sided', 'larger', 'smaller'.
# print() applies no %-formatting here, so the percent sign must not be doubled
# (the original "95%%" was emitted literally).
# NOTE(review): the "sgd model mean auc" label looks carried over from another example -- confirm.
print("sgd model mean auc 95% confidence interval",
      _zconfint_generic(525, 100/np.sqrt(100), 0.05, 'smaller'))

sgd model mean auc 95%% confidence interval (-inf, 541.4485362695148)


In [91]:
# Same one-sided interval as the print cell above, shown as the raw return value.
_zconfint_generic(525,100/np.sqrt(100),  0.05, 'smaller')

(-inf, 541.4485362695148)

In [97]:
#from statsmodels.stats.weightstats import ztest_mean

In [112]:
from statsmodels.stats import tests
from statsmodels.stats import stattools

In [120]:
#help(stats)

In [119]:
#help(statsmodels.stats.weightstats.DescrStatsW)

In [118]:
#ztest_mean()

In [126]:
# NOTE(review): exploratory cell -- this feeds (541.4-525)/100 = 0.164 to ppf as a
# lower-tail probability; the later cells use cdf of a z-score instead.
scipy.stats.norm.ppf((541.4-525)/100)

-0.9781502862624724

In [134]:
# NOTE(review): exploratory cell -- adds 1 to a quantile, which is not a probability;
# looks like a discarded attempt, confirm before relying on it.
(scipy.stats.norm.ppf((541.4-525)/100)+1)#*2

0.021849713737527643

In [135]:
# Interactive documentation lookup left over from exploration; produces no result used later.
help(scipy.stats.norm.ppf)

Help on method ppf in module scipy.stats._distn_infrastructure:

ppf(q, *args, **kwds) method of scipy.stats._continuous_distns.norm_gen instance
    Percent point function (inverse of `cdf`) at q of the given RV.
    
    Parameters
    ----------
    q : array_like
        lower tail probability
    arg1, arg2, arg3,... : array_like
        The shape parameter(s) for the distribution (see docstring of the
        instance object for more information)
    loc : array_like, optional
        location parameter (default=0)
    scale : array_like, optional
        scale parameter (default=1)
    
    Returns
    -------
    x : array_like
        quantile corresponding to the lower tail probability q.



In [140]:
scipy.stats.norm.ppf(0)

-inf

In [146]:
# Upper-tail standard normal probability at z = (541.4 - 525) * 10 / 100 = 1.64.
1-scipy.stats.norm.cdf((541.4-525)*10/100)

0.05050258347410397

In [147]:
# Upper-tail standard normal probability at z = (541.5 - 525) * 10 / 100 = 1.65.
1-scipy.stats.norm.cdf((541.5-525)*10/100)

0.0494714680336481

In [170]:
# answer_6
# Upper-tail probability computed in this notebook for 541.4 (0.05050258...), rounded to four decimals.
ans_6=0.0505
# answer_7
# Upper-tail probability computed in this notebook for 541.5 (0.04947146...), rounded to four decimals.
ans_7=0.0495