In [35]:
import itertools
import numpy as np
import pandas as pd
from scipy import stats

In [3]:
stats.wilcoxon?

[0;31mSignature:[0m
[0mstats[0m[0;34m.[0m[0mwilcoxon[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mx[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzero_method[0m[0;34m=[0m[0;34m'wilcox'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorrection[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malternative[0m[0;34m=[0m[0;34m'two-sided'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Calculate the Wilcoxon signed-rank test.

The Wilcoxon signed-rank test tests the null hypothesis that two
related paired samples come from the same distribution. In particular,
it tests whether the distribution of the differences x - y is symmetric
about zero. It is a non-parametric version of the paired T-test.

Parameters
----------
x : array_like
    Either the firs

In [6]:
# One-sample Wilcoxon signed-rank test on the shifted values X - m0, i.e. a test
# of whether the differences from the hypothesised centre m0 are symmetric about zero.
X = np.array([49,58,75,110,112,132,151,276,281,362])
m0 = 200
# NOTE(review): mode='approx' presumably selects the normal approximation for the
# p-value -- confirm against the scipy documentation for the installed version.
cancer_result = stats.wilcoxon(X - m0, mode='approx')
cancer_result

WilcoxonResult(statistic=17.0, pvalue=0.2845026979112075)

In [7]:
# p-value of the Wilcoxon test, rounded to 4 decimal places.
round(cancer_result.pvalue, 4)

0.2845

In [11]:
# Two independent samples of different sizes (12 and 9 observations).
# NOTE(review): names suggest measurements from cut vs. uncut forest plots -- confirm the data source.
cut_forest = np.array([22,22,15,13,19,19,18,20,21,13,13,15])
non_cut_forest = np.array([17,18,18,15,12,4,14,15,10])
# One-sided Mann-Whitney U test; alternative='greater' tests the first sample
# (cut_forest) against the second in the "greater" direction.
forest_res = stats.mannwhitneyu(cut_forest, non_cut_forest, alternative='greater')
forest_res

MannwhitneyuResult(statistic=81.0, pvalue=0.02900499272087373)

In [12]:
# p-value of the Mann-Whitney test, rounded to 4 decimal places.
round(forest_res.pvalue, 4)

0.029

In [13]:
# Peek at the first line of the data file to inspect its header row.
with open('challenger.txt') as fp:
    print(fp.readline())

	Temperature	Incident



In [14]:
# Parse the tab-separated Challenger file into one record per row:
# column 0 -> date (kept as text), column 1 -> t (float), column 2 -> incident (int).
records = []
with open('challenger.txt') as fp:
    fp.readline()  # skip the header row
    for line in fp:
        line = line.strip()
        # Test the stripped line itself: ''.split('\t') returns [''], which is
        # truthy, so checking the split result would never skip a blank line
        # and the field access below would raise IndexError.
        if not line:
            continue
        fields = line.split('\t')
        records.append(dict(
            date=fields[0],
            t=float(fields[1]),
            incident=int(fields[2]),
        ))
# Build the frame once from the collected records; challenger_data is only
# ever bound to a DataFrame, never to the intermediate list.
challenger_data = pd.DataFrame(records)

In [15]:
# Preview the first rows of the parsed frame.
challenger_data.head()

Unnamed: 0,date,t,incident
0,Apr12.81,18.9,0
1,Nov12.81,21.1,1
2,Mar22.82,20.6,0
3,Nov11.82,20.0,0
4,Apr04.83,19.4,0


In [16]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array supporting integer-array indexing (e.g. a 1-D NumPy array)
        Observations to resample.
    n_samples : int
        Number of resamples to draw.

    Returns
    -------
    Array with one resample per row, ``n_samples`` rows of ``len(data)`` values each.

    Uses the global NumPy random state, so seed it beforehand for repeatable draws.
    """
    size = len(data)
    # Each row holds `size` positions drawn uniformly from 0 .. size - 1.
    picks = np.random.randint(0, size, (n_samples, size))
    return data[picks]


def stat_intervals(stat, alpha):
    """Return the two-sided percentile interval of ``stat`` at level ``1 - alpha``.

    Parameters
    ----------
    stat : array_like
        Values of the statistic (e.g. computed on bootstrap resamples).
    alpha : float
        Total tail mass; ``alpha / 2`` is cut from each side.

    Returns
    -------
    Array ``[lower, upper]`` with the ``100 * alpha / 2`` and
    ``100 * (1 - alpha / 2)`` percentiles of ``stat``.
    """
    lower_q = 100 * alpha / 2.
    upper_q = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_q, upper_q])

In [19]:
# Fix the global NumPy seed so the bootstrap draws below are repeatable.
np.random.seed(0)
# 1000 bootstrap resamples of the temperature column `t`, drawn separately for
# rows with incident == 0 and rows with incident == 1.
challenger_no_incident_samples = get_bootstrap_samples(challenger_data[challenger_data.incident == 0].t.values, 1000)
challenger_incident_samples = get_bootstrap_samples(challenger_data[challenger_data.incident == 1].t.values, 1000)

In [25]:
# Mean temperature of every bootstrap resample (one value per row of each sample matrix).
challenger_no_incident_means = np.array([np.mean(x) for x in challenger_no_incident_samples])
challenger_incident_means = np.array([np.mean(x) for x in challenger_incident_samples])
# Element-wise difference of the two mean arrays: (no incident) - (incident).
# The i-th resample of one group is paired with the i-th resample of the other.
challenger_mean_diffs = challenger_no_incident_means - challenger_incident_means

In [30]:
# Percentile interval for the bootstrap mean differences with alpha = 0.05,
# i.e. the 2.5th and 97.5th percentiles.
challenger_interval = stat_intervals(challenger_mean_diffs, 0.05)
challenger_interval

array([1.42299107, 7.93861607])

In [31]:
# Lower bound of the interval, rounded to 4 decimal places.
round(challenger_interval[0], 4)

1.423

In [43]:
def permutation_t_stat_ind(sample1, sample2):
    """Return the test statistic for two independent samples:
    ``mean(sample1) - mean(sample2)``.
    """
    first_mean = np.mean(sample1)
    second_mean = np.mean(sample2)
    return first_mean - second_mean


def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of the positions ``0 .. n1 + n2 - 1`` into two groups.

    The identity ordering is always included; the list of positions is then
    shuffled in place ``max_combinations - 1`` times with the global NumPy
    random state, and every ordering seen is recorded in a set. Repeated
    orderings collapse, so fewer than ``max_combinations`` splits may be returned.

    Returns
    -------
    list of ``(first, second)`` pairs of tuples, where ``first`` holds the
    first ``n1`` positions of an ordering and ``second`` the remaining ones.
    """
    order = list(range(n1 + n2))
    seen = {tuple(order)}
    for _ in range(max_combinations - 1):
        np.random.shuffle(order)
        seen.add(tuple(order))

    splits = []
    for perm in seen:
        splits.append((perm[:n1], perm[n1:]))
    return splits


def permutation_zero_dist_ind(sample1, sample2, max_combinations = None):
    """Build the permutation null distribution of the difference in means.

    The two samples are pooled and repeatedly re-split into groups of sizes
    ``len(sample1)`` and ``len(sample2)``; for each split the value
    ``mean(first group) - mean(second group)`` is recorded.

    Parameters
    ----------
    sample1, sample2 : array_like
        The two independent samples.
    max_combinations : int or None
        If truthy, use random splits from ``get_random_combinations`` (which
        draws from the global NumPy random state). If ``None`` (or 0),
        enumerate every way of choosing ``len(sample1)`` positions out of the
        pooled sample -- this grows combinatorially with sample size.

    Returns
    -------
    list of float, one mean difference per split.
    """
    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)

    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        indices = []
        for first in itertools.combinations(range(n), n1):
            # Materialise the complement right away. A lazy
            # filter(lambda i: i not in index, ...) would late-bind the loop
            # variable, so every complement would be evaluated against the
            # LAST combination once the loop has finished.
            chosen = set(first)
            rest = [i for i in range(n) if i not in chosen]
            indices.append((list(first), rest))

    distr = [joined_sample[list(first)].mean() - joined_sample[list(rest)].mean()
             for first, rest in indices]
    return distr


def permutation_test(sample, mean, max_permutations = None, alternative = 'two-sided'):
    """Permutation test for a difference in means between two independent samples.

    Parameters
    ----------
    sample : array_like
        First sample.
    mean : array_like
        Second sample (despite its name, it is passed on as the second sample
        to ``permutation_t_stat_ind`` and ``permutation_zero_dist_ind``).
    max_permutations : int or None
        Forwarded to ``permutation_zero_dist_ind`` as ``max_combinations``.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the null distribution count as at least as extreme
        as the observed statistic.

    Returns
    -------
    float
        Fraction of null-distribution values at least as extreme as the
        observed difference of means.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised options.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutations)

    if alternative == 'two-sided':
        threshold = abs(observed)
        flags = (abs(value) >= threshold for value in null_values)
    elif alternative == 'less':
        flags = (value <= observed for value in null_values)
    else:
        # 'greater' is the only option left after the validation above.
        flags = (value >= observed for value in null_values)

    return sum(1. if flag else 0. for flag in flags) / len(null_values)

In [49]:
# Re-seed so the random permutations are repeatable, then run a two-sided
# permutation test on temperature between launches without and with an incident,
# using up to 10000 random permutations.
np.random.seed(0)
permutation_test(
    challenger_data[challenger_data.incident == 0].t.values, 
    challenger_data[challenger_data.incident == 1].t.values, 
    10000, alternative='two-sided'
)

0.007