In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, linear_model, metrics
import scipy
from scipy import stats
from statsmodels.stats.weightstats import *

In [2]:
import itertools

from statsmodels.stats.descriptivestats import sign_test

In [3]:
# Sample of 10 observations analysed in the next two cells.
arr = np.array([49,58,75,110,112,132,151,276,281,362])

In [4]:
# z-based confidence interval for the mean of arr, using zconfint's default
# significance level. zconfint comes from the star import of
# statsmodels.stats.weightstats at the top of the notebook.
zconfint(arr)

(93.8088290359675, 227.39117096403248)

In [5]:
# One-sample Wilcoxon signed-rank test applied to the deviations of arr from 200.
stats.wilcoxon(arr - 200) #4

WilcoxonResult(statistic=17.0, pvalue=0.2845026979112075)

In [17]:
# Two samples of unequal size (12 and 9 observations), so they cannot be
# treated as paired measurements.
species_no = np.array([22, 22, 15, 13, 19, 19, 18, 20, 21, 13, 13, 15])
species_yes = np.array([17, 18, 18, 15, 12, 4, 14, 15, 10])

In [18]:
# One-sided Mann-Whitney U test; the alternative is that species_yes tends to
# take smaller values than species_no.
stats.mannwhitneyu(species_yes, species_no, alternative='less')

MannwhitneyuResult(statistic=27.0, pvalue=0.02900499272087373)

In [21]:
# Same hypothesis as the previous cell with both the sample order and the
# direction swapped; the recorded p-value is identical.
stats.mannwhitneyu(species_no, species_yes, alternative='greater')

MannwhitneyuResult(statistic=81.0, pvalue=0.02900499272087373)

In [52]:
# Signed-rank test of species_yes against the sample mean of species_no.
# NOTE(review): this treats species_no.mean() as a known constant and ignores
# its sampling variability; the samples have different sizes (9 vs 12), so a
# paired test is not applicable. Confirm this is the intended analysis rather
# than the two-sample Mann-Whitney test used in the cells above.
stats.wilcoxon(species_yes - species_no.mean()) #5



WilcoxonResult(statistic=4.0, pvalue=0.048550746423148196)

In [27]:
# Tab-separated file read from the working directory; its first column becomes
# the row index.
challenger = pd.read_csv('challenger.txt', sep='\t', index_col=0)

In [28]:
# Temperature values split by the Incident flag (1 = incident, 0 = none).
challenger_yes = challenger.loc[challenger.Incident == 1, 'Temperature']
challenger_no = challenger.loc[challenger.Incident == 0, 'Temperature']

challenger_yes

Nov12.81    21.1
Feb03.84    13.9
Apr06.84    17.2
Aug30.84    21.1
Jan24.85    11.7
Oct30.85    23.9
Jan12.86    14.4
Name: Temperature, dtype: float64

In [12]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample; a list, NumPy array or pandas Series.
    n_samples : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, len(data))``; each row is one resample.
    """
    # Resample by position on a plain ndarray: a list cannot be indexed with
    # an integer array at all, and a pandas Series would route the 2-D key
    # through its own (label-aware) indexing instead of positional lookup.
    values = np.asarray(data)
    indices = np.random.randint(0, len(values), (n_samples, len(values)))
    return values[indices]

In [13]:
def stat_intervals(stat, alpha):
    """Return the percentile interval of ``stat`` at significance ``alpha``.

    The result holds the ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)``
    percentiles of ``stat``, in that order.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [16]:
np.random.seed(0)

# Resample the underlying NumPy values so the bootstrap indexes by position
# and does not depend on the Series' label index; each row is one resample.
yes_mean_scores = get_bootstrap_samples(challenger_yes.values, 1000).mean(axis=1)
no_mean_scores = get_bootstrap_samples(challenger_no.values, 1000).mean(axis=1)

# The statistic is the difference of bootstrap means (incident minus no incident).
print("95% confidence interval for the difference in mean temperature (incident - no incident):",  stat_intervals(yes_mean_scores - no_mean_scores, 0.05)) #6

95% confidence interval for the ILEC median repair time: [-8.06457589 -1.45040179]


In [23]:
def permutation_t_stat_ind(sample1, sample2):
    """Return the mean of ``sample1`` minus the mean of ``sample2``."""
    first_mean, second_mean = np.mean(sample1), np.mean(sample2)
    return first_mean - second_mean

In [30]:
def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of the positions ``0 .. n1 + n2 - 1`` into two groups.

    The identity ordering is always included; up to ``max_combinations - 1``
    shuffled orderings are added, and duplicate orderings are kept only once.
    Each returned item is a pair ``(first n1 positions, remaining n2 positions)``
    of tuples.
    """
    order = list(range(n1 + n2))
    seen = {tuple(order)}
    for _ in range(max_combinations - 1):
        # The working list is shuffled in place; a tuple snapshot is stored.
        np.random.shuffle(order)
        seen.add(tuple(order))
    return [(perm[:n1], perm[n1:]) for perm in seen]

In [31]:
def permutation_zero_dist_ind(sample1, sample2, max_combinations = None):
    """Build the permutation null distribution of the difference in means.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples; they are pooled and re-split.
    max_combinations : int, optional
        When truthy, the splits come from ``get_random_combinations`` with this
        limit; otherwise every way of choosing ``len(sample1)`` positions out
        of the pooled sample is enumerated.

    Returns
    -------
    list
        One value per split: mean of the first group minus mean of the second.
    """
    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)

    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        indices = []
        for chosen in itertools.combinations(range(n), n1):
            # Build the complement eagerly for this specific combination.
            # A lazy filter() whose lambda closes over the loop variable is
            # only evaluated after the loop ends, so every split would get
            # the complement of the last combination.
            chosen_set = set(chosen)
            rest = [pos for pos in range(n) if pos not in chosen_set]
            indices.append((list(chosen), rest))

    distr = [joined_sample[list(first)].mean() - joined_sample[list(second)].mean()
             for first, second in indices]
    return distr

In [32]:
def permutation_test(sample, mean, max_permutations = None, alternative = 'two-sided'):
    """Permutation test p-value for the difference in means of two samples.

    ``sample`` and ``mean`` are both passed as samples to
    ``permutation_t_stat_ind`` and ``permutation_zero_dist_ind``;
    ``max_permutations`` is forwarded to the latter. The result is the share
    of null-distribution values at least as extreme as the observed statistic
    in the direction given by ``alternative``.

    Raises ValueError when ``alternative`` is not 'two-sided', 'less' or
    'greater'.
    """
    allowed = ('two-sided', 'less', 'greater')
    if alternative not in allowed:
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutations)

    # Choose the "at least as extreme" rule for the requested alternative.
    if alternative == 'two-sided':
        def is_extreme(value):
            return abs(value) >= abs(observed)
    elif alternative == 'less':
        def is_extreme(value):
            return value <= observed
    else:
        def is_extreme(value):
            return value >= observed

    hits = sum(1 for value in null_values if is_extreme(value))
    return float(hits) / len(null_values)

In [35]:
# Fix the seed so the randomly drawn permutations are reproducible, then run
# the permutation test (default alternative: 'two-sided') on the two
# temperature samples with a limit of 10000 random permutations.
np.random.seed(0)
permutation_test(challenger_yes, challenger_no, max_permutations = 10000) #7

0.0057