In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Dolphin Therapy

### Construct table of the data

In [None]:
# Group label for every subject: 13 'Dolphin' entries followed by
# 12 'No Dolphin' entries.
Therapy = np.repeat(['Dolphin', 'No Dolphin'], [13, 12])

Therapy

In [None]:
# Outcome indicator aligned with the Therapy labels:
#   first 13 entries -> ten 1s then three 0s
#   last 12 entries  -> three 1s then nine 0s
Improvement = np.repeat([1, 0, 1, 0], [10, 3, 3, 9])

Improvement

In [None]:
# Pair each therapy label with its improvement indicator, one row per entry.
dolphin = (
    Table()
    .with_column('Therapy', Therapy)
    .with_column('Improvement', Improvement)
)

# show() with no argument displays every row instead of a truncated preview.
dolphin.show()

### Determine observed difference

In [None]:
# Averaging the 0/1 Improvement column within each Therapy group gives the
# proportion of 1s in that group.
dolphin_summary = dolphin.group('Therapy', collect=np.mean)
dolphin_summary

In [None]:
# Column 1 of the summary holds the per-group mean of Improvement;
# the statistic is (first row) - (second row) of that column.
therapy_props = dolphin_summary.column(1)
obs_diff_prop = therapy_props.item(0) - therapy_props.item(1)
obs_diff_prop

In [None]:
def difference_of_props(table, group_label, outcome):
    """Takes: name of table, column label of binary categorical variable,
    column label of group-label variable
    Returns: Difference of proportions of the two groups"""
    
    #table with the two relevant columns
    reduced = table.select(outcome, group_label)  
    
    # table containing counts for each group
    summary_table = reduced.group(group_label, np.mean)
    
    return summary_table.column(1).item(0) - summary_table.column(1).item(1)

In [None]:
# Sanity check: the helper should reproduce obs_diff_prop on the original data.
difference_of_props(table=dolphin, group_label='Therapy', outcome='Improvement')

### Simulate one outcome

In [None]:
# Sampling every row without replacement yields a random permutation of the
# rows; taking only the 'Therapy' column gives the labels in shuffled order.
shuffled_rows = dolphin.sample(with_replacement=False)
shuffled_labels = shuffled_rows.column('Therapy')
shuffled_labels

In [None]:
# Original table plus the permuted labels as an extra column, so the real and
# shuffled assignments can be compared row by row.
original_and_shuffled = dolphin.with_columns('Shuffled Label', shuffled_labels)

original_and_shuffled

In [None]:
# Same statistic as before, but grouping by the shuffled labels.
difference_of_props(
    table=original_and_shuffled,
    group_label='Shuffled Label',
    outcome='Improvement',
)

In [None]:
def one_simulated_difference(table, group_label, outcome):
    """Simulate one difference of proportions with shuffled group labels.

    Takes: a table, the column label of the group-label variable,
    the column label of the binary categorical (outcome) variable
    Returns: Difference of proportions of the two groups after shuffling labels"""

    # Drawing all rows without replacement permutes them; only the permuted
    # group labels are used from this sample.
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # Outcomes stay in their original order and are paired with the
    # permuted labels.
    relabeled = table.select(outcome).with_column('Shuffled Label', permuted_labels)

    return difference_of_props(relabeled, 'Shuffled Label', outcome)

In [None]:
# One random draw; the value changes each time this cell is run.
one_simulated_difference(table=dolphin, group_label='Therapy', outcome='Improvement')

### Simulate many outcomes

In [None]:
# Number of label shuffles used to build the simulated distribution.
num_simulations = 10000

# Collect every simulated statistic and build the array once. Calling
# np.append inside the loop copies the whole array on each iteration, which
# makes the total work grow quadratically with the number of simulations.
differences = np.array([
    one_simulated_difference(dolphin, 'Therapy', 'Improvement')
    for _ in range(num_simulations)
])

### Use simulation to make a decision

In [None]:
# Histogram of the simulated differences.
simulated_table = Table().with_column('Difference Between Group Proportions', differences)
simulated_table.hist()

print('Observed Difference:', obs_diff_prop)
plots.title('Prediction Under the Null Hypothesis');

# Red vertical segment at the observed statistic, drawn from y=0 to y=2
# in the histogram's y-axis units.
marker_x = [obs_diff_prop, obs_diff_prop]
marker_y = [0, 2]
plots.plot(marker_x, marker_y, color='red', lw=2);

In [None]:
# One-sided empirical p-value: the fraction of simulated differences that are
# at least as large as the observed difference.
# The denominator is the actual number of simulated values rather than a
# hard-coded 10000, so it stays correct if the number of simulations changes.
# np.count_nonzero counts the True entries without a Python-level loop.
p_value = np.count_nonzero(differences >= obs_diff_prop) / len(differences)
p_value