# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

In [None]:
import numpy as np
from datascience import *

# These lines set up graphing capabilities.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

## Hypothesis Test Simulation: Extraversion and wanting more siblings?
#### (Who is Older? a.k.a. A/B Testing)
In our [Class Survey Analysis Assignment](https://smccd.instructure.com/courses/59594/assignments/1644639), some of us posed hypotheses that were prime for analysis with A/B Testing. For this example, we will test whether extroverted individuals are **more likely** to wish they had more siblings.

In [None]:
# Read the cleaned class survey and keep only three columns, chosen by position
# (2, 12, 13).
# NOTE(review): presumably these are the Introversion/Extraversion score and the
# two sibling-count columns used below -- confirm against the CSV header.
survey_csv_path = 'MATH_211_Survey_Cleaned_GPA.csv'
math211_survey = Table.read_table(survey_csv_path).select(2, 12, 13)
math211_survey

### "Define my hypothesis"

In [None]:
def more_or_less_siblings(num_siblings_have, num_siblings_wish):
    """Label whether someone wishes for fewer, more, or the same number of siblings.

    Parameters:
        num_siblings_have: the number of siblings the person has.
        num_siblings_wish: the number of siblings the person wishes they had.

    Returns:
        'Less' when the wished-for count is below the actual count,
        'More' when it is above, and 'same' otherwise (including when the
        two values cannot be ordered, e.g. a NaN).
    """
    if num_siblings_wish < num_siblings_have:
        return 'Less'
    if num_siblings_wish > num_siblings_have:
        return 'More'
    # Neither comparison held, so the counts are treated as equal.
    return 'same'

# Classify every respondent by passing their 'Num_Siblings' and 'Wish_Siblings'
# values to more_or_less_siblings; the result is one label per row.
sibling_classification_array = math211_survey.apply(more_or_less_siblings,'Num_Siblings','Wish_Siblings')
# Store the labels in a 'more/less' column so later cells can filter and group on it.
math211_survey = math211_survey.with_column('more/less',sibling_classification_array)
math211_survey

### "Define my test statistic" based upon my Hypothesis

In [None]:
# Keep only respondents labeled 'More' or 'Less'; rows labeled 'same' are
# dropped so the comparison is between exactly two groups.
math211_survey_to_study = math211_survey.where('more/less',are.not_equal_to('same'))
math211_survey_to_study

In [None]:
# Group rows by their 'more/less' label and aggregate the other columns with
# np.average. The aggregated columns are read back below with an ' average'
# suffix (e.g. 'Introversion/Extraversion average').
grouped_by_more_less = math211_survey_to_study.group('more/less',np.average)
grouped_by_more_less

In [None]:
# Pull out the per-group average Introversion/Extraversion score as an array,
# one entry per row of grouped_by_more_less.
intro_extra_averages = grouped_by_more_less.column('Introversion/Extraversion average')
intro_extra_averages

In [None]:
# Display the array of group averages again (no new computation happens here).
intro_extra_averages

In [None]:
# Observed test statistic: the first group's average score minus the second
# group's average score, in the row order of grouped_by_more_less.
# NOTE(review): presumably the rows are ordered 'Less' then 'More', making this
# (Less average - More average) -- confirm by inspecting grouped_by_more_less.
first_group_average = intro_extra_averages.item(0)
second_group_average = intro_extra_averages.item(1)
test_statistic = first_group_average - second_group_average
test_statistic

### Shuffle the labels of the original sample

In [None]:
# Run this cell a few times to observe that the order of the rows keeps changing each time we run it.
# Sampling as many rows as the table has, without replacement, returns every
# row exactly once in a random order (i.e. a shuffle).
math211_survey_to_study.sample( math211_survey_to_study.num_rows, with_replacement = False)

In [None]:
# Shuffle the rows (sample every row once, without replacement) and keep only
# the 'more/less' column: the same labels as before, in a random order.
row_count = math211_survey_to_study.num_rows
shuffled_rows = math211_survey_to_study.sample(row_count, with_replacement=False)
shuffled_labels = shuffled_rows.column('more/less')
shuffled_labels

In [None]:
# Attach the shuffled labels to the unshuffled table as 'shuffled_label', so
# each respondent's score is now paired with a randomly reassigned label.
simulated_shuffle = math211_survey_to_study.with_column('shuffled_label',shuffled_labels)
simulated_shuffle

In [None]:
# Show each score next to its original label and its shuffled label for comparison.
simulated_shuffle.select('Introversion/Extraversion','more/less','shuffled_label')

### Find your simulated test statistic

In [None]:
# Recompute the statistic exactly as for the observed data, but grouping on
# the shuffled labels instead of the real ones.
# NOTE: this rebinds grouped_by_more_less and intro_extra_averages, so after
# this cell runs they describe the shuffled data, not the observed data.
grouped_by_more_less = simulated_shuffle.group('shuffled_label',np.average)
intro_extra_averages = grouped_by_more_less.column('Introversion/Extraversion average')
# Same "first group minus second group" difference as the observed test statistic.
test_statistic_shuffle = intro_extra_averages.item(0) - intro_extra_averages.item(1)
test_statistic_shuffle

### Repeat many times

In [None]:
# Number of label shuffles to simulate.
num_simulations = 1000
# Collects one simulated test statistic per shuffle; starts out empty.
simulated_statistics_ab = make_array()

# Each pass shuffles the 'more/less' labels, regroups on the shuffled labels,
# and records the resulting difference in group averages.
# NOTE: the loop rebinds grouped_by_more_less and intro_extra_averages, so after
# it finishes they hold values from the last shuffle, not the observed data.
for i in np.arange(num_simulations):
    # Sampling every row once without replacement yields the labels in random order.
    shuffled_labels = math211_survey_to_study.sample(math211_survey_to_study.num_rows, with_replacement=False).column('more/less')
    simulated_shuffle = math211_survey_to_study.with_column('shuffled_label', shuffled_labels)

    grouped_by_more_less = simulated_shuffle.group('shuffled_label', np.average)
    intro_extra_averages = grouped_by_more_less.column('Introversion/Extraversion average')
    test_statistic_shuffle = intro_extra_averages.item(0) - intro_extra_averages.item(1)
    # Append to the accumulator itself. The original appended to the undefined
    # name `test_statistics_ab`, which raised NameError on the first iteration.
    simulated_statistics_ab = np.append(simulated_statistics_ab, test_statistic_shuffle)

simulated_statistics_ab

### Calculate p-value

In [None]:
# Keep the observed test statistic under the name used by the plotting and
# p-value cells below.
observed_diff_ab = test_statistic

In [None]:
# Histogram of the simulated statistics.
Table().with_columns('Simulated Statistic', simulated_statistics_ab).hist()
# Mark the observed statistic with a red dot; y = -0.002 places it slightly
# below zero on the vertical axis. The trailing semicolon suppresses the
# notebook's text echo of the scatter call's return value.
plt.scatter(observed_diff_ab, -0.002, color='red', s=70);

In [None]:
# Empirical p-value: the fraction of simulated statistics that are at least as
# extreme as the observed one. Ties with the observed value count as "as
# extreme", hence <= rather than a strict <, which would leave them out and
# understate the p-value.
# NOTE(review): "more extreme" is taken to mean smaller (more negative) values,
# which assumes the statistic is (Less average - More average) and that higher
# scores mean more extraverted -- confirm against the survey's scale.
p_value = np.count_nonzero(simulated_statistics_ab <= observed_diff_ab) / num_simulations
p_value