In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 20

## Deflategate ##

In [None]:
# Load the half-time ball-pressure measurements.
# Pressure is measured in "pounds per square inch" (psi).
# Two officials (Blakeman and Prioleau) measured the pressure
# of the balls at half-time.
# Most of the Colts' balls were not measured.
football = Table.read_table('deflategate.csv')
football.show()

In [None]:
# Average the two officials' readings to get a single pressure value per row.
blakeman_psi = football.column('Blakeman')
prioleau_psi = football.column('Prioleau')
combined = (blakeman_psi + prioleau_psi) / 2

# Swap the two per-official columns for the single averaged column.
football = football.drop('Blakeman', 'Prioleau').with_column('Combined', combined)
football.show()

In [None]:
# Quick look at np.ones: an array containing five 1.0 values
np.ones(5)

In [None]:
# Before the start of the AFC game, the ball pressures were measured.
# NFL rule: ball pressure must be between 12.5 and 13.5 psi.
# The Patriots' balls were all about 12.5 psi;
# the Colts' balls were about 13.0 psi.

# One starting pressure per ball: eleven at 12.5 psi followed by four at 13 psi.
patriots_initial = 12.5 * np.ones(11)
colts_initial = 13 * np.ones(4)
initial_pressure = np.append(patriots_initial, colts_initial)
initial_pressure

In [None]:
# Some deflation is normal during a game; investigate the pressure drop.
# Drop = pre-game pressure minus the averaged half-time reading.
# The half-time column is selected by its label ('Combined') rather than by
# position, so the subtraction does not depend on the table's column order.
drop_values = initial_pressure - football.column('Combined')

In [None]:
# Build a table of how many psi each team's footballs dropped:
# remove the averaged readings, then attach the computed drops.
without_combined = football.drop('Combined')
football = without_combined.with_column('Drop', drop_values)

In [None]:
# Display every row of the updated table
football.show()

In [None]:
# Mean psi drop for each team's footballs:
# group the rows by 'Team' and apply np.average within each group
means = football.group('Team', np.average)
means

In [None]:
# Difference in mean psi drop between the two teams:
# the first row of `means` minus the second row
mean_drops = means.column(1)
observed_difference = mean_drops.item(0) - mean_drops.item(1)
observed_difference

In [None]:
# Test statistic: the difference between the two group averages
def diff_between_means(tbl):
    """Return the first group's average minus the second group's average.

    The rows of `tbl` are grouped by the 'Team' column with np.average as the
    aggregate; the statistic is read from column index 1 of the grouped
    table (row 0 minus row 1).
    """
    group_averages = tbl.group('Team', np.average).column(1)
    first, second = group_averages.item(0), group_averages.item(1)
    return first - second

In [None]:
# Get a one-column table holding just the 'Drop' values
# (kept as a table so that its rows can be sampled)
drops = football.select('Drop')

In [None]:
# Shuffle the values in the drop column by sampling the rows
# without replacement, then pull the values out as an array
shuffled_table = drops.sample(with_replacement=False)
shuffled_drops = shuffled_table.column(0)
shuffled_drops

In [None]:
# Create a simulated table under the null hypothesis:
# the 'Drop' column is set to the shuffled values while the
# other columns of `football` are left as they are
simulated_football = football.with_column('Drop', shuffled_drops)
simulated_football.show(3)

In [None]:
# Get one statistic from the null distribution:
# the difference between group means for the shuffled table
diff_between_means(simulated_football)

In [None]:
# Create the full null distribution: repeatedly shuffle the drops and
# record the difference between the group means for each shuffle.
num_simulations = 5000

# Collect the statistics in a Python list and convert to an array once at the
# end; np.append copies the whole array on every call, which makes a loop of
# appends quadratic in the number of simulations.
simulated_differences = []

for i in np.arange(num_simulations):
    shuffled_drops = drops.sample(with_replacement = False).column(0)
    simulated_football = football.with_column('Drop', shuffled_drops)
    new_diff = diff_between_means(simulated_football)
    simulated_differences.append(new_diff)

differences = np.array(simulated_differences)

In [None]:
# Visualize the null distribution; the red dot on the horizontal axis
# marks the observed difference
null_distribution = Table().with_column('Difference Between Means', differences)
null_distribution.hist()
plots.scatter(observed_difference, 0, color='red', s=40);

In [None]:
# Get the p-value: the proportion of simulated differences that are
# less than or equal to the observed difference
np.average(differences <= observed_difference)

## Analyzing RCTs ##

#Botulinum Toxin A (bta) as a treatment for chronic back pain
- 15 in the treatment group
- 16 in the control group (normal saline)

Trials were run double-blind (neither the doctors nor the patients knew which group each patient was in).

Result:
  - 1 indicates pain relief
  - 0 indicates lack of pain relief


In [None]:
# Load the trial data and display every row
bta = Table.read_table('bta.csv')
bta.show()

In [None]:
# Sum the values within the treatment and control groups; with results coded
# as 1 (pain relief) / 0 (no relief), each sum is the number of patients in
# that group who reported pain relief
bta.group('Group', sum)

In [None]:
# Get the proportions in the treatment and control groups:
# the average of 0/1 results is the proportion of 1s in each group
bta.group('Group', np.average)

In [None]:
# looking at observed/potential outcomes table
# observed_outcomes = Table.read_table('observed_outcomes.csv')
# observed_outcomes.show()

In [None]:
# Calculate the difference in proportions, treatment minus control:
# second row of the grouped table minus the first row
proportions_by_group = bta.group('Group', np.average)
group_proportions = proportions_by_group.column(1)
group_proportions.item(1) - group_proportions.item(0)

In [None]:
# Test statistic: difference of proportions, treatment minus control
def distance_between_group_proportions(tbl):
    """Return the second group's average minus the first group's average.

    The rows of `tbl` are grouped by the 'Group' column with np.average as the
    aggregate, and the statistic is read from column index 1 of the grouped
    table (row 1 minus row 0). The result is a signed difference, so it can
    be negative.

    NOTE(review): this is "treatment minus control" only if the control group
    is the first row of the grouped table -- confirm against the data.
    """
    by_group = tbl.group('Group', np.average).column(1)
    second, first = by_group.item(1), by_group.item(0)
    return second - first

In [None]:
# Store the real (observed) difference in a name so it can be
# compared against the simulated values
observed_distance = distance_between_group_proportions(bta)
observed_distance

In [None]:
# Get two one-column tables: one with just the group labels and one with
# just the result values, so the results can be shuffled independently
labels = bta.select('Group')
results = bta.select('Result')

In [None]:
# Create a null distribution: repeatedly shuffle the results, pair them with
# the original group labels, and record the difference in group proportions.
num_shuffles = 2000

# Collect the statistics in a Python list and convert to an array once at the
# end; np.append copies the whole array on every call, which makes a loop of
# appends quadratic in the number of shuffles.
simulated_distances = []
for i in np.arange(num_shuffles):
    shuffled_results = results.sample(with_replacement=False).column(0)
    simulated = labels.with_column('Shuffled results', shuffled_results)
    distance = distance_between_group_proportions(simulated)
    simulated_distances.append(distance)

distances = np.array(simulated_distances)
distances

In [None]:
# Visualize the null distribution; the red dot marks the observed statistic.
# The statistic is a signed difference of proportions, so simulated values can
# be negative; the bins span -1 to 1 so that no simulated value is left out
# of the histogram.
Table().with_column('Distance', distances).hist(bins = np.arange(-1, 1.15, 0.15))
plots.scatter(observed_distance, 0, color='red', s=40);

In [None]:
# Get the p-value: the proportion of simulated distances that are
# greater than or equal to the observed distance
np.average(distances >= observed_distance)