# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up*

In [1]:
import numpy as np
from datascience import *

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Exploring the percentile function

In [2]:
# np.arange excludes its stop value, so stop at 100 + 1 to include 100.
an_array_1_to_100 = np.arange(1, 100 + 1)
an_array_1_to_100

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [3]:
# 25th percentile of the integers 1..100.
percentile(25, an_array_1_to_100)

25

In [4]:
# The integers 1 through 36 (np.arange excludes its stop value, 37).
an_array_1_to_36 = np.arange(1, 37)

In [6]:
# 50th percentile of an even-length array: the result is an element of the
# array (18), not the interpolated midpoint 18.5.
percentile(50, an_array_1_to_36)

18

# Constructing Confidence Intervals

In [7]:
# Load the COVID patient records; the file is read relative to the notebook's working directory.
covid_csv_name = "Covid Data_subset.csv"
covid_data = Table.read_table(covid_csv_name)
covid_data

USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
2,1,1,1,3/5/2020,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
2,1,1,1,12/6/2020,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
2,1,1,1,9999-99-99,97,2,64,2,2,2,2,2,2,2,2,2,2,2,3,97
2,1,1,2,9999-99-99,2,2,25,2,2,2,2,2,2,2,2,2,2,2,3,2
2,1,2,2,9999-99-99,2,2,30,97,2,2,2,2,2,2,2,2,2,2,3,2
2,1,1,1,9999-99-99,97,2,23,2,2,2,2,2,2,2,2,2,2,2,3,97
2,1,2,1,9999-99-99,97,2,54,97,2,2,2,2,2,2,2,2,2,2,3,97
2,1,2,1,9999-99-99,97,2,30,97,2,2,2,2,2,2,2,2,2,2,3,97
2,1,1,1,9999-99-99,97,2,38,2,2,2,2,2,2,2,2,2,1,2,3,97
2,1,2,1,9999-99-99,97,2,49,97,2,2,2,2,2,2,2,2,2,2,3,97


In [8]:
# Treat this data set as our entire population and compute the death rate
# (in percent) of its COVID patients. A DATE_DIED of '9999-99-99' is an
# impossible date, so we assume those patients survived.
num_survivors = covid_data.where('DATE_DIED', '9999-99-99').num_rows
100 * (1 - (num_survivors / covid_data.num_rows))

7.337672555611185

### Let's play out a scenario

Suppose that it is still the early stages of the COVID pandemic. You work for a hospital that has seen 100 patients so far, and you want to know how deadly this illness is to the population.

In [9]:
# Draw 100 distinct rows (no replacement) from the covid table and treat them
# as the 100 patients our hospital has seen.
patients_seen = 100
covid_sample = covid_data.sample(patients_seen, with_replacement=False)
covid_sample

USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
2,12,1,1,9999-99-99,97,2,48,2,2,2,2,2,1,2,2,2,2,2,7,97
2,12,2,1,9999-99-99,97,2,53,97,2,2,2,2,2,2,2,1,2,2,3,97
2,4,2,2,9999-99-99,2,2,57,97,1,2,2,2,1,2,2,1,2,2,7,2
2,4,1,1,9999-99-99,97,2,47,2,2,2,2,2,2,2,2,2,2,2,7,97
2,12,1,1,9999-99-99,97,2,52,2,2,2,2,2,2,2,2,2,2,1,3,97
2,12,2,1,9999-99-99,97,2,50,97,2,2,2,2,2,2,2,2,2,2,7,97
1,12,1,1,9999-99-99,97,2,28,2,2,2,2,2,2,2,2,2,2,1,3,97
1,9,1,1,8/5/2020,97,2,57,2,1,2,2,2,1,2,2,2,2,2,6,97
2,12,1,1,9999-99-99,97,2,44,2,2,2,2,2,2,2,2,2,2,2,7,97
2,12,1,1,9999-99-99,97,2,48,2,2,2,2,2,2,2,2,2,2,2,6,97


Next, we define a function that bootstraps our sample of 100 patients to construct the lower and upper bounds of a confidence interval for the death rate.

In [10]:
def confidence_interval_for_death_rate(level, repetitions=1000):
    """Bootstrap a confidence interval for the death rate (in percent) of `covid_sample`.

    Each repetition re-samples the notebook-level `covid_sample` table (via its
    `sample()` method with default arguments), counts the rows whose DATE_DIED
    is the placeholder '9999-99-99' (treated as survivors), and records the
    resulting death rate. The interval endpoints are the percentiles that cut
    off the middle `level` percent of those bootstrapped rates.

    Parameters
    ----------
    level : number
        Confidence level in percent, e.g. 95.
    repetitions : int, optional
        Number of bootstrap re-samples to draw (default 1000).

    Returns
    -------
    str
        A sentence stating the confidence level and the two interval bounds.

    Side effects
    ------------
    Prints the array of bootstrapped death rates.
    """
    # The denominator does not change between repetitions, so read it once.
    sample_size = covid_sample.num_rows

    # Preallocate instead of np.append-ing in the loop: np.append copies the
    # whole array on every call, which makes the loop quadratic.
    stats = np.empty(repetitions)
    for i in np.arange(repetitions):
        resample = covid_sample.sample()
        survivors = resample.where('DATE_DIED','9999-99-99').num_rows
        # death rate (percent) of this bootstrapped re-sample
        stats[i] = 100*(1 - (survivors / sample_size))

    # Show the bootstrapped death rates before summarising them.
    print(stats)

    # Half of the excluded (100 - level) percent lies in each tail.
    tail = (100 - level)/2
    lower_bound = str(percentile(tail, stats))
    upper_bound = str(percentile(100 - tail, stats))
    return 'We are '+ str(level)+'% confident that the true death rate is between '+lower_bound+' and '+upper_bound

In [11]:
# 95% bootstrap confidence interval for the death rate.
confidence_interval_for_death_rate(level=95)

[  9.   8.   6.   5.  10.  14.   6.  14.   6.   7.   8.   8.   3.   5.   9.
  11.   8.   8.   7.   5.   8.  10.   6.   6.   8.   6.   8.   8.   5.   7.
   9.   4.   3.  10.   6.   7.   5.   7.  12.   7.   7.  11.   8.   8.  10.
   9.  10.   9.   7.   7.   9.  10.   9.   7.   9.   6.  10.   5.   8.   8.
   7.   9.  12.   8.   6.  15.  11.   9.   9.   8.   8.   7.   3.  10.   6.
  10.   4.   8.   9.   6.  11.  10.  11.  14.   8.   6.   3.   9.  10.   8.
   3.   7.   8.   4.   6.   8.   8.   9.   3.  12.   7.   5.   6.   6.   7.
   6.  12.   9.   4.   9.  10.   9.   6.  11.   6.   7.   8.   7.   4.  10.
   9.   5.   7.  12.  12.  15.   5.   7.   9.  10.  14.   9.   7.   4.   9.
   6.  10.   7.  10.   9.   6.   9.  12.  11.   9.   5.   8.   7.   9.   6.
   5.   5.  12.   8.  11.  10.   8.   9.   9.  10.  10.  10.   7.  11.   8.
   8.   6.  10.  12.   8.   6.  10.   9.  10.   7.   5.   5.  11.   2.   7.
   9.   6.  10.   6.  12.  10.   9.   8.  10.   5.  11.   7.   4.  12.  15.
   7.  12.  

'We are 95% confident that the true death rate is between 3.0 and 14.0'

In [12]:
def _bootstrap_means(tbl, column_label, repetitions):
    """Return an array holding the mean of `column_label` for `repetitions` re-samples of `tbl`."""
    # Preallocate instead of np.append-ing in the loop: np.append copies the
    # whole array on every call, which makes the loop quadratic.
    means = np.empty(repetitions)
    for i in np.arange(repetitions):
        means[i] = np.mean(tbl.sample().column(column_label))
    return means


def confidence_interval_lower_b(tbl, level, column_label, repetitions=10000):
    """Lower bound of a bootstrap confidence interval for a column's mean.

    Parameters
    ----------
    tbl : table
        Object whose `sample()` result exposes `column(column_label)`
        (a `datascience` Table in this notebook).
    level : number
        Confidence level in percent, e.g. 95.
    column_label : str
        Label of the column whose mean is bootstrapped.
    repetitions : int, optional
        Number of bootstrap re-samples to draw (default 10000).

    Returns
    -------
    The (100 - level)/2 percentile of the bootstrapped means.
    """
    stats = _bootstrap_means(tbl, column_label, repetitions)
    return percentile( (100 - level)/2 , stats)


def confidence_interval_upper_b(tbl, level, column_label, repetitions=10000):
    """Upper bound of a bootstrap confidence interval for a column's mean.

    Takes the same parameters as `confidence_interval_lower_b`. Note that it
    draws its own, independent set of bootstrap re-samples.

    Returns
    -------
    The 100 - (100 - level)/2 percentile of the bootstrapped means.
    """
    stats = _bootstrap_means(tbl, column_label, repetitions)
    return percentile( 100 - ((100 - level)/2) , stats)

In [13]:
# Report a bootstrap confidence interval for the mean of one column of the sample.
column_to_look_at = 'AGE'
level = 95

# Compute the bounds first (lower, then upper), then print the three-line report.
interval_start = confidence_interval_lower_b(covid_sample, level, column_to_look_at)
interval_end = confidence_interval_upper_b(covid_sample, level, column_to_look_at)

print('The confidence interval for', column_to_look_at)
print('is from', interval_start)
print('up to', interval_end)

The confidence interval for AGE
is from 41.22
up to 47.31


In [14]:
# Mean AGE over the full covid_data table, for comparison with the interval
# bootstrapped from the 100-patient sample.
covid_data.column('AGE').mean()

41.809072312424007