# Importing our modules

In [1]:
import numpy as np
from datascience import *

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Constructing Confidence Intervals

In [2]:
# Read the COVID patient records from a CSV file (path is relative to the
# notebook's working directory) into a table.
covid_data = Table.read_table("Covid Data_subset.csv")
# Bare last expression so the notebook renders a preview of the table.
covid_data

USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
2,1,1,1,3/5/2020,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
2,1,1,1,12/6/2020,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
2,1,1,1,9999-99-99,97,2,64,2,2,2,2,2,2,2,2,2,2,2,3,97
2,1,1,2,9999-99-99,2,2,25,2,2,2,2,2,2,2,2,2,2,2,3,2
2,1,2,2,9999-99-99,2,2,30,97,2,2,2,2,2,2,2,2,2,2,3,2
2,1,1,1,9999-99-99,97,2,23,2,2,2,2,2,2,2,2,2,2,2,3,97
2,1,2,1,9999-99-99,97,2,54,97,2,2,2,2,2,2,2,2,2,2,3,97
2,1,2,1,9999-99-99,97,2,30,97,2,2,2,2,2,2,2,2,2,2,3,97
2,1,1,1,9999-99-99,97,2,38,2,2,2,2,2,2,2,2,2,1,2,3,97
2,1,2,1,9999-99-99,97,2,49,97,2,2,2,2,2,2,2,2,2,2,3,97


In [3]:
# Treat this data set as our entire population and compute the death rate
# (as a percentage) of its COVID patients.
# A DATE_DIED of '9999-99-99' is an impossible date, so rows carrying it are
# assumed to be patients who survived.
# death rate = 100 * (1 - survivors / total patients)
100*(1 - (covid_data.where('DATE_DIED','9999-99-99').num_rows / covid_data.num_rows))

7.337672555611185

### Let's play out a scenario

Suppose that we are still in the early stages of the COVID-19 pandemic. You work for a hospital that has seen 100 patients so far, and you want to estimate how deadly this illness is in the wider population.

In [4]:
# Draw 100 rows from the population table without replacement; these rows play
# the role of the 100 patients our hospital has seen.
# NOTE(review): no random seed is set in the visible cells, so this sample (and
# every result derived from it) will differ from run to run -- confirm that is intended.
covid_sample = covid_data.sample(100,with_replacement = False)
# Bare last expression so the notebook renders a preview of the sample.
covid_sample

USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
1,12,1,1,9999-99-99,97,2,44,2,2,2,2,2,2,2,2,1,2,2,3,97
2,12,1,1,9999-99-99,97,2,33,2,2,2,2,2,2,2,2,2,2,1,7,97
1,12,2,1,9999-99-99,97,1,51,97,2,2,2,2,2,2,2,2,2,2,3,97
1,4,1,2,14/06/2020,2,1,67,2,1,2,2,2,1,2,2,2,2,2,3,2
1,12,1,2,11/7/2020,2,1,68,2,1,1,2,2,2,2,2,2,2,2,7,2
1,12,2,1,9999-99-99,97,2,48,97,2,2,2,2,2,2,2,2,2,2,6,97
1,12,1,2,9999-99-99,2,2,33,2,2,2,2,2,2,2,2,2,2,2,7,2
1,12,2,1,9999-99-99,97,2,24,97,2,2,2,2,2,2,2,2,2,2,7,97
2,6,2,1,9999-99-99,97,1,51,97,2,2,2,2,2,2,2,2,2,2,7,97
2,12,2,1,9999-99-99,97,2,60,97,2,2,2,2,2,2,2,2,2,2,3,97


Next, we define functions that use the bootstrap to construct the lower and upper bounds of a confidence interval.

In [28]:
def confidence_interval_for_death_rate(level, repetitions=10000):
    """Bootstrap a confidence interval for the death rate, in percent.

    Repeatedly re-samples the notebook-level ``covid_sample`` table (each
    re-sample has the same number of rows as ``covid_sample``), computes the
    percentage of patients who died in every re-sample, and reports the middle
    ``level`` percent of those bootstrapped percentages.

    A row whose ``DATE_DIED`` is ``'9999-99-99'`` is counted as a survivor.

    Parameters
    ----------
    level : int or float
        Confidence level as a percentage, e.g. ``95``.
    repetitions : int, optional
        Number of bootstrap re-samples to draw. Defaults to 10000, the value
        that used to be hard-coded.

    Returns
    -------
    str
        A sentence stating the lower and upper bounds of the interval.

    Raises
    ------
    ValueError
        If ``repetitions`` is smaller than 1 (there would be no bootstrapped
        statistics to take percentiles of).
    """
    if repetitions < 1:
        raise ValueError('repetitions must be at least 1')

    # The sample size never changes inside the loop, so read it once.
    sample_size = covid_sample.num_rows

    # Preallocate the result array: calling np.append inside the loop copies
    # the whole array on every iteration, which is quadratic in `repetitions`.
    stats = np.empty(repetitions)

    for i in range(repetitions):
        # covid_sample.sample() is the bootstrap re-sample of the data we have.
        resample = covid_sample.sample()
        survivors = resample.where('DATE_DIED','9999-99-99').num_rows
        # Percentage of patients in this re-sample who died.
        stats[i] = 100*(1 - (survivors / sample_size))

    # Percentage of the bootstrap distribution left out in each tail.
    tail = (100 - level)/2
    lower_bound = str(percentile(tail, stats))
    upper_bound = str(percentile(100 - tail, stats))

    return 'We are '+ str(level)+'% confident that the true death rate is between '+lower_bound+' and '+upper_bound

In [29]:
# Build a 95% bootstrap confidence interval for the death rate from covid_sample.
confidence_interval_for_death_rate(95)

'We are 95% confident that the true death rate is between 7.0 and 21.0'

In [13]:
def _bootstrap_means(tbl, column_label, repetitions=10000):
    """Return an array of bootstrapped means of ``tbl``'s ``column_label`` column.

    Each entry is the mean of the column in one re-sample drawn with
    ``tbl.sample()``.

    Raises
    ------
    ValueError
        If ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        raise ValueError('repetitions must be at least 1')

    # Preallocate instead of growing with np.append, which copies the whole
    # array on every iteration (quadratic in `repetitions`).
    means = np.empty(repetitions)
    for i in range(repetitions):
        means[i] = np.mean(tbl.sample().column(column_label))
    return means


def confidence_interval_bounds(tbl, level, column_label, repetitions=10000):
    """Return ``(lower, upper)`` bounds of a bootstrap confidence interval for a column mean.

    Both bounds are taken from the same set of bootstrapped means, unlike
    calling ``confidence_interval_lower_b`` and ``confidence_interval_upper_b``
    separately, which re-samples independently for each bound.

    Parameters
    ----------
    tbl : table
        Table to re-sample; must provide ``sample()`` and ``column(label)``.
    level : int or float
        Confidence level as a percentage, e.g. ``95``.
    column_label : str
        Label of the numeric column whose mean is being estimated.
    repetitions : int, optional
        Number of bootstrap re-samples (default 10000).
    """
    means = _bootstrap_means(tbl, column_label, repetitions)
    # Percentage of the bootstrap distribution left out in each tail.
    tail = (100 - level)/2
    return percentile(tail, means), percentile(100 - tail, means)


def confidence_interval_lower_b(tbl,level,column_label,repetitions=10000):
    """Return the lower bound of a bootstrap confidence interval for a column mean.

    Parameters
    ----------
    tbl : table
        Table to re-sample; must provide ``sample()`` and ``column(label)``.
    level : int or float
        Confidence level as a percentage, e.g. ``95``.
    column_label : str
        Label of the numeric column whose mean is being estimated.
    repetitions : int, optional
        Number of bootstrap re-samples (default 10000, the value that used to
        be hard-coded).

    Returns
    -------
    The ``(100 - level)/2``-th percentile of the bootstrapped means.

    Notes
    -----
    Every call runs its own bootstrap, so this bound and the one returned by
    ``confidence_interval_upper_b`` come from different re-samples.
    """
    stats = _bootstrap_means(tbl, column_label, repetitions)
    return percentile( (100 - level)/2 , stats)


def confidence_interval_upper_b(tbl,level,column_label,repetitions=10000):
    """Return the upper bound of a bootstrap confidence interval for a column mean.

    Parameters
    ----------
    tbl : table
        Table to re-sample; must provide ``sample()`` and ``column(label)``.
    level : int or float
        Confidence level as a percentage, e.g. ``95``.
    column_label : str
        Label of the numeric column whose mean is being estimated.
    repetitions : int, optional
        Number of bootstrap re-samples (default 10000, the value that used to
        be hard-coded).

    Returns
    -------
    The ``100 - (100 - level)/2``-th percentile of the bootstrapped means.

    Notes
    -----
    Every call runs its own bootstrap, so this bound and the one returned by
    ``confidence_interval_lower_b`` come from different re-samples.
    """
    stats = _bootstrap_means(tbl, column_label, repetitions)
    return percentile( 100 - ((100 - level)/2) , stats)

In [15]:
# Choose the column to summarize and the confidence level (as a percentage).
column_to_look_at = 'AGE'
level = 95
print('The confidence interval for',column_to_look_at) 
# NOTE: each helper call below runs its own bootstrap, so the lower and upper
# bounds are computed from two independent sets of re-samples.
print('is from',confidence_interval_lower_b(covid_sample,level,column_to_look_at))
print('up to',confidence_interval_upper_b(covid_sample,level,column_to_look_at))

The confidence interval for AGE
is from 37.58
up to 44.35


In [14]:
# Mean AGE over the full covid_data table (treated as the population in this
# notebook), for comparison with the interval estimated from the sample.
np.mean(covid_data.column('AGE'))

41.809072312424007