# load data

In [100]:
import requests
import pandas as pd
import numpy as np

def extract_data():
    """Download the US daily COVID CSV and save it as ``covid_data.csv``.

    Fetches ``https://api.covidtracking.com/v1/us/daily.csv`` and writes the
    raw response bytes to ``covid_data.csv`` in the current working directory,
    overwriting any existing file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never written to disk as if it were data.
        requests.Timeout: if the server does not respond within the timeout.
    """
    # A timeout keeps the notebook from hanging indefinitely on a dead connection.
    response = requests.get("https://api.covidtracking.com/v1/us/daily.csv", timeout=30)
    response.raise_for_status()

    # The `with` block closes the file on exit; no explicit close() is needed.
    with open('covid_data.csv', 'wb') as f:
        f.write(response.content)


def load_data():
    """Refresh the local CSV via ``extract_data`` and return it as a DataFrame."""
    extract_data()  # side effect: (re)writes covid_data.csv
    covid_frame = pd.read_csv('covid_data.csv')
    return covid_frame

def pre_process(df: pd.DataFrame, n_states: int = 56) -> pd.DataFrame:
    """Filter, trim and rename the raw national daily frame.

    Keeps only the rows whose ``states`` value equals ``n_states``, selects the
    columns of interest in a fixed order, and renames ``positive``/``negative``
    to ``pcr_test_positive``/``pcr_test_negative``.

    Args:
        df: Raw frame as read from the downloaded CSV. Must contain ``states``
            and every column listed in ``keep`` below.
        n_states: Value of the ``states`` column a row must have to be kept.
            Defaults to 56, the value previously hard-coded here.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: if ``states`` or any selected column is missing from ``df``.
    """
    keep = [
        "hash", "date", "positive", "negative", "positiveIncrease", "negativeIncrease",
        "pending", "hospitalizedCurrently", "hospitalizedIncrease", "hospitalizedCumulative",
        "inIcuCurrently", "inIcuCumulative", "onVentilatorCurrently", "onVentilatorCumulative",
        "totalTestResults", "totalTestResultsIncrease",
        "death", "deathIncrease",
    ]

    # Selecting an explicit column list already leaves out the deprecated
    # columns (recovered, lastModified, states, dateChecked, total, posNeg,
    # hospitalized), so no separate drop() is needed.
    selected = df.loc[df["states"] == n_states, keep]

    # rename() without inplace returns a fresh frame, which avoids mutating a
    # filtered slice of the caller's data (SettingWithCopyWarning).
    return selected.rename(columns={
        "negative": "pcr_test_negative",
        "positive": "pcr_test_positive",
    })



# Download the latest CSV, then clean it in a single step.
df = pre_process(load_data())

In [101]:
# Preview the first five rows, columns at positions 1-9 (position 0 is skipped).
df.iloc[0:5, 1:10]

Unnamed: 0,date,pcr_test_positive,pcr_test_negative,positiveIncrease,negativeIncrease,pending,hospitalizedCurrently,hospitalizedIncrease,hospitalizedCumulative
0,20210307,28756489.0,74582825.0,41835,131835,11808.0,40199.0,726,776361.0
1,20210306,28714654.0,74450990.0,60015,143835,11783.0,41401.0,503,775635.0
2,20210305,28654639.0,74307155.0,68787,271917,12213.0,42541.0,2781,775132.0
3,20210304,28585852.0,74035238.0,65487,177957,12405.0,44172.0,1530,772351.0
4,20210303,28520365.0,73857281.0,66836,267001,11778.0,45462.0,2172,770821.0


In [102]:
# Preview the first five rows, columns from position 10 onward.
df.iloc[0:5, 10:]

Unnamed: 0,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,totalTestResults,totalTestResultsIncrease,death,deathIncrease
0,8134.0,45475.0,2802.0,4281.0,363825123,1170059,515151.0,842
1,8409.0,45453.0,2811.0,4280.0,362655064,1430992,514309.0,1680
2,8634.0,45373.0,2889.0,4275.0,361224072,1744417,512629.0,2221
3,8970.0,45293.0,2973.0,4267.0,359479655,1590984,510408.0,1743
4,9359.0,45214.0,3094.0,4260.0,357888671,1406795,508665.0,2449


Notes:

1. hospitalizedCurrently, hospitalizedIncrease and hospitalizedCumulative measure the number of people who were admitted to a hospital because of COVID.

2. inIcuCurrently and inIcuCumulative measure the number of people who were admitted to a hospital (hospitalized) AND went into the ICU. (Careful: this is not the most accurate measure.)

Quote from the data source:
"Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients in the ICU with confirmed or suspected COVID-19 cases."


3. onVentilatorCurrently and onVentilatorCumulative measure the number of people who were admitted to a hospital (hospitalized) AND needed a ventilator.

In [None]:
import requests
import pandas as pd
import numpy as np

from google.cloud import bigquery

# `bigquery` is a module and cannot be called; the client class is `bigquery.Client`.
# Called with no arguments, it takes the project and credentials from the environment.
client = bigquery.Client()

In [104]:
# Reload the cached CSV and run it through the shared cleaning function instead
# of repeating the filter / select / rename logic inline.
df = pre_process(pd.read_csv('covid_data.csv'))

In [122]:

# Parse the integer YYYYMMDD `date` column into datetimes. The dtype guard makes
# this cell safe to re-run: once converted, the column is left untouched.
if not pd.api.types.is_datetime64_any_dtype(df["date"]):
    df["date"] = pd.to_datetime(df["date"].astype(str), format="%Y%m%d")

## Column Descriptions:

Fields

date
    Field type: integer

    Date

    Date on which data was collected by The COVID Tracking Project.
dateChecked
    Field type: string

    Deprecated. This is an old label for lastUpdateEt.
death
    Field type: integer

    Deaths (confirmed and probable)

    Total fatalities with confirmed OR probable COVID-19 case diagnosis (per the expanded CSTE case definition of April 5th, 2020 approved by the CDC). In some states, these individuals must also have COVID-19 listed on the death certificate to count as a COVID-19 death. When states post multiple numbers for fatalities, the metric includes only deaths with COVID-19 listed on the death certificate, unless deaths among cases is a more reliable metric in the state.
    Returns
    null
    if no data is available
deathIncrease
    Field type: integer

    New deaths

    Daily increase in death, calculated from the previous day’s value.
    Returns
    null
    if no data is available
hash
    Field type: string

    A hash for this record
hospitalized
    Field type: integer

    Deprecated. Old label for hospitalizedCumulative.
    Returns
    null
    if no data is available
hospitalizedCumulative
    Field type: integer

    Cumulative hospitalized/Ever hospitalized

    Total number of individuals who have ever been hospitalized with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients hospitalized with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
hospitalizedCurrently
    Field type: integer

    Currently hospitalized/Now hospitalized

    Individuals who are currently hospitalized with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients hospitalized with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
hospitalizedIncrease
    Field type: integer

    New total hospitalizations

    Daily increase in hospitalizedCumulative, calculated from the previous day’s value.
    Returns
    null
    if no data is available
inIcuCumulative
    Field type: integer

    Cumulative in ICU/Ever in ICU

    Total number of individuals who have ever been hospitalized in the Intensive Care Unit with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients in the ICU with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
inIcuCurrently
    Field type: integer

    Currently in ICU/Now in ICU

    Individuals who are currently hospitalized in the Intensive Care Unit with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients in the ICU with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
lastModified
    Field type: string

    Deprecated. Old label for lastUpdateET.
negative
    Field type: integer

    Negative PCR tests (people)

    Total number of unique people with a completed PCR test that returns negative. For states / territories that do not report this number directly, we compute it using one of several methods, depending on which data points the state provides. Due to complex reporting procedures, this number might be mixing units and therefore, at best, it should only be considered an estimate of the number of people with a completed PCR test that return negative.
    Returns
    null
    if no data is available
negativeIncrease
    Field type: integer

    Increase in negative computed by subtracting the value of negative for the previous day from the value for negative from the current day.
    Returns
    null
    if no data is available
onVentilatorCumulative
    Field type: integer

    Cumulative on ventilator/Ever on ventilator

    Total number of individuals who have ever been hospitalized under advanced ventilation with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients on ventilation with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
onVentilatorCurrently
    Field type: integer

    Currently on ventilator/Now on ventilator

    Individuals who are currently hospitalized under advanced ventilation with COVID-19. Definitions vary by state / territory, and it is not always clear whether pediatric patients are included in this metric. Where possible, we report patients on ventilation with confirmed or suspected COVID-19 cases.
    Returns
    null
    if no data is available
pending
    Field type: integer

    Pending

    Total number of viral tests that have not been completed as reported by the state or territory.
    Returns
    null
    if no data is available
posNeg
    Field type: integer

    Deprecated. Computed by adding positive and negative values.
    Returns
    null
    if no data is available
positive
    Field type: integer

    Cases (confirmed plus probable)

    Total number of confirmed plus probable cases of COVID-19 reported by the state or territory, ideally per the August 5, 2020 CSTE case definition. Some states are following the older April 5th, 2020 CSTE case definition or using their own custom definitions. Not all states and territories report probable cases. If a state is not reporting probable cases, this field will just represent confirmed cases.
    Returns
    null
    if no data is available
positiveIncrease
    Field type: integer

    New cases

    The daily increase in API field positive, which measures Cases (confirmed plus probable) calculated based on the previous day’s value.
    Returns
    null
    if no data is available
recovered
    Field type: integer

    Recovered

    Total number of people that are identified as recovered from COVID-19. States provide very disparate definitions on what constitutes a “recovered” COVID-19 case. Types of “recovered” cases include those who are discharged from hospitals, released from isolation after meeting CDC guidance on symptoms cessation, or those who have not been identified as fatalities after a number of days (30 or more) post disease onset. Specifics vary for each state or territory.
    Returns
    null
    if no data is available
states
    Field type: integer

    States

    Only available in national records. The number of states and territories included in the US dataset for this day.
total
    Field type: integer

    Deprecated. Computed by adding positive, negative, and pending values.
    Returns
    null
    if no data is available
totalTestResults
    Field type: integer

    Total test results

    At the national level, this metric is a summary statistic which, because of the variation in test reporting methods, is at best an estimate of US viral (PCR) testing. Some states/territories report tests in units of test encounters, some report tests in units of specimens, and some report tests in units of unique people. Moreover, some jurisdictions include antigen tests in their total test counts without reporting a separate total of viral (PCR) tests. Therefore, this value is an aggregate calculation of heterogeneous figures. Please consult each state or territory’s individual data page to see whether that jurisdiction lumps antigen and PCR tests together and to see what units that jurisdiction uses for test reporting.

    In most states, the totalTestResults field is currently computed by adding positive and negative values because, historically, some states do not report totals, and to work around different reporting cadences for cases and tests. In Colorado, Delaware, the District of Columbia, Florida, Hawaii, Minnesota, Nevada, New York, North Dakota, Pennsylvania, Rhode Island, Virginia, Washington, and Wisconsin, where reliable testing encounters figures are available with a complete time series, we directly report those figures in this field. In Alaska, American Samoa, Arizona, Arkansas, California, Connecticut, Georgia, Illinois, Indiana, Kentucky, Maine, Maryland, Massachusetts, Michigan, Missouri, Montana, Nebraska, New Hampshire, New Mexico, North Carolina, Ohio, Oregon, South Dakota, Tennessee, Texas, Utah, Vermont, West Virginia, and Wyoming, where reliable specimens figures are available with a complete time series, we directly report those figures in this field. In Alabama and Idaho where reliable unique people figures are available with a complete time series, we directly report those figures in this field. We are in the process of switching all states over to use directly reported total figures, using a policy of preferring testing encounters, specimens, and people, in that order.
    Returns
    null
    if no data is available
totalTestResultsIncrease
    Field type: integer

    New tests

    Daily increase in totalTestResults, calculated from the previous day’s value. This calculation includes all the caveats associated with Total tests/totalTestResults, and we recommend against using it at the state/territory level.
    Returns
    null
    if no data is available