## Data Source
- CaseDeathTest: $\href{https://data.chhs.ca.gov/dataset/covid-19-time-series-metrics-by-county-and-state/resource/046cdd2b-31e5-4d34-9ed3-b48cdbc4be7a}{\text{Statewide COVID-19 Cases Deaths Tests}}$
- Hospital: $\href{https://data.ca.gov/dataset/covid-19-hospital-data1/resource/0d9be83b-5027-41ff-97b2-6ca70238d778}{\text{Statewide Covid-19 Hospital County Data}}$
- Vaccination: $\href{https://data.ca.gov/dataset/covid-19-vaccine-progress-dashboard-data/resource/c020ef6b-2116-4775-b11d-9df2875096ab}{\text{Statewide COVID-19 Vaccines Administered By County}}$

Download date: 3/2/2022

## Version Control
- v1: joint the 3 original datasets
- v2: rearrange order(ascending time), remove trivial features
- v3(current): treated missing values

In [1]:
import numpy as np
import pandas as pd

In [2]:
CaseDeathTest = pd.read_csv("CaseDeathTest.csv")
Hospital = pd.read_csv("Hospital.csv")
Vaccination = pd.read_csv("Vaccination.csv")

In [3]:
# check if imported properly
'''print(CaseDeathTest.head())
print(Hospital.head())
print(Vaccination.head())'''

'print(CaseDeathTest.head())\nprint(Hospital.head())\nprint(Vaccination.head())'

In [4]:
# check original length
print("Length of CaseDeathTest", len(CaseDeathTest))
print("Length of Hospital", len(Hospital))
print("Length of Vaccination", len(Vaccination))  # shorter because already sifted out on ventura

Length of CaseDeathTest 46421
Length of Hospital 39365
Length of Vaccination 584


In [5]:
# print all possible traits
print(CaseDeathTest.columns,len(CaseDeathTest.columns))
print(Hospital.columns,len(Hospital.columns))
print(Vaccination.columns,len(Vaccination.columns))

Index(['date', 'area', 'area_type', 'population', 'cases', 'cumulative_cases',
       'deaths', 'cumulative_deaths', 'total_tests', 'cumulative_total_tests',
       'positive_tests', 'cumulative_positive_tests', 'reported_cases',
       'cumulative_reported_cases', 'reported_deaths',
       'cumulative_reported_deaths', 'reported_tests'],
      dtype='object') 17
Index(['county', 'todays_date', 'hospitalized_covid_confirmed_patients',
       'hospitalized_suspected_covid_patients', 'hospitalized_covid_patients',
       'all_hospital_beds', 'icu_covid_confirmed_patients',
       'icu_suspected_covid_patients', 'icu_available_beds'],
      dtype='object') 9
Index(['county', 'administered_date', 'total_doses', 'cumulative_total_doses',
       'pfizer_doses', 'cumulative_pfizer_doses', 'moderna_doses',
       'cumulative_moderna_doses', 'jj_doses', 'cumulative_jj_doses',
       'partially_vaccinated', 'total_partially_vaccinated',
       'fully_vaccinated', 'cumulative_fully_vaccinated', '

In [6]:
# select only cases from Ventura
CaseDeathTest = CaseDeathTest.loc[CaseDeathTest["area"] == "Ventura"]
Hospital = Hospital.loc[Hospital["county"] == "Ventura"]
Vaccination = Vaccination.loc[Vaccination["county"] == "Ventura"]  # sanity check

In [7]:
# check length again
print("Length of CaseDeathTest", len(CaseDeathTest))
print("Length of Hospital", len(Hospital))
print("Length of Vaccination", len(Vaccination))  # works perfectly

Length of CaseDeathTest 761
Length of Hospital 703
Length of Vaccination 584


In [8]:
# join the 3 DataFrame, now based on date
Hospital = Hospital.rename(columns = {"todays_date":"date"})  # unify key column name
Vaccination = Vaccination.rename(columns = {"administered_date":"date"})

# merge all three into one, joint on "date"
temp = Vaccination.merge(Hospital, on = "date")  # merge two at a time
Ventura = temp.merge(CaseDeathTest, on = "date")
Ventura.shape  # rid of 2 col of indices

(583, 43)

In [9]:
# sort data by date
Ventura["date"] = pd.to_datetime(Ventura["date"])
Ventura = Ventura.sort_values(by = "date")
# drop trait "population": population constant at 852747, not helpful
# drop trait "county_x", "county_y": only one county used
# drop trait "california_flag", "area_type": trivial
Ventura.drop(["population", "county_x", "county_y", "california_flag", "area_type"],\
             inplace = True, axis = 1)  # drop col 
Ventura

Unnamed: 0,date,total_doses,cumulative_total_doses,pfizer_doses,cumulative_pfizer_doses,moderna_doses,cumulative_moderna_doses,jj_doses,cumulative_jj_doses,partially_vaccinated,...,cumulative_deaths,total_tests,cumulative_total_tests,positive_tests,cumulative_positive_tests,reported_cases,cumulative_reported_cases,reported_deaths,cumulative_reported_deaths,reported_tests
0,2020-07-27,0,0,0,0,0,0,0,0,0,...,84.0,4172.0,152405,313.0,9446,154.0,6998.0,1.0,68.0,3963.0
1,2020-07-28,0,0,0,0,0,0,0,0,0,...,86.0,3074.0,155479,171.0,9617,47.0,7045.0,0.0,68.0,2221.0
90,2020-07-29,1,1,0,0,1,1,0,0,0,...,86.0,2449.0,157928,173.0,9790,116.0,7161.0,0.0,68.0,1903.0
2,2020-07-30,0,1,0,0,0,1,0,0,0,...,87.0,2768.0,160696,170.0,9960,143.0,7304.0,1.0,69.0,3886.0
91,2020-07-31,1,2,1,1,0,1,0,0,0,...,89.0,2788.0,163484,187.0,10147,217.0,7521.0,2.0,71.0,3249.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,2022-02-25,1404,1526767,753,814586,335,619189,16,48217,278,...,1412.0,3852.0,2798625,113.0,215547,0.0,171292.0,0.0,1411.0,
579,2022-02-26,760,1527527,435,815021,151,619340,8,48225,135,...,1412.0,1385.0,2800010,71.0,215618,0.0,171292.0,0.0,1411.0,
580,2022-02-27,358,1527885,214,815235,90,619430,4,48229,67,...,1412.0,776.0,2800786,29.0,215647,368.0,171660.0,12.0,1423.0,15026.0
581,2022-02-28,999,1528884,565,815800,237,619667,25,48254,145,...,1412.0,556.0,2801342,14.0,215661,-46.0,171614.0,0.0,1423.0,1808.0


 ## Treat Missing Values

In [11]:
# first, make sure numerical traits are recognized
notNum = ["date"]
for trait in Ventura.columns[Ventura.columns!="date"]:
    if Ventura[trait].dtypes != "int64":
        try:
            Ventura[trait] = pd.to_numeric(Ventura[trait])
        except Exception:
            notNum.append(trait)
print(notNum)  # keep track of non-numerical traits

['date', 'area']


In [12]:
print("Number of missing values before treatment: ", Ventura.isnull().values.sum())
for trait in Ventura.columns:
    if trait not in notNum:  # only consider numerical traits
        for i in range(len(Ventura[trait])):
            if pd.isnull(Ventura[trait].loc[i]):  # if missing data:
                preFind, posFind = False, False
                preInd, posInd = 1,1
                try:
                    while not preFind:
                        if not pd.isnull(Ventura[trait].loc[i-preInd]):  # valid value
                            pre = Ventura[trait].loc[i-preInd]
                            preFind = True
                        else:
                            preInd -= 1
                    while not posFind:
                        if not pd.isnull(Ventura[trait].loc[i+posInd]):  # valid value
                            pos = Ventura[trait].loc[i+posInd]
                            posFind = True
                        else:
                            posInd += 1
                except Exception:  # out of range or so
                    print("warning: check {}, {}".format(trait, i))
                    break
                Ventura[trait].loc[i] = np.mean([pre,pos])
                # print([pre,Ventura[trait].loc[i],pos])
                # print(trait): 69 reported_tests 

Number of missing values before treatment:  69


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [13]:
print("Number of missing values after treatment: ", Ventura.isnull().values.sum())

Number of missing values after treatment:  2


In [14]:
# export to csv file and check
Ventura.to_csv("JointData.csv", index = False)  # do not foget to mark version locally

## Select Traits
- Use Cases as ground truth(y)
    - CaseDeathTest
    - Total number of laboratory-confirmed COVID-19 cases with episode date on the provided date
- Traits I consider(daily, total):
    - CaseDeathTest
        - Deaths
            - covid related death
        - Total_Tests
            - Total number of COVID-19 molecular tests (PCR tests only)
            - Associated with specimen collection date
        - Positive_Tests
            - Total number of positive COVID-19 molecular tests(PCR test only)
            - Associated with specimen collection date
        - Reported_Tests
            - Total number of COVID-19 molecular tests reported to the California Department of Public Health on the provided date
    - Hospital
        - hospitalized_covid_confirmed_patients
            - The number of patients hospitalized in an inpatient bed who have laboratory-confirmed COVID
        - hospitalized_covid_patients
        - all_hospital_beds
        - icu_covid_confirmed_patients
        - icu_suspected_covid_patients
        - icu_available_beds
    - Vaccination
        - total_doses
        - pfizer_doses
        - moderna_doses
        - jj_doses
        - at_least_one_dose
        - partially_vaccinated
        - fully_vaccinated
        - booster_recip_count

In [15]:
# now select the traits we want
temp = Ventura[["date", "deaths", "total_tests", "positive_tests", "hospitalized_covid_confirmed_patients",
         "icu_covid_confirmed_patients","total_doses", "fully_vaccinated", "booster_recip_count", "cases"]]
temp

Unnamed: 0,date,deaths,total_tests,positive_tests,hospitalized_covid_confirmed_patients,icu_covid_confirmed_patients,total_doses,fully_vaccinated,booster_recip_count,cases
0,2020-07-27,2.0,4172.0,313.0,86.0,24.0,0,0,0,207.0
1,2020-07-28,2.0,3074.0,171.0,87.0,22.0,0,0,0,128.0
90,2020-07-29,0.0,2449.0,173.0,78.0,25.0,1,0,0,126.0
2,2020-07-30,1.0,2768.0,170.0,80.0,22.0,0,0,0,105.0
91,2020-07-31,2.0,2788.0,187.0,85.0,25.0,1,0,0,112.0
...,...,...,...,...,...,...,...,...,...,...
578,2022-02-25,0.0,3852.0,113.0,105.0,19.0,1404,347,732,66.0
579,2022-02-26,0.0,1385.0,71.0,92.0,17.0,760,231,374,39.0
580,2022-02-27,0.0,776.0,29.0,89.0,17.0,358,86,195,22.0
581,2022-02-28,0.0,556.0,14.0,86.0,11.0,999,264,567,11.0


In [None]:
# predict: icu bed in short or not tomorrow
# logistic regression, high dimension time series & deep learning
# LSTM
    # periodic, sequence data
# challenge
    # convolution
    # code or package?

    
# Regression(death, cases) vs Classification(hospital bed in short?)
# Next step:
    # Data(trait, trend)
    # Relative paper(algorithm)
        # keyword: deep learning, covid 