In [None]:
import pandas as pd
import numpy as np
import csv

from datetime import datetime, date
import matplotlib.pyplot as plt

# Survey signals used as model input features.
# Add signals to the dataset by uncommenting them. Each enabled name is later
# looked up in overall-county-smoothed.csv as the column f"smoothed_{name}_weighted".
input_features = [
    'pct_cmnty_cli',
    #'pct_cli',
    #'pct_ili',
    #'pct_cli_anosmia_ageusia',
    #'pct_hh_cli',
    #'pct_hh_fever',
    #'pct_hh_sore_throat',
    #'pct_hh_cough',
    #'pct_hh_shortness_of_breath',
    #'pct_hh_difficulty_breathing',
    #'pct_self_fever',
    #'pct_self_cough',
    #'pct_self_shortness_of_breath',
    #'pct_self_difficulty_breathing',
    #'pct_self_tiredness_or_exhaustion',
    #'pct_self_nasal_congestion',
    #'pct_self_runny_nose',
    #'pct_self_muscle_joint_aches',
    #'pct_self_sore_throat',
    #'pct_self_persistent_pain_pressure_in_chest',
    #'pct_self_nausea_vomiting',
    #'pct_self_diarrhea',
    #'pct_self_anosmia_ageusia',
    #'pct_self_other',
    #'pct_self_none_of_above',
    #'pct_self_multiple_symptoms',
    #'pct_tested_and_positive',
    #'pct_worked_outside_home',
    #'pct_avoid_contact_all_or_most_time',
    #'pct_contact_covid_positive'
]
# Uncomment to use no survey signals at all; the smoothed case counts are always
# appended as the last input feature regardless of this list.
#input_features = []

In [None]:
# Import cases data
# The first CSV column is dropped; the remaining frame has "FIPS" followed by one
# column per date (labels in %m/%d/%y format).
newcases = (pd.read_csv("time_series_cases.csv")).iloc[:,1:]
# Replace every date column after the first with its difference from the previous column.
# NOTE(review): presumably this turns cumulative counts into daily new cases — confirm
# against the CSV. The first date column is left as-is.
newcases.iloc[:,2:] = np.array(newcases.iloc[:,2:]) - np.array(newcases.iloc[:,1:-1])
# Use the FIPS codes as row labels, rewrite the date labels as %Y-%m-%d, and
# transpose so that dates are rows (needed for the rolling window below).
indnames = newcases.pop("FIPS")
newcases = newcases.rename(index = indnames,
                           columns = lambda x: datetime.strptime(x, "%m/%d/%y").strftime("%Y-%m-%d")).T
# 7-row rolling mean over dates; the first 6 rows have incomplete windows and are
# dropped. Transpose back to one row per FIPS code, one column per date.
newcases = newcases.rolling(7).mean().iloc[6:,:].T

# Import R values, restrict to list of counties in fips.txt
R = pd.read_csv("RValues.csv", index_col='fips')
# The "Jurisdiction" column is removed from R; the popped values are not used.
R.pop("Jurisdiction")
# First column of fips.txt (no header row), as a numpy array of codes to keep
fips = pd.read_table("fips.txt", header = None)[0].to_numpy()
idx = [x in fips for x in R.index]
R = R.loc[idx,:]
R = R.sort_index()

# The index for newcases is the fips value
# Apply the same fips.txt filter and ordering to newcases as was applied to R
idx = [x in fips for x in newcases.index]
newcases = newcases.loc[idx,:]
newcases = newcases.sort_index()
# Column labels parsed to datetime.date for the date-range filtering done later
newcases_dates = [datetime.strptime(x,"%Y-%m-%d").date() for x in newcases.columns]

# Import the CMU data, limiting to entries with both gender and age_bucket as 'overall'
cmudf = pd.read_csv("overall-county-smoothed.csv").query("gender=='overall' & age_bucket=='overall'")
# Set the index as the identifier for the extracted features
def extract_feature(col_name):
    """Pivot one column of the survey data into a fips x date table.

    Reads the module-level ``cmudf`` frame and ``fips`` array.

    Parameters
    ----------
    col_name : str
        Name of the ``cmudf`` column to extract.

    Returns
    -------
    pandas.DataFrame
        One row per ``fips`` value that also appears in the module-level
        ``fips`` array (sorted by index), one column per ``date`` value.
        Duplicate (fips, date) entries are averaged; missing cells are 0.
    """
    # Pass the aggregation by name: recent pandas versions deprecate passing
    # np.mean here and will stop mapping it to the built-in grouped mean.
    df = pd.pivot_table(cmudf, values=col_name, index='fips', columns='date',
                        fill_value=0, aggfunc="mean")
    # Keep only the counties listed in fips.txt
    df = df.loc[df.index.isin(fips), :]
    return df.sort_index()

# Reference survey signal; used here to define the date range covered by the survey data
cli = extract_feature("smoothed_pct_cli")


def _parse_dates(labels):
    """Convert "%Y-%m-%d" column labels into datetime.date objects."""
    return [datetime.strptime(x, "%Y-%m-%d").date() for x in labels]


def _within(dates, lo, hi):
    """Return a list of booleans marking which dates fall inside [lo, hi]."""
    return [lo <= d <= hi for d in dates]


# The first R column is not parsed as a date and is always dropped by the mask below.
# NOTE(review): presumably a non-date column that remains after "Jurisdiction" was
# removed — confirm against RValues.csv.
R_dates = _parse_dates(R.columns[1:])
cli_dates = _parse_dates(cli.columns)
# Range bounds are computed once instead of once per column inside the comprehensions
cli_start, cli_end = np.min(cli_dates), np.max(cli_dates)
R = R.loc[:, [False, *_within(R_dates, cli_start, cli_end)]]

# Ensure each dataset represents the same time period
R_dates = _parse_dates(R.columns)
R_start, R_end = np.min(R_dates), np.max(R_dates)
newcases = newcases.loc[:, _within(newcases_dates, cli_start, cli_end)]
newcases_dates = _parse_dates(newcases.columns)
newcases = newcases.loc[:, _within(newcases_dates, R_start, R_end)]

# Convert from Pandas to Numpy data structures
# Negative smoothed case counts are clipped to 0; a trailing feature axis is added
np_newcases = newcases.to_numpy().clip(min=0)
np_newcases = np_newcases.reshape(np_newcases.shape[0], np_newcases.shape[1], 1)

# Start with zero features and grow along the last axis: (rows of R, columns of R, features)
np_inp = np.empty([*R.shape, 0], dtype=np.float32)
# Extract and format the features identified in cell 1
for name in input_features:
    feature = extract_feature(f"smoothed_{name}_weighted")
    feature_dates = _parse_dates(feature.columns)
    feature = feature.loc[:, _within(feature_dates, R_start, R_end)]
    # The concatenation requires each feature table to have exactly R's shape
    np_inp = np.concatenate((np_inp, feature.to_numpy().reshape([*feature.shape, 1])), axis=2)
# The smoothed case counts are always the last feature
np_inp = np.concatenate((np_inp, np_newcases), axis=2)
np_R = R.to_numpy()

In [None]:
import h5py as h5
# Number of random train/test splits (one HDF5 group each) written to the output file
num_datasets = 100
# Path of the HDF5 file that receives the generated datasets
dataset_name = "dataset.h5"
# num_in:  number of consecutive days in each input window
# num_out: number of consecutive days in each label window (directly after the input window)
# offset:  step, in days, between the starts of consecutive windows
num_in, num_out, offset = 21, 21, 5

def construct_set(inc_rows):
    """Build sliding-window samples for the given rows.

    Reads the module-level ``np_inp``, ``np_R``, ``R``, ``newcases``, ``num_in``,
    ``num_out`` and ``offset``. Windows start every ``offset`` days; for each
    window one sample is produced per row in ``inc_rows`` (window-major order).

    Parameters
    ----------
    inc_rows : iterable of int
        Row positions (first axis of ``np_inp`` / ``np_R`` / ``newcases``) to include.

    Returns
    -------
    dataset : numpy.ndarray, float32, shape (n_samples, num_in, np_inp.shape[2])
        Input windows. The last feature of each window is rescaled so that its
        maximum is 100; windows whose last feature is all zero are left at zero
        instead of being divided by zero.
    labels : numpy.ndarray, float32, shape (n_samples, num_out)
        Values of ``np_R`` for the ``num_out`` days following each input window.
    surges : numpy.ndarray, int8, shape (n_samples,)
        1 when the 7 days before the label window total more than 20 cases and
        the first 7 days of the label window total at least 1.3 times that, else 0.
    """
    n_features = np_inp.shape[2]
    date0 = datetime.strptime(newcases.columns[0], "%Y-%m-%d")
    # Day offset of every R column relative to date0; parsed once, not per window
    day_offsets = np.array(
        [(datetime.strptime(d, "%Y-%m-%d") - date0).days for d in R.columns]
    )

    # Accumulate in lists and concatenate once; np.append copies the whole array each call
    inputs, targets, surge_flags = [], [], []
    for days in range(0, np_inp.shape[1] - num_in - num_out, offset):
        # indices for input data
        # NOTE(review): the 1 + shift is applied to positions in the full R.columns,
        # so the input window covers positions days+1 .. days+num_in and shares its
        # last position with the first label position. Kept as-is; confirm intent.
        in_mask = (days <= day_offsets) & (day_offsets < days + num_in)
        j_in = 1 + np.where(in_mask)[0]

        # indices for results: positions in R.columns[1:], shifted back by 1
        out_start = days + num_in
        out_mask = (out_start <= day_offsets[1:]) & (day_offsets[1:] < out_start + num_out)
        j_out = 1 + np.where(out_mask)[0]
        first_out = j_out[0]

        for k in inc_rows:
            window = np.array(np_inp[k, j_in], dtype=np.float32).reshape((1, num_in, n_features))
            scale = window[:, :, -1].max()
            # A window with no cases has scale == 0; dividing would fill it with NaN
            if scale != 0.0:
                window[:, :, -1] = (window[:, :, -1] / scale) * 100
            inputs.append(window)

            targets.append(np.array(np_R[k, j_out], dtype=np.float32).reshape((1, num_out)))

            # Weekly totals for THIS row (k) around the start of the label window
            prev_week = np.sum(newcases.iloc[k, max(first_out - 7, 0):first_out])
            cur_week = np.sum(newcases.iloc[k, first_out:first_out + 7])
            # detect surges when cases are above 20 per week
            surge_flags.append(1 if prev_week > 20 and cur_week >= 1.3 * prev_week else 0)

    if inputs:
        dataset = np.concatenate(inputs, axis=0)
        labels = np.concatenate(targets, axis=0)
    else:
        dataset = np.empty((0, num_in, n_features), dtype=np.float32)
        labels = np.empty((0, num_out), dtype=np.float32)
    surges = np.array(surge_flags, dtype=np.int8)
    return dataset, labels, surges
        
# Seed for the random train/test splits. None draws fresh entropy on every run;
# set it to an int to make the generated file reproducible.
split_seed = None
split_rng = np.random.default_rng(split_seed)

with h5.File(dataset_name,"w") as f:
    n_rows = np_inp.shape[0]
    # Hold out 10% of the rows (rounded up) as the test set
    n_test = int(np.ceil(.1 * n_rows))

    # Run generate datasets num_datasets times, write their results to an h5 file
    for i in range(0, num_datasets):
        group_name = "group" + str(i)
        f.create_group(group_name)

        # Generate which rows will be part of the random test/train sets.
        # Sampling without replacement from range(n_rows) guarantees exactly n_test
        # distinct, valid rows; rounding a uniform draw could yield the out-of-range
        # value n_rows and silently shrink the test set.
        test_rows = np.sort(split_rng.choice(n_rows, size=n_test, replace=False))
        train_rows = np.setdiff1d(np.arange(n_rows), test_rows)

        # Generate the train set; its label maximum is the scale used for both sets
        train_dataset, train_labels, train_surges = construct_set(train_rows)
        # Per-feature maxima of the training inputs (computed but not written to the file)
        dataset_scales = train_dataset.max(axis=0).max(axis=0).reshape((1,1,train_dataset.shape[2]))
        label_scale = train_labels.max()
        train_dataset /= 100
        train_labels /= label_scale

        # Generate the test set, scaled with the training label scale
        test_dataset, test_labels, test_surges = construct_set(test_rows)
        test_dataset /= 100
        test_labels /= label_scale

        # One HDF5 dataset per array, all under this split's group
        outputs = {
            "train_dataset": train_dataset,
            "train_labels": train_labels,
            "test_dataset": test_dataset,
            "test_labels": test_labels,
            "label_scale": label_scale,
            "test_surge": test_surges,
        }
        for key, value in outputs.items():
            f[group_name + "/" + key] = value

    # END FOR EACH 0-num_datasets