In [36]:
import pandas as pd
import numpy as np
import os
import datetime

In [37]:
import git
import sys
# Find the git repository that contains the current directory (searching
# parent directories as needed) and remember its working-tree root.
repo = git.Repo(path="./", search_parent_directories=True)
homedir = repo.working_dir

In [38]:
#Death Data
# NYT file: drop the county/state name columns, order rows by (fips, date),
# then cast the key columns and build a "<fips>, <date>" string join key.
NYT_DC = (
    pd.read_csv(f"{homedir}/data/us/covid/nyt_us_counties_daily.csv")
    .drop(columns=['county','state'])
    .sort_values(['fips','date'])
    .reset_index(drop=True)
    .assign(
        fips=lambda frame: frame['fips'].astype(int),
        date=lambda frame: pd.to_datetime(frame['date']),
        id=lambda frame: frame['fips'].astype(str).str.cat(frame['date'].astype(str), sep=', '),
    )
)

# JHU file: same key handling, using its capitalised FIPS/Date column names.
JHU_DC = pd.read_csv(f"{homedir}/data/us/covid/JHU_daily_US.csv").assign(
    FIPS=lambda frame: frame['FIPS'].astype(int),
    Date=lambda frame: pd.to_datetime(frame['Date']),
    id=lambda frame: frame['FIPS'].astype(str).str.cat(frame['Date'].astype(str), sep=', '),
)
# Negative 'Active' values have their sign flipped; all other values are kept.
JHU_DC['Active'] = JHU_DC['Active'].mask(JHU_DC['Active'] < 0, -JHU_DC['Active'])


In [14]:
#Making a base dataframe to put all data into
# Every county code and every date that appears in either the NYT or the JHU table.
County_List = list(set(NYT_DC.fips.unique()).union(set(JHU_DC.FIPS.unique())))
Date_List = list(set(NYT_DC.date.unique()).union(set(JHU_DC.Date.unique())))
# Base dataframe holding every (FIPS, Date) pair drawn from those two lists.
# The pairs are generated with the public MultiIndex.from_product API instead of
# the private pd.core.reshape.util.cartesian_product helper, which is not part
# of pandas' documented interface.
_fips_date_pairs = pd.MultiIndex.from_product([County_List, Date_List], names=['FIPS', 'Date'])
# CL / DL are kept as flat arrays of the paired values, as before.
CL = _fips_date_pairs.get_level_values('FIPS').to_numpy()
DL = _fips_date_pairs.get_level_values('Date').to_numpy()
BaseFrame = pd.DataFrame(dict(FIPS=CL, Date=DL)).sort_values(['FIPS','Date']).reset_index(drop=True)
# String key "<FIPS>, <Date>" used to join the other tables onto this frame.
BaseFrame['id'] = BaseFrame.FIPS.astype(str).str.cat(BaseFrame.Date.astype(str), sep=', ')

In [15]:
#DataFrame to contain Y values: I.E. number of deaths, cases per county per day, target of LSTM Model
# Left-join both sources onto the full (FIPS, Date) grid via the string 'id' key.
Death_Daily = BaseFrame.join(JHU_DC.set_index('id'), on = 'id', how = 'left', lsuffix='', rsuffix='b')
Death_Daily = Death_Daily.join(NYT_DC.set_index('id'), on = 'id', how = 'left', lsuffix='x', rsuffix='y')
# Remove the join key and the duplicated key columns brought in by the joins.
Death_Daily = Death_Daily.drop(columns= ['id','FIPSb','Dateb','fips','date'])
Death_Daily = Death_Daily.drop_duplicates(subset=['FIPS','Date'])

# Take the larger of the two sources per row, ignoring NaN. DataFrame.max skips
# NaN by default and yields NaN when both values are missing, which matches
# np.nanmax but without the "All-NaN slice encountered" RuntimeWarning that
# np.nanmax raises for grid rows absent from both sources.
Death_Daily['Confirmed_Cases'] = Death_Daily[['Confirmed','cases']].max(axis=1)
Death_Daily['Confirmed_Deaths'] = Death_Daily[['Deaths','deaths']].max(axis=1)
Death_Daily = Death_Daily.drop(columns=['Confirmed','Deaths','cases','deaths'])
Death_Daily = Death_Daily.dropna(subset=['FIPS','Date'])
# Remaining missing values (rows with no data in either source) become 0.
Death_Daily = Death_Daily.fillna(0)
Death_Daily = Death_Daily.sort_values(['FIPS','Date'])
Death_Daily.to_csv('Death_Daily.csv')

  import sys
  


In [17]:
#Standard Mobility DL data
# Read the table (first CSV column is a saved index and is discarded), cast the
# key columns, and add the "<fips>, <date>" string join key.
Mobility = (
    pd.read_csv(f"{homedir}/models/processing/USA/County_Based/DL_Mobility.csv", index_col=0)
    .reset_index(drop=True)
    .assign(
        fips=lambda frame: frame['fips'].astype(int),
        date=lambda frame: pd.to_datetime(frame['date']),
        id=lambda frame: frame['fips'].astype(str).str.cat(frame['date'].astype(str), sep=', '),
    )
)

Mobility.head()

Unnamed: 0,date,fips,samples,m50,m50_index,id
0,2020-03-01,1001,1703,7.194,49,"1001, 2020-03-01"
1,2020-03-02,1001,1829,14.587,100,"1001, 2020-03-02"
2,2020-03-03,1001,1840,13.865,95,"1001, 2020-03-03"
3,2020-03-04,1001,1900,13.88,95,"1001, 2020-03-04"
4,2020-03-05,1001,1825,14.659,100,"1001, 2020-03-05"


In [13]:
#Google Mobility Data
# Load with the saved index column discarded, then normalise the key columns.
Google = pd.read_csv(
    f"{homedir}/models/processing/USA/County_Based/google_new_filled.csv",
    index_col=0,
).reset_index(drop=True)
Google['FIPS'] = Google['FIPS'].astype(int)
Google['Date'] = pd.to_datetime(Google['Date'])
# "<FIPS>, <Date>" string key used for joining onto the base frame.
fips_text = Google['FIPS'].astype(str)
date_text = Google['Date'].astype(str)
Google['id'] = fips_text.str.cat(date_text, sep=', ')
Google.head()

Unnamed: 0,FIPS,Date,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,residential_percent_change_from_baseline,retail_and_recreation_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,id
0,1001,2020-02-15,7.0,5.522819,-1.527507,5.0,6.364064,-4.0,"1001, 2020-02-15"
1,1001,2020-02-16,1.0,-23.0,1.0,0.0,-5.888442,-4.0,"1001, 2020-02-16"
2,1001,2020-02-17,0.0,3.275069,5.0,8.0,-2.629827,-27.0,"1001, 2020-02-17"
3,1001,2020-02-18,0.0,-4.0,0.0,-2.0,-7.607523,2.0,"1001, 2020-02-18"
4,1001,2020-02-19,0.0,5.016194,0.0,-2.0,-1.303654,2.0,"1001, 2020-02-19"


In [12]:
#Air Quality Data
# Read the table (dropping the saved index column), cast FIPS to int and Date
# to datetime, then append the "<FIPS>, <Date>" string join key.
Air_Qual = (
    pd.read_csv(f"{homedir}/models/processing/USA/County_Based/Air_Qual_Filled.csv", index_col=0)
    .reset_index(drop=True)
    .assign(
        FIPS=lambda frame: frame['FIPS'].astype(int),
        Date=lambda frame: pd.to_datetime(frame['Date']),
        id=lambda frame: frame['FIPS'].astype(str).str.cat(frame['Date'].astype(str), sep=', '),
    )
)

Air_Qual.head()

Unnamed: 0,FIPS,Date,OZONE_AQI,PM10_AQI,PM25_AQI,NO2_AQI,CO,SO2,id
0,1001,2020-01-01,16.422424,10.125737,54.5,7.46802,0.287662,0.428571,"1001, 2020-01-01"
1,1001,2020-01-02,22.169518,10.668188,54.8125,13.391453,0.32602,0.583333,"1001, 2020-01-02"
2,1001,2020-01-03,17.153856,10.328083,29.875,12.347675,0.353808,0.133333,"1001, 2020-01-03"
3,1001,2020-01-04,25.211669,10.22094,20.3125,4.207851,0.214592,0.0,"1001, 2020-01-04"
4,1001,2020-01-05,18.645173,9.011376,35.875,6.152528,0.252093,0.066667,"1001, 2020-01-05"


In [27]:
#DataFrame to contain X values, Factors to help predict future death/case counts
# Each factor table is left-joined onto the (FIPS, Date) grid through the string
# 'id' key. The right-hand tables' own FIPS/Date (fips/date) columns duplicate
# the grid's keys, so they are dropped *before* joining; this avoids creating
# repeated 'FIPS_X' / 'Date_X' labels in the intermediate frame and the need to
# strip them afterwards. The resulting columns and rows are unchanged.
Factors_Daily = (
    BaseFrame
    .join(Air_Qual.set_index('id').drop(columns=['FIPS','Date']), how='left', on='id')
    .join(Google.set_index('id').drop(columns=['FIPS','Date']), how='left', on='id')
    .join(Mobility.set_index('id').drop(columns=['date','fips']), how='left', on='id')
    .sort_values(['FIPS','Date'])
)
Factors_Daily.to_csv('Factors_Daily.csv')

In [28]:
# Display the assembled factor table as the cell's output.
Factors_Daily

Unnamed: 0,FIPS,Date,id,OZONE_AQI,PM10_AQI,PM25_AQI,NO2_AQI,CO,SO2,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,residential_percent_change_from_baseline,retail_and_recreation_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,samples,m50,m50_index
0,1001,2020-01-21,"1001, 2020-01-21",21.424662,11.462195,48.4000,9.718369,0.223754,0.133333,,,,,,,,,
1,1001,2020-01-22,"1001, 2020-01-22",21.995240,12.545221,48.3750,15.605490,0.302779,4.307692,,,,,,,,,
2,1001,2020-01-23,"1001, 2020-01-23",23.186707,12.136450,53.5000,13.181550,0.269936,2.727273,,,,,,,,,
3,1001,2020-01-24,"1001, 2020-01-24",19.191761,11.321569,47.4375,10.760020,0.287658,0.166667,,,,,,,,,
4,1001,2020-01-25,"1001, 2020-01-25",18.930764,10.056742,30.7500,6.549899,0.297778,0.000000,,,,,,,,,
5,1001,2020-01-26,"1001, 2020-01-26",11.330019,13.064305,48.7500,8.331218,0.240231,0.285714,,,,,,,,,
6,1001,2020-01-27,"1001, 2020-01-27",20.389567,11.228655,36.5000,11.223603,0.300864,0.000000,,,,,,,,,
7,1001,2020-01-28,"1001, 2020-01-28",14.890866,17.600639,46.0625,12.010569,0.309148,1.357143,,,,,,,,,
8,1001,2020-01-29,"1001, 2020-01-29",18.577054,15.671186,45.4375,15.163321,0.389702,0.923077,,,,,,,,,
9,1001,2020-01-30,"1001, 2020-01-30",19.472991,20.119365,43.5000,10.967973,0.320775,4.214286,,,,,,,,,
