In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta

# Raw CSV of per-county COVID-19 figures published in the NYT GitHub repository.
NYT_URL = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'

# County names grouped into two lists; only their union (ALL_COUNTIES) is used for filtering.
NORCAL_COUNTIES = [
    'Alameda',
    'Contra Costa',
    'Fresno',
    'San Francisco',
    'Merced',
    'San Mateo',
    'Santa Clara',
    'Santa Cruz',
    'Tulare',
]
SOCAL_COUNTIES = [
    'Imperial',
    'Kern',
    'Los Angeles',
    'Orange',
    'Riverside',
    'San Bernardino',
    'San Diego',
    'Santa Barbara',
    'San Luis Obispo',
    'Ventura',
]
ALL_COUNTIES = [*NORCAL_COUNTIES, *SOCAL_COUNTIES]

# Smallest week number kept by the analysis; rows from earlier weeks are dropped.
FIRST_WEEK = 9

In [2]:
# Read the NYT county-level CSV and keep only the selected California counties.
# Only the columns used downstream are parsed; the file is still read in chunks
# and concatenated into a single frame.
wanted_columns = ['date', 'county', 'state', 'cases', 'deaths']
chunks = pd.read_csv(NYT_URL, usecols=wanted_columns, chunksize=150000)
df = pd.concat(chunks)[wanted_columns]

# One combined boolean mask instead of two successive filters. The explicit
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
is_selected = (df['state'] == 'California') & df['county'].isin(ALL_COUNTIES)
df = df.loc[is_selected].copy()

In [3]:
# Convert 'date' column to a datetime object and get all weeks >= first week
df['date'] = pd.to_datetime(df['date'])

# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so cast back to int64 to keep the dtype the old accessor produced.
df['week number'] = df['date'].dt.isocalendar().week.astype('int64')

# NOTE(review): only the week number is kept, not the year. If the source data
# spans more than one calendar year, rows from different years that share a week
# number will be treated as the same week downstream - confirm this is intended.
# .copy() so later column assignments operate on an independent frame.
df = df[df['week number'] >= FIRST_WEEK].copy()

In [4]:
# Date of the first row (in current row order) whose week number equals FIRST_WEEK,
# i.e. the first tracked date among the chosen counties.
start_covid_date = df.loc[df['week number'].eq(FIRST_WEEK), 'date'].iloc[0]

In [5]:
# Attach a 'start of week date' to every row based on its week number.
#
# The distinct week numbers are ranked in order of first appearance; the n-th
# distinct week (n = 1, 2, ...) is labelled with the Monday of the week that
# contains start_covid_date plus 7 * n days. This reproduces the values of the
# previous per-week loop in one vectorised step, without writing through
# Series.values (which only works when pandas hands back a writable view).
#
# NOTE(review): because n starts at 1, the first week is labelled with the Monday
# *after* the week containing start_covid_date. This offset is kept as-is so the
# output stays unchanged - confirm whether it is intended.
# NOTE(review): labels advance by exactly one week per distinct week number, so a
# week number missing from the data would shift all later labels - confirm the
# data has no gaps.
week_start_base = start_covid_date - timedelta(days=start_covid_date.weekday())
week_position, week_numbers = pd.factorize(df['week number'])
df['start of week date'] = week_start_base + pd.to_timedelta(7 * (week_position + 1), unit='D')

In [6]:
# Get the total number of cases and deaths for each week for each selected county.
# max() is used instead of sum() because the NYT reports running totals of cases
# and deaths rather than individual new cases, so the largest value within a week
# is that week's total.
#
# The aggregated columns are selected with a list: selecting several columns from
# a groupby with a bare tuple (['cases', 'deaths'] without the inner brackets) was
# deprecated in pandas 1.0 and raises an error in pandas 2.0.
group_keys = ['week number', 'county', 'state', 'start of week date']
df = df.groupby(group_keys, as_index=False)[['cases', 'deaths']].max()

In [7]:
# Select the output columns, give them their final names, order the rows by
# county and week, then write the result to a CSV.
# Mapping order defines the column order of the output.
output_columns = {
    'week number': 'Week',
    'county': 'County Name',
    'cases': 'Cases',
    'deaths': 'Deaths',
    'start of week date': 'Start of Week Date',
}
df = (
    df.reset_index(drop=True)[list(output_columns)]
    .rename(columns=output_columns)
    .sort_values(by=['County Name', 'Week'], ascending=True)
)
# The frame's index is written as the first (unnamed) CSV column; it carries the
# pre-sort row positions because the index is reset before sorting.
df.to_csv('../parsed-csvs/covid-19-cases-deaths.csv')