In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from libs.CovidDatasets import JHUDataset as LegacyJHUDataset
from libs.datasets import jhu_dataset
from libs.datasets import cds_dataset
from libs.datasets import nytimes_dataset
from libs.datasets import timeseries
from libs.datasets import AggregationLevel
from libs.datasets import dataset_utils
from libs import build_params
from matplotlib import pyplot
import run
import matplotlib.dates
pd.set_option('display.max_rows', 4000)

In [None]:
# Build the JHU source, wrap it in a TimeseriesDataset and take the
# country='USA', after='2020-03-01' subset.
# Fix: `jhu_timeseries_dataset` is never imported in this notebook, so the
# original line raised NameError on a fresh kernel. Use the imported
# `jhu_dataset.JHUDataset` loader, which this notebook already calls elsewhere.
jhu_data = jhu_dataset.JHUDataset.build_from_local_github()
timeseries_data = timeseries.TimeseriesDataset.from_source(jhu_data)
jhu_usa_data = timeseries_data.get_subset(country='USA', after='2020-03-01')


In [None]:
# Display the JHU USA data restricted to AggregationLevel.COUNTY and state 'MA'.
jhu_usa_data.get_aggregation_level(AggregationLevel.COUNTY).get_subset(state='MA')

In [None]:
# Build the NYTimes source from its URL and convert it with
# to_common(county_only=True).
nyt_data = nytimes_dataset.NYTimesTimeseriesData.build_from_url()
data = nyt_data.to_common(county_only=True)
# Filter to country 'USA' with get_date(after='2020-03-01') and keep the
# underlying `.data` object (presumably a DataFrame -- TODO confirm).
nyt_usa_data = data.get_country('USA').get_date(after='2020-03-01').data


In [None]:
# Build the CDS source from the local GitHub data and convert it with
# to_common(county_only=True). Note that `data` is reused from the NYTimes cell.
cds_data = cds_dataset.CDSTimeseriesData.build_from_local_github()
data = cds_data.to_common(county_only=True)
# Filter to country 'USA' with get_date(after='2020-03-01') and keep the
# underlying `.data` object (presumably a DataFrame -- TODO confirm).
cds_usa_data = data.get_country('USA').get_date(after='2020-03-01').data


In [None]:
def get_cds_ny_data(data):
    """Return the New York City rows of a CDS frame.

    Keeps rows whose ``state`` is ``'NY'`` and whose ``county`` is
    ``'New York'`` or whose ``city`` is ``'New York City'``.

    Args:
        data: DataFrame with ``state``, ``county`` and ``city`` columns.

    Returns:
        The matching rows of ``data``, keeping their original index labels.
    """
    ny_data = data[data.state == 'NY']
    # Build the mask from the already-filtered frame. Masking ``ny_data`` with a
    # Series computed on the full ``data`` frame has a mismatched index, which
    # makes pandas reindex the mask and emit a UserWarning.
    is_nyc = (ny_data.county == 'New York') | (ny_data.city == 'New York City')
    return ny_data[is_nyc]

# Display the date/city/county/cases columns of the rows get_cds_ny_data
# selects from `cds_data.data`, sorted by date.
get_cds_ny_data(cds_data.data)[['date', 'city', 'county', 'cases']].sort_values('date')

In [None]:
# Rebuild both sources and pass them to build_aggregate_county_data_frame.
cds_data = cds_dataset.CDSTimeseriesData.build_from_local_github()
# Fix: `jhu_timeseries_dataset` is never imported in this notebook (NameError on
# a fresh kernel); use the imported `jhu_dataset.JHUDataset` loader instead.
jhu_data = jhu_dataset.JHUDataset.build_from_local_github()
aggregate_county_data = dataset_utils.build_aggregate_county_data_frame(jhu_data, cds_data)

In [None]:
# Plot all data
# NOTE(review): pd.concat requires DataFrame/Series inputs -- confirm that
# `jhu_usa_data` is a frame (not a dataset wrapper) when this cell runs.
all_data = pd.concat([cds_usa_data, jhu_usa_data, nyt_usa_data])
# Fix: the original plotted `aggregate_data`, a name that is never defined in
# this notebook (NameError), while the `all_data` frame built above was unused.
dataset_utils.plot_grouped_data(
    all_data,
    ['date', 'source', 'aggregate_level'],
    series=['source', 'aggregate_level'],
    values='cases',
)

In [None]:
def get_jhu_post_321_data_without_county():
    """Plot per-state ``cases`` sums for JHU USA rows with a null ``county``.

    Builds the JHU source, filters it to country 'USA' with
    ``get_date(after='2020-03-21')``, keeps the rows whose ``county`` is null,
    and plots the ``cases`` total per ``state``.

    Returns:
        None; the function only draws a plot.
    """
    # Fix: `jhu_timeseries_dataset` is never imported in this notebook
    # (NameError); use the imported `jhu_dataset.JHUDataset` loader instead.
    jhu_data = jhu_dataset.JHUDataset.build_from_local_github()
    # NOTE(review): confirm JHUDataset exposes `to_common()`.
    data = jhu_data.to_common()
    jhu_usa_data = data.get_country('USA').get_date(after='2020-03-21').data
    without_county = jhu_usa_data[jhu_usa_data.county.isnull()]
    # Select `cases` before summing so non-numeric columns are never aggregated
    # (summing whole frames with string columns fails or concatenates on
    # recent pandas versions).
    without_county.groupby('state')['cases'].sum().plot()
    
    
def get_county_name_intersection_summary(jhu_usa_data, cds_usa_data):
    """Print how the county names of the JHU and CDS frames overlap.

    Args:
        jhu_usa_data: DataFrame with a ``county`` column.
        cds_usa_data: DataFrame with ``county`` and ``cases`` columns.

    Returns:
        None; the summary is written to stdout.
    """
    # Drop missing names on both sides so NaN is never counted as a county
    # (the original only dropped them for CDS).
    jhu_counties = set(jhu_usa_data.county.dropna().unique().tolist())
    cds_counties = set(cds_usa_data.county.dropna().unique().tolist())

    num_intersect = len(cds_counties & jhu_counties)
    missing_cds_counties = cds_counties - jhu_counties

    # Cases recorded by CDS in counties that JHU does not list. A boolean mask
    # is used so an empty set of missing counties simply sums to 0.
    in_missing_county = cds_usa_data.county.isin(missing_cds_counties)
    total_cases_missing = cds_usa_data.loc[in_missing_county, 'cases'].sum()
    total_cases = cds_usa_data.cases.sum()

    print(f"Total JHU: {len(jhu_counties)}")
    print(f"Total CDS: {len(cds_counties)}")
    print(f"Intersection: {num_intersect}")
    print(f"Num Missing CDS in JHU: {len(missing_cds_counties)}")
    print(f"Cases in Missing CDS counties: {total_cases_missing} of {total_cases}")

In [None]:
# Print the county-name overlap summary for the JHU and CDS USA data.
get_county_name_intersection_summary(jhu_usa_data, cds_usa_data)


In [None]:

def get_aggregate_county_data(data, by_state=False, by_date=False):
    """Sum numeric columns, split by whether ``county`` is missing.

    Args:
        data: DataFrame with a ``county`` column (plus ``date`` / ``state``
            when the matching flag is set).
        by_date: Also group by the ``date`` column.
        by_state: Also group by the ``state`` column.

    Returns:
        DataFrame with one row per group. Its ``county`` column is the boolean
        "county is null" group key, followed by any extra group columns and
        the summed numeric columns.
    """
    group_fields = [data.county.isnull()]
    if by_date:
        group_fields.append('date')
    if by_state:
        group_fields.append('state')

    # numeric_only=True keeps string columns out of the aggregate. Without it,
    # recent pandas tries to sum the original `county` strings and the
    # reset_index() below then collides with the `county` group key.
    return data.groupby(group_fields).sum(numeric_only=True).reset_index()
    
def get_county_delta(
    data,
    metric='cases',
    by_state=False,
    by_date=False
):
    """Aggregate ``data`` by missing-county flag and index it by the group keys.

    Args:
        data: DataFrame passed through to ``get_aggregate_county_data``.
        metric: Currently unused; kept so existing callers keep working.
        by_state: Also group and index by ``state``.
        by_date: Also group and index by ``date``.

    Returns:
        The aggregated frame indexed by ``county`` (the boolean
        "county is null" flag), then ``state`` and ``date`` when requested.
    """
    results = get_aggregate_county_data(data, by_state=by_state, by_date=by_date)
    index = ['county']
    if by_state:
        index.append('state')
    if by_date:
        index.append('date')

    # The original had an unfinished delta calculation after this return; it
    # was unreachable and referenced an undefined `no_county`, so it is removed.
    # TODO: compute a per-group `delta` of `metric` between rows with and
    # without a county once the intended alignment is decided.
    return results.set_index(index)

def plot_deltas(data, non_zero=False):
    """Bar-plot the ``delta`` column for rows whose ``county`` equals False.

    Args:
        data: Frame exposing ``delta`` and ``county`` columns.
        non_zero: When true, rows with a ``delta`` of 0 are dropped first.
    """
    frame = data[data.delta != 0] if non_zero else data
    # Explicit `== False` comparison is kept as-is to preserve the selection.
    selected = frame[frame.county == False]
    selected.delta.plot(kind='bar', figsize=(15, 15))
        

# Aggregate the JHU USA data per date (not per state) and display the result.
results = get_county_delta(jhu_usa_data, by_date=True, by_state=False)
results
# Earlier experiments, left disabled:
# plot_deltas(results, non_zero=True)
# results['delta'] = (results.loc[True] - results.loc[False])['cases']


In [None]:
# Rebuild the CDS data, this time with to_common() called without
# county_only, and filter it to country 'USA' with get_date(after='2020-03-01').
cds_data = cds_dataset.CDSTimeseriesData.build_from_local_github()
data = cds_data.to_common()
cds_usa_data = data.get_country('USA').get_date(after='2020-03-01').data

# Re-bind cds_usa_data to a copy indexed by the identifying columns, then run
# the project's index-uniqueness check on it.
cds_usa_data = cds_usa_data.set_index(['date', 'country', 'state', 'county', 'aggregate_level', 'source'])
dataset_utils.check_index_values_are_unique(cds_usa_data)

In [None]:
# Split the CDS USA data into its state-level and county-level rows.
state = cds_usa_data.query('aggregate_level == "state"')
county = cds_usa_data.query('aggregate_level == "county"')

# Compare the two subsets on ['date', 'state'] using the 'cases' values.
# Note: this re-binds `all_data`, which an earlier cell also assigns.
all_data, matching, not_matching = dataset_utils.compare_datasets(
    state, county, ['date', 'state'], values='cases'
)
# One bar chart of `delta_ratio` per state for the rows reported as not matching.
not_matching.groupby(['state']).plot(kind='bar', y='delta_ratio', figsize=(10, 7))


# county_level = state_aggregated[state_aggregated.aggregate_level == 'county']
# state_level = state_aggregated[state_aggregated.aggregate_level == 'county']

In [None]:
# Build the JHU dataset and call its private _fill_missing_state_with_county
# helper, which returns two objects bound here as `existing` and `new`.
jhu_data = jhu_dataset.JHUDataset.build_from_local_github()
existing, new = jhu_data._fill_missing_state_with_county()
state_groupby_fields = ['date', 'Country_Region', 'Province_State']
# Restrict both frames to Province_State 'MA' (`existing` is also limited to
# Country_Region 'USA').
existing_ma = existing[existing['Country_Region'] == 'USA'].query("Province_State == 'MA'")
new_ma = new.query("Province_State == 'MA'")
# new
# Index both by the state-level key and display, per `new_ma` row, whether
# its key is already present in `existing_ma`.
existing_ma = existing_ma.set_index(state_groupby_fields)
new_ma = new_ma.set_index(state_groupby_fields)
new_ma.index.isin(existing_ma.index)



