In [None]:
from libs import notebook_helpers
# NOTE(review): presumably points the project libs at the covid-data-public
# checkout so the dataset loaders below can find their inputs — confirm against
# libs.notebook_helpers.
notebook_helpers.set_covid_data_public()

In [None]:
import pandarallel
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd
from covidactnow.datapublic.common_fields import CommonFields
from libs.datasets.sources import fips_population
from libs.datasets import combined_datasets
from libs.datasets.latest_values_dataset import LatestValuesDataset
from libs.datasets.timeseries import TimeseriesDataset

# Initialise pandarallel with its progress bar enabled.
# NOTE(review): no parallel_apply/parallel_map call is visible in this notebook —
# confirm this initialisation is still needed.
pandarallel.pandarallel.initialize(progress_bar=True)

In [None]:
# Load the combined US timeseries and reduce it to its latest values.
dataset = combined_datasets.load_us_timeseries_dataset()
latest = dataset.to_latest_values_dataset()

# Latest values of every individual data source, keyed by source name.
sources = notebook_helpers.load_data_sources_by_name()
sources_latest = {
    source_name: data_source.latest_values()
    for source_name, data_source in sources.items()
}

# Register the combined data as one more entry, tagged with its own "source" label.
combined_latest_data = latest.data.assign(source='Combined')
sources_latest["Combined Data"] = combined_latest_data

# Stack all per-source frames into a single dataset.
all_sources_latest = LatestValuesDataset(pd.concat(list(sources_latest.values())))

In [None]:
def build_prevalence_report(latest: LatestValuesDataset):

    data = latest.data.copy()
    if 'population' not in data.columns:
        pop = fips_population.FIPSPopulation.local()
        pop_map = pop.data.set_index('fips')["population"]
        data['population'] = data['fips'].map(pop_map)

    def classify_row(row):

        if row.aggregate_level == "state":
            return "state data"
        return row.state

    def count_with_values(x):
        return x.apply(lambda y: sum(~y.isna()))

    data["location_group"] = data.apply(classify_row, axis=1)


    counts_per_location = data.groupby("location_group").apply(count_with_values)
    columns_to_drop = ['state', 'country', 'aggregate_level', 'cumulative_hospitalized', 'cumulative_icu']
    columns_to_drop = [column for column in columns_to_drop if column in counts_per_location.columns]

    counts_per_location = counts_per_location.drop(columns_to_drop, axis='columns')
    counts_per_location["total_population"] = data.groupby("location_group").population.sum()
    counts_per_location = counts_per_location.sort_values("total_population", ascending=False).drop(["total_population"], axis='columns')

    return (
        counts_per_location
        .style
        .background_gradient(axis=1, cmap='RdYlGn')
    )


def data_availability_by_field(latest_dataset, field):
    """Build the prevalence report for one field, with one column per data source.

    The ``source`` column of ``latest_dataset.data`` is pivoted into columns, so
    each resulting column holds the value of ``field`` as reported by that
    source for a given (fips, aggregate_level, state) location.

    Args:
        latest_dataset: Object exposing a ``data`` DataFrame that contains the
            ``fips``, ``aggregate_level``, ``state`` and ``source`` columns as
            well as ``field``.
        field: Name of the column to compare across sources. It must not be
            one of the four key columns listed above.

    Returns:
        Whatever ``build_prevalence_report`` returns for the pivoted frame.
    """
    columns = [CommonFields.FIPS, CommonFields.AGGREGATE_LEVEL, CommonFields.STATE] + [field, "source"]
    # Use the dataset that was passed in (not a notebook global) and keep only
    # the key columns plus the requested field.
    data = latest_dataset.data[columns]
    data = data.set_index(["fips", "aggregate_level", "state", "source"])
    series = data[field]
    # "source" is the last index level; unstacking it yields one column per source.
    field_by_source = series.unstack(level=-1)
    # Replace the column index with its plain values, which drops the "source"
    # name before the key columns are moved back out of the index.
    field_by_source.columns = field_by_source.columns.get_level_values(0).values
    field_by_source = field_by_source.reset_index()
    field_by_source = LatestValuesDataset(field_by_source)
    return build_prevalence_report(field_by_source)



# Data availability for a specific field across data sources

In [None]:
# Key/metadata columns are not offered as selectable fields. 'source' is excluded
# as well: data_availability_by_field moves it into the index, so selecting it
# as the field raises a KeyError.
not_included_columns = ['fips', 'date', 'state', 'county', 'country', 'aggregate_level', 'source']
columns = [column for column in all_sources_latest.data.columns if column not in not_included_columns]


@interact
def show_field_data_sources(field=widgets.Select(options=sorted(columns))):
    """Display the per-source availability report for the selected field."""
    display(data_availability_by_field(all_sources_latest, field))


# Data Availability across all fields for a single data source

In [None]:
# One (label, dataset) option per entry of sources_latest.
select_widget = widgets.Select(
    options=[
        (source_name, LatestValuesDataset(frame))
        for source_name, frame in sources_latest.items()
    ]
)


@interact
def show_provenance_by_source_sources(dataset=select_widget):
    """Return the prevalence report for the dataset chosen in the selector."""
    report = build_prevalence_report(dataset)
    return report

In [None]:
# Sample code to save provenance image
# import imgkit

# html = build_prevalence_report(LatestValuesDataset(sources_latest["CmdcDataSource"])).render()
# imgkit.from_string(html, 'styled_table.png')