# Exploring COVID-19 time series data from Johns Hopkins University

This notebook analyzes the data in the [CSSEGISandData/COVID-19](https://github.com/CSSEGISandData/COVID-19) GitHub repo. It assumes you have cloned the repo to the local path `~/github/CSSEGISandData/COVID-19`.

In [None]:
import os
import pandas as pd
import torch
import datetime

In [None]:
# Directory holding the JHU CSSE time-series CSVs inside the local clone of
# the CSSEGISandData/COVID-19 repository.
dirname = os.path.expanduser(
    "~/github/CSSEGISandData/COVID-19/"
    "csse_covid_19_data/csse_covid_19_time_series"
)


def read_csv(basename):
    """Read one time-series CSV from ``dirname`` into a DataFrame.

    Args:
        basename: File name, relative to ``dirname``.

    Returns:
        pandas.DataFrame parsed with the first line of the file as header.
    """
    csv_path = os.path.join(dirname, basename)
    return pd.read_csv(csv_path, header=0)


# Load the four tables in a fixed order: US cases, US deaths, global cases,
# global deaths.
us_cases_df, us_deaths_df, global_cases_df, global_deaths_df = map(
    read_csv,
    (
        "time_series_covid19_confirmed_US.csv",
        "time_series_covid19_deaths_US.csv",
        "time_series_covid19_confirmed_global.csv",
        "time_series_covid19_deaths_global.csv",
    ),
)

In [None]:
# Display the US confirmed-cases table read from
# time_series_covid19_confirmed_US.csv.
us_cases_df

In [None]:
# Display the US deaths table read from time_series_covid19_deaths_US.csv.
us_deaths_df

In [None]:
# Display the global confirmed-cases table read from
# time_series_covid19_confirmed_global.csv.
global_cases_df

In [None]:
# Display the global deaths table read from
# time_series_covid19_deaths_global.csv.
global_deaths_df

In [None]:
# Compare column layouts: the US tables are shown from column position 6
# onward, the global tables in full.
for us_frame in (us_cases_df, us_deaths_df):
    print(us_frame.columns[6:])
for global_frame in (global_cases_df, global_deaths_df):
    print(global_frame.columns)

In [None]:
def to_torch(df, first_column):
    """Convert the trailing columns of ``df`` into a float32 tensor.

    Args:
        df: DataFrame whose columns from position ``first_column`` onward
            hold numeric values.
        first_column: Positional index of the first column to keep.

    Returns:
        torch.Tensor of dtype float32 with one row per row of ``df`` and
        one column per kept column.
    """
    kept_columns = df.columns[first_column:]
    values = df[kept_columns].to_numpy()
    return torch.from_numpy(values).float()

# Skip the leading non-count columns: 11 in the US cases table, 4 in the
# global cases table (presumably everything after them is a per-date count;
# confirm against the column lists printed earlier).
us_case_counts = to_torch(us_cases_df, first_column=11)
global_case_counts = to_torch(global_cases_df, first_column=4)

# Stack the US rows on top of the global rows.
case_data = torch.cat([us_case_counts, global_case_counts])
print(case_data.shape)
case_data

In [None]:
# Skip the leading non-count columns: 12 in the US deaths table (one more
# than the offset used for the US cases table), 4 in the global deaths table.
us_death_counts = to_torch(us_deaths_df, first_column=12)
global_death_counts = to_torch(global_deaths_df, first_column=4)

# Stack the US rows on top of the global rows.
death_data = torch.cat([us_death_counts, global_death_counts])
print(death_data.shape)
death_data

In [None]:
import pickle

# Load the precomputed GISAID statistics.
# NOTE: unpickling can execute arbitrary code, so only load a stats file that
# comes from a trusted source.
# The `with` block closes the file handle as soon as loading finishes instead
# of leaving it open until garbage collection.
with open("results/gisaid.stats.pkl", "rb") as stats_file:
    gisaid_stats = pickle.load(stats_file)
print(gisaid_stats.keys())

In [None]:
# Show the 30 most frequent GISAID location strings, one "count<TAB>name"
# line each.
top_locations = gisaid_stats["location"].most_common(30)
for name, count in top_locations:
    print(f"{count}\t{name}")

In [None]:
# Normalize each GISAID location string into a tuple of lowercase, stripped
# components. The first "/"-separated component is dropped (presumably the
# continent; confirm against the raw strings), and strings with nothing left
# after that are skipped.
gisaid_locations = set()
for key in gisaid_stats["location"]:
    components = key.lower().split("/")[1:]
    location = tuple(part.strip() for part in components)
    if len(location) >= 1:
        gisaid_locations.add(location)
print(len(gisaid_locations))
print(sorted(gisaid_locations)[:20])

In [None]:
# Collect every JHU location as a tuple of lowercase names, ordered from the
# most general level to the most specific.
jhu_locations = set()

# US table: (country, state, county). Non-string Admin2 values (e.g. missing
# entries) yield a two-element (country, state) tuple instead.
us_levels = us_cases_df[["Country_Region", "Province_State", "Admin2"]]
for country, state, county in us_levels.itertuples(index=False, name=None):
    if isinstance(county, str):
        jhu_locations.add((country.lower(), state.lower(), county.lower()))
    else:
        jhu_locations.add((country.lower(), state.lower()))

# Global table: (country, province), or just (country,) when the province is
# not a string.
global_levels = global_cases_df[["Country/Region", "Province/State"]]
for country, province in global_levels.itertuples(index=False, name=None):
    if isinstance(province, str):
        jhu_locations.add((country.lower(), province.lower()))
    else:
        jhu_locations.add((country.lower(),))

# Every row must have produced a distinct tuple.
assert len(jhu_locations) == len(us_cases_df) + len(global_cases_df)
print(len(jhu_locations))
print(sorted(jhu_locations)[:20])

In [None]:
# Report, in order: the number of locations present in both sources, only in
# GISAID, and only in JHU.
shared_locations = gisaid_locations & jhu_locations
gisaid_only_locations = gisaid_locations - jhu_locations
jhu_only_locations = jhu_locations - gisaid_locations
for subset in (shared_locations, gisaid_only_locations, jhu_only_locations):
    print(len(subset))

In [None]:
# First component of every location tuple: `gc` for GISAID, `jc` for JHU.
gc = {first for first, *_rest in gisaid_locations}
jc = {first for first, *_rest in jhu_locations}

# GISAID first-level names that have no exact match in JHU.
gisaid_unmatched = gc - jc
print(gisaid_unmatched)

In [None]:
# Look up how JHU spells any location containing "bosnia" at any level.
{loc for loc in jhu_locations if any("bosnia" in part for part in loc)}

In [None]:
from pyrocov.geo import GISAID_TO_JHU

# Every GISAID-only name is looked up in GISAID_TO_JHU (so a missing entry
# surfaces as a lookup error), and each mapped value must be either None or
# a tuple.
for country in gc.difference(jc):
    mapped = GISAID_TO_JHU[country]
    assert mapped is None or isinstance(mapped, tuple), mapped

## Joining with population data from UN

`WPP2019_TotalPopulationBySex.csv` was downloaded from https://population.un.org/wpp/Download/Standard/CSV/ and is read below from `data/WPP2019_TotalPopulationBySex.csv`.

In [None]:
# Read the UN population table and list its columns.
un_population_path = "data/WPP2019_TotalPopulationBySex.csv"
df = pd.read_csv(un_population_path, header=0)
df.columns

In [None]:
# Keep only the rows for the year 2020 under the "High" variant.
is_2020 = df["Time"] == 2020
is_high_variant = df["Variant"] == "High"
df = df[is_2020 & is_high_variant]

In [None]:
# Lowercased UN location names, then the JHU first-level names that have no
# exact UN match.
un_location_names = df["Location"].to_list()
uc = {location.lower() for location in un_location_names}
jc.difference(uc)

In [None]:
# Look up how the UN table spells any location containing "myanmar".
{location for location in uc if "myanmar" in location}

In [None]:
from pyrocov.geo import JHU_TO_UN

In [None]:
# Each JHU name must either appear among the UN names directly, or map via
# JHU_TO_UN to None or to a name that does appear there.
for country in jc:
    if country in uc:
        continue
    un_name = JHU_TO_UN[country]
    if un_name is None:
        continue
    assert un_name in uc, (country, un_name)