# Exploring COVID-19 time series data from Johns Hopkins University

This notebook analyzes the data in the [CSSEGISandData/COVID-19](https://github.com/CSSEGISandData/COVID-19) GitHub repo. It assumes you have cloned the repo to the local path `~/github/CSSEGISandData/COVID-19`.

In [1]:
import datetime
import os
import pickle

import pandas as pd
import torch

In [2]:
# Directory holding the JHU time-series CSVs inside the local clone of the repo.
dirname = os.path.expanduser(
    "~/github/CSSEGISandData/COVID-19/csse_covid_19_data/csse_covid_19_time_series"
)


def read_csv(basename):
    """Read one CSV file from ``dirname`` into a DataFrame.

    Args:
        basename: File name, joined onto ``dirname`` with ``os.path.join``.

    Returns:
        The ``pandas.DataFrame`` parsed with the first row as the header.
    """
    csv_path = os.path.join(dirname, basename)
    return pd.read_csv(csv_path, header=0)
# Load the four JHU time-series tables (US / global, confirmed cases / deaths),
# reading them in the same order as the names on the left-hand side.
us_cases_df, us_deaths_df, global_cases_df, global_deaths_df = [
    read_csv(basename)
    for basename in (
        "time_series_covid19_confirmed_US.csv",
        "time_series_covid19_deaths_US.csv",
        "time_series_covid19_confirmed_global.csv",
        "time_series_covid19_deaths_global.csv",
    )
]

In [3]:
# Preview the US confirmed-cases table read from time_series_covid19_confirmed_US.csv.
us_cases_df

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,3/16/21,3/17/21,3/18/21,3/19/21,3/20/21,3/21/21,3/22/21,3/23/21,3/24/21,3/25/21
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,6474,6483,6495,6498,6510,6513,6517,6525,6533,6540
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,20227,20263,20287,20317,20329,20347,20361,20354,20395,20417
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,2198,2199,2202,2206,2212,2212,2213,2213,2216,2218
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,2508,2512,2519,2521,2528,2529,2529,2530,2535,2534
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,6361,6371,6376,6380,6382,6383,6387,6388,6402,6408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,3470,3484,3495,3510,3510,3510,3530,3532,3547,3549
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,2094,2101,2101,2103,2103,2103,2109,2111,2113,2115
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,889,889,889,889,889,889,889,889,890,890


In [4]:
# Preview the US deaths table read from time_series_covid19_deaths_US.csv.
us_deaths_df

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,3/16/21,3/17/21,3/18/21,3/19/21,3/20/21,3/21/21,3/22/21,3/23/21,3/24/21,3/25/21
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,95,96,98,98,98,98,98,99,99,99
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,294,295,296,296,296,296,296,297,300,300
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,53,54,54,54,54,54,54,54,54,54
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,58,58,58,58,58,58,58,58,58,58
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,129,129,130,130,130,130,130,130,131,131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,9,9,9,9,9,9,9,9,9,9
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,12,12,12,12,12,12,12,12,12,12
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,26,26,26,26,26,26,26,26,26,26


In [5]:
# Preview the global confirmed-cases table read from time_series_covid19_confirmed_global.csv.
global_cases_df

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/16/21,3/17/21,3/18/21,3/19/21,3/20/21,3/21/21,3/22/21,3/23/21,3/24/21,3/25/21
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,55995,56016,56044,56069,56093,56103,56153,56177,56192,56226
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,118492,118938,119528,120022,120541,121200,121544,121847,122295,122767
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,115540,115688,115842,115970,116066,116157,116255,116349,116438,116543
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,11319,11360,11393,11431,11481,11517,11545,11591,11638,11687
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,21446,21489,21558,21642,21696,21733,21757,21774,21836,21914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,,Vietnam,14.058324,108.277199,0,2,2,2,2,2,...,2560,2567,2570,2571,2572,2572,2575,2575,2576,2579
270,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,213791,215984,218061,219912,221391,223638,225976,228044,230076,232038
271,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2969,3037,3126,3217,3278,3418,3516,3612,3703,3816
272,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,85240,85502,85889,86059,86273,86449,86535,86779,86993,87318


In [6]:
# Preview the global deaths table read from time_series_covid19_deaths_global.csv.
global_deaths_df

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/16/21,3/17/21,3/18/21,3/19/21,3/20/21,3/21/21,3/22/21,3/23/21,3/24/21,3/25/21
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,2460,2460,2462,2462,2462,2463,2464,2466,2466,2467
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,2077,2092,2106,2122,2133,2137,2145,2156,2171,2184
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,3045,3048,3051,3053,3055,3057,3061,3066,3069,3071
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,113,113,113,113,113,113,113,113,114,114
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,522,522,522,524,526,527,530,530,532,532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,,Vietnam,14.058324,108.277199,0,0,0,0,0,0,...,35,35,35,35,35,35,35,35,35,35
270,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,2314,2343,2358,2379,2406,2427,2458,2478,2501,2521
271,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,707,713,723,733,737,751,771,785,800,810
272,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,1167,1170,1175,1178,1178,1179,1182,1185,1187,1191


In [7]:
# Print the column labels of each table to locate where the date columns begin.
# For the two US tables the first six identifier columns are skipped.
for frame, start in (
    (us_cases_df, 6),
    (us_deaths_df, 6),
    (global_cases_df, 0),
    (global_deaths_df, 0),
):
    print(frame.columns[start:])

Index(['Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key',
       '1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20',
       ...
       '3/16/21', '3/17/21', '3/18/21', '3/19/21', '3/20/21', '3/21/21',
       '3/22/21', '3/23/21', '3/24/21', '3/25/21'],
      dtype='object', length=434)
Index(['Province_State', 'Country_Region', 'Lat', 'Long_', 'Combined_Key',
       'Population', '1/22/20', '1/23/20', '1/24/20', '1/25/20',
       ...
       '3/16/21', '3/17/21', '3/18/21', '3/19/21', '3/20/21', '3/21/21',
       '3/22/21', '3/23/21', '3/24/21', '3/25/21'],
      dtype='object', length=435)
Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '3/16/21', '3/17/21', '3/18/21', '3/19/21', '3/20/21', '3/21/21',
       '3/22/21', '3/23/21', '3/24/21', '3/25/21'],
      dtype='object', length=433)
Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
  

In [8]:
def to_torch(df, first_column):
    """Convert the trailing columns of a DataFrame into a float32 tensor.

    Args:
        df: DataFrame whose columns from ``first_column`` onward are numeric.
        first_column: Zero-based position of the first column to keep.

    Returns:
        A ``torch.float32`` tensor of shape ``(len(df), n_columns - first_column)``.
    """
    # Slice by position rather than by label: selecting with the label list
    # ``df.columns[first_column:]`` repeats columns whenever labels are duplicated.
    values = df.iloc[:, first_column:].to_numpy()
    return torch.from_numpy(values).float()

# Stack US rows on top of global rows into one (locations x days) tensor.
# Date columns start at position 11 in the US cases table (right after
# Combined_Key) and at position 4 in the global table (right after Long).
case_data = torch.cat(
    [
        to_torch(us_cases_df, first_column=11),
        to_torch(global_cases_df, first_column=4),
    ],
    dim=0,
)
print(case_data.shape)
case_data

torch.Size([3614, 429])


tensor([[    0.,     0.,     0.,  ...,  6525.,  6533.,  6540.],
        [    0.,     0.,     0.,  ..., 20354., 20395., 20417.],
        [    0.,     0.,     0.,  ...,  2213.,  2216.,  2218.],
        ...,
        [    0.,     0.,     0.,  ...,  3612.,  3703.,  3816.],
        [    0.,     0.,     0.,  ..., 86779., 86993., 87318.],
        [    0.,     0.,     0.,  ..., 36717., 36749., 36778.]])

In [9]:
# Same stacking for deaths. The US deaths table carries an extra Population
# column before the dates, so its date columns start at position 12, not 11.
death_data = torch.cat(
    [
        to_torch(us_deaths_df, first_column=12),
        to_torch(global_deaths_df, first_column=4),
    ],
    dim=0,
)
print(death_data.shape)
death_data

torch.Size([3614, 429])


tensor([[   0.,    0.,    0.,  ...,   99.,   99.,   99.],
        [   0.,    0.,    0.,  ...,  297.,  300.,  300.],
        [   0.,    0.,    0.,  ...,   54.,   54.,   54.],
        ...,
        [   0.,    0.,    0.,  ...,  785.,  800.,  810.],
        [   0.,    0.,    0.,  ..., 1185., 1187., 1191.],
        [   0.,    0.,    0.,  ..., 1516., 1516., 1518.]])

In [10]:
# Load the GISAID summary statistics from the pickled results file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a stats file that you generated yourself or otherwise trust.
# The context manager closes the file handle once loading is done.
with open("results/gisaid.stats.pkl", "rb") as f:
    gisaid_stats = pickle.load(f)
print(gisaid_stats.keys())

dict_keys(['date', 'location', 'lineage'])


In [11]:
# List the 20 most frequent GISAID location strings as "<count><TAB><location>".
for name, count in gisaid_stats["location"].most_common(20):
    print(count, name, sep="\t")

257675	Europe / United Kingdom / England
30710	Europe / United Kingdom / Wales
26314	Europe / United Kingdom / Scotland
25759	North America / USA / Texas / Houston
21564	Europe / Denmark / Hovedstaden
20596	Asia / Japan
13662	Oceania / Australia / Victoria
10362	North America / USA / California
9231	North America / USA / Florida
9032	North America / USA / Michigan
8462	Europe / Denmark / Syddanmark
8019	North America / Canada / Ontario
7534	Europe / Denmark / Midtjylland
6956	North America / USA / New York / New York City
6899	Europe / Germany / North Rhine-Westphalia
6886	North America / USA / Utah
6713	Europe / Denmark / Sjaelland
5861	North America / USA / Arizona
5242	North America / USA / Washington
5215	North America / USA / Massachusetts


In [12]:
# Normalize each GISAID location string to a lowercase tuple of its "/"-separated
# parts, dropping the first part (the continent in the listing above), and
# discard keys that are left with no parts at all.
gisaid_locations = {
    location
    for location in (
        tuple(piece.strip() for piece in raw_key.lower().split("/")[1:])
        for raw_key in gisaid_stats["location"]
    )
    if location
}
print(len(gisaid_locations))
print(sorted(gisaid_locations)[:20])

7903
[('a',), ('afghanistan',), ('albania',), ('albania', 'moerfelden-walldorf'), ('algeria', 'adrar'), ('algeria', 'alger'), ('algeria', 'blida'), ('algeria', 'bordj-bou-arreridj'), ('algeria', 'boufarik'), ('algeria', 'bouira'), ('algeria', 'el oued'), ('algeria', 'laghouat'), ('algeria', 'ouargla'), ('algeria', 'sétif'), ('algeria', 'tipaza'), ('algeria', 'tizi-ouzou'), ('andorra',), ('angola', 'luanda'), ('antigua and barbuda',), ('argentina',)]


In [13]:
# Build one lowercase location tuple per JHU row:
#   US table     -> (country, state, county), or (country, state) without a county
#   global table -> (country, province), or (country,) without a province
# Missing entries are not strings (pandas reads them as NaN), hence the isinstance checks.
jhu_locations = set()

us_keys = us_cases_df[["Country_Region", "Province_State", "Admin2"]]
for country, state, county in us_keys.itertuples(index=False, name=None):
    key = (country.lower(), state.lower())
    if isinstance(county, str):
        key += (county.lower(),)
    jhu_locations.add(key)

global_keys = global_cases_df[["Country/Region", "Province/State"]]
for country, province in global_keys.itertuples(index=False, name=None):
    key = (country.lower(),)
    if isinstance(province, str):
        key += (province.lower(),)
    jhu_locations.add(key)

# Every row must have produced a distinct key; a collision would shrink the set.
assert len(jhu_locations) == len(us_cases_df) + len(global_cases_df)
print(len(jhu_locations))
print(sorted(jhu_locations)[:20])

3614
[('afghanistan',), ('albania',), ('algeria',), ('andorra',), ('angola',), ('antigua and barbuda',), ('argentina',), ('armenia',), ('australia', 'australian capital territory'), ('australia', 'new south wales'), ('australia', 'northern territory'), ('australia', 'queensland'), ('australia', 'south australia'), ('australia', 'tasmania'), ('australia', 'victoria'), ('australia', 'western australia'), ('austria',), ('azerbaijan',), ('bahamas',), ('bahrain',)]


In [14]:
# Count exact tuple matches between the two sources, then the locations that
# appear only in GISAID and only in JHU.
print(len(gisaid_locations.intersection(jhu_locations)))
print(len(gisaid_locations.difference(jhu_locations)))
print(len(jhu_locations.difference(gisaid_locations)))

131
7772
3483


In [15]:
# Compare at country level: keep only the first element of every location tuple.
# `gc` holds the GISAID country names and `jc` the JHU ones (`gc` is a plain set
# here, unrelated to the standard-library module of the same name).
gc = set(location[0] for location in gisaid_locations)
jc = set(location[0] for location in jhu_locations)
# GISAID country names that have no identically spelled JHU country.
print(gc.difference(jc))

{'myanmar', 'saint barthelemy', 'martinique', 'palestine', 'sint maarten', 'hong kong', 'bermuda', 'cayman islands', 'gibraltar', 'democratic republic of the congo', 'union of the comoros', 'saint martin', 'sint eustatius', 'reunion', 'england', 'french guiana', 'faroe islands', 'usa', 'united states', 'a', 'guam', 'réunion', 'mayotte', 'la reunion', 'republic of the congo', 'belgique', 'taiwan', 'cote divoire', 'guadeloupe', 'bonaire', 'crimea', 'czech republic', 'trinidad', 'romaina', 'south korea', 'aruba', 'curacao', 'british virgin islands'}


In [18]:
# Find how JHU spells the Comoros (GISAID uses 'union of the comoros').
{location for location in jhu_locations if any("comor" in part for part in location)}

{('comoros',)}

In [19]:
from pyrocov.geo import GISAID_TO_JHU

# Every GISAID-only country name must have an entry in GISAID_TO_JHU, and each
# entry must be either None or a tuple.
# NOTE(review): the saved run of this cell stopped with KeyError: 'a'. That key
# matches the junk location ('a',) present in gisaid_locations; decide whether
# to filter it out upstream or to add it to the mapping.
for country in gc.difference(jc):
    c = GISAID_TO_JHU[country]
    if c is not None:
        assert isinstance(c, tuple), c

KeyError: 'a'

## Joining with population data from the UN

`WPP2019_TotalPopulationBySex.csv` was downloaded from https://population.un.org/wpp/Download/Standard/CSV/

In [None]:
# Load the UN population table and list its columns.
df = pd.read_csv(
    "data/WPP2019_TotalPopulationBySex.csv",
    header=0,  # the first row holds the column names
)
df.columns

In [None]:
# Keep only the rows for year 2020 of the "High" variant.
is_2020 = df["Time"] == 2020
is_high_variant = df["Variant"] == "High"
df = df[is_2020 & is_high_variant]

In [None]:
# Lowercased UN location names, then the JHU countries with no identical UN name.
uc = {location.lower() for location in df["Location"].tolist()}
jc.difference(uc)

In [None]:
# Check whether any UN location name contains "myanmar".
{location for location in uc if "myanmar" in location}

In [None]:
from pyrocov.geo import JHU_TO_UN

In [None]:
# Each JHU country that is not a UN location name must be listed in JHU_TO_UN;
# a non-None mapping must itself be a UN location name.
for c in jc:
    if c in uc:
        continue  # already matches a UN name exactly
    c2 = JHU_TO_UN[c]
    if c2 is None:
        continue  # mapped to None, so there is nothing to check
    assert c2 in uc, (c, c2)