In [1]:
import pandas as pd
import numpy as np

## Making sequence count data sets from pruned GISAID metadata

The `gisaid_metadata_pruned.tsv` file is generated from the Nextstrain-curated GISAID metadata. Details can be found in `../data/README.md`.

In [2]:
# https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz
# Read the pruned metadata lazily: because `chunksize` is set, `read_csv`
# returns an iterator that yields DataFrames of up to 100,000 rows each
# instead of loading the whole file at once.
raw_metadata_reader = pd.read_csv(
    "../data/gisaid_metadata_pruned.tsv",
    sep="\t",
    usecols=["strain", "date", "date_submitted", "country", "division", "Nextstrain_clade"],
    # Low-cardinality text columns are stored as categoricals
    dtype={column: "category" for column in ("country", "division", "Nextstrain_clade")},
    chunksize=100_000,
)

In [3]:
# Countries of interest
locations = ["USA", "Japan", "United Kingdom", "Australia", "Brazil", "South Africa"]
start_date = "2021-09-01"
end_date = "2022-12-31"
end_forecast_period = "2023-01-31"


# Borrowing from John here
def clean_metadata(metadata):
    """Filter one chunk of metadata down to usable records.

    Steps, in order:
      1. keep rows whose ``country`` is listed in ``locations``;
      2. drop rows whose ``date`` is "?", does not contain exactly two "-",
         contains an "X", or is missing;
      3. parse ``date`` and ``date_submitted`` as datetimes;
      4. keep rows with ``start_date <= date <= end_forecast_period``
         (note: the upper bound is ``end_forecast_period``, not ``end_date``);
      5. drop rows with a null ``Nextstrain_clade``.

    Returns a new DataFrame; the input frame is not modified.
    """
    # Filter to countries
    in_scope = metadata[metadata["country"].isin(locations)]

    # Remove ambiguous dates
    collection = in_scope["date"]
    is_known = collection != "?"
    is_complete = collection.str.count("-") == 2
    is_masked = collection.str.contains("X", na=True)  # missing dates count as masked
    cleaned = in_scope[is_known & is_complete & ~is_masked].copy()

    # Filter to date range
    cleaned["date"] = pd.to_datetime(cleaned["date"])
    cleaned["date_submitted"] = pd.to_datetime(cleaned["date_submitted"])
    in_window = (cleaned["date"] >= start_date) & (cleaned["date"] <= end_forecast_period)
    cleaned = cleaned[in_window].copy()

    # Filter records without NS clade
    has_clade = cleaned["Nextstrain_clade"].notna()

    # NOTE: removal of known-bad sequences is not implemented here; see
    # https://github.com/blab/rt-from-frequency-dynamics/tree/master/data/omicron-countries-split
    return cleaned[has_clade].copy()

In [4]:
# Clean each chunk as it is streamed from the reader; iterating consumes the reader.
metadata_chunks = list(map(clean_metadata, raw_metadata_reader))

In [5]:
# Stack the cleaned chunks into a single frame, discarding the per-chunk
# row labels in favour of a fresh 0..n-1 index.
metadata = pd.concat(metadata_chunks, ignore_index=True)
metadata

Unnamed: 0,strain,date,date_submitted,country,division,Nextstrain_clade
0,AmericanSamoa/AS-CDC-2-5343430/2021,2021-09-16,2022-01-13,USA,American Samoa,21J (Delta)
1,AmericanSamoa/AS-CDC-2-5343436/2021,2021-11-04,2022-01-13,USA,American Samoa,21J (Delta)
2,AmericanSamoa/AS-CDC-2-5343518/2021,2021-09-30,2022-01-13,USA,American Samoa,21J (Delta)
3,AmericanSamoa/AS-CDC-2-5343531/2021,2021-09-30,2022-01-13,USA,American Samoa,21J (Delta)
4,AmericanSamoa/AS-CDC-2-5343532/2021,2021-12-09,2022-01-13,USA,American Samoa,21K (Omicron)
...,...,...,...,...,...,...
6057080,tiger/USA/NE-TIGER_NCOV21-40167/2021,2021-11-07,2021-11-10,USA,Nebraska,21J (Delta)
6057081,tiger/USA/OH-21-028714-001/2021,2021-09-15,2021-11-02,USA,Ohio,21J (Delta)
6057082,tiger/USA/OH-21-038314-003/2021,2021-12-13,2021-12-31,USA,Ohio,21J (Delta)
6057083,tiger/USA/PA-21-037088-003/2021,2021-12-07,2021-12-28,USA,Pennsylvania,21J (Delta)


In [6]:
# Map clade to variant
clade_to_variant = {
    "21A (Delta)": "Delta",
    "21I (Delta)": "Delta",
    "21J (Delta)": "Delta",
    "21K (Omicron)": "Omicron 21K",
    "21L (Omicron)": "Omicron 21L",
    "22A (Omicron)": "Omicron 22A",
    "22B (Omicron)": "Omicron 22B",
    "22C (Omicron)": "Omicron 22C",
    "22D (Omicron)": "Omicron 22D",
    "22E (Omicron)": "Omicron 22E",
    "23A (Omicron)": "Omicron 23A",
}

# Clades without an entry in the mapping come back as missing from `.map`
# and are pooled under the label "other".
metadata["variant"] = metadata["Nextstrain_clade"].map(clade_to_variant).fillna("other")
metadata

Unnamed: 0,strain,date,date_submitted,country,division,Nextstrain_clade,variant
0,AmericanSamoa/AS-CDC-2-5343430/2021,2021-09-16,2022-01-13,USA,American Samoa,21J (Delta),Delta
1,AmericanSamoa/AS-CDC-2-5343436/2021,2021-11-04,2022-01-13,USA,American Samoa,21J (Delta),Delta
2,AmericanSamoa/AS-CDC-2-5343518/2021,2021-09-30,2022-01-13,USA,American Samoa,21J (Delta),Delta
3,AmericanSamoa/AS-CDC-2-5343531/2021,2021-09-30,2022-01-13,USA,American Samoa,21J (Delta),Delta
4,AmericanSamoa/AS-CDC-2-5343532/2021,2021-12-09,2022-01-13,USA,American Samoa,21K (Omicron),Omicron 21K
...,...,...,...,...,...,...,...
6057080,tiger/USA/NE-TIGER_NCOV21-40167/2021,2021-11-07,2021-11-10,USA,Nebraska,21J (Delta),Delta
6057081,tiger/USA/OH-21-028714-001/2021,2021-09-15,2021-11-02,USA,Ohio,21J (Delta),Delta
6057082,tiger/USA/OH-21-038314-003/2021,2021-12-13,2021-12-31,USA,Ohio,21J (Delta),Delta
6057083,tiger/USA/PA-21-037088-003/2021,2021-12-07,2021-12-28,USA,Pennsylvania,21J (Delta),Delta


## Creating data sets as of given observation dates

In [7]:
def count_sequences_with_submission_date(metadata):
    """Count sequences per (date, country, variant, date_submitted).

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must contain the columns ``date``, ``country``, ``variant`` and
        ``date_submitted``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``location`` (renamed from ``country``),
        ``variant``, ``date_submitted`` and ``sequences`` (row count of each
        combination), sorted by those keys in that order, with a fresh
        0..n-1 index.
    """
    keys = ["date", "country", "variant", "date_submitted"]
    out = (metadata
           # One vectorised pass instead of a Python callback per group.
           # observed=True keeps only combinations that actually occur when a
           # key column is categorical, so no zero-count rows are produced.
           .groupby(keys, observed=True)
           .size()
           .reset_index(name="sequences")
           .rename(columns={"country": "location"}))
    return out

# Tabulate sequence counts for the cleaned metadata, keeping the submission
# date alongside each count.
delayed = count_sequences_with_submission_date(metadata)
delayed

Unnamed: 0,date,location,variant,date_submitted,sequences
0,2021-09-01,Australia,Delta,2021-09-03,1
1,2021-09-01,Australia,Delta,2021-09-09,91
2,2021-09-01,Australia,Delta,2021-09-13,7
3,2021-09-01,Australia,Delta,2021-09-15,26
4,2021-09-01,Australia,Delta,2021-09-16,1
...,...,...,...,...,...
202519,2023-01-31,United Kingdom,other,2023-02-20,3
202520,2023-01-31,United Kingdom,other,2023-02-22,3
202521,2023-01-31,United Kingdom,other,2023-02-23,4
202522,2023-01-31,United Kingdom,other,2023-02-24,1


In [8]:
# Write the delayed counts table as a tab-separated file, without the index.
delayed.to_csv("./seq_counts_delayed.tsv", sep="\t", index=False)

In [9]:
def observe_sequence_counts(delayed, obs_date=None):
    """Reconstruct the sequence counts visible as of an observation date.

    Parameters
    ----------
    delayed : pandas.DataFrame
        Counts with columns ``date``, ``location``, ``variant``,
        ``date_submitted`` and ``sequences``.
    obs_date : optional
        When truthy, only rows with ``date_submitted`` strictly earlier than
        ``obs_date`` are counted. When falsy, every row is counted.
        NOTE(review): an earlier comment here said "on or before", but the
        comparison is strict (``<``) -- confirm which is intended.

    Returns
    -------
    pandas.DataFrame
        ``sequences`` summed per (``date``, ``location``, ``variant``),
        sorted by location, variant, date, with non-positive totals removed.
        The input frame is left untouched.
    """
    visible = delayed.copy()

    # Keep only what had been submitted before the observation date
    if obs_date:
        submitted_in_time = visible["date_submitted"] < obs_date
        visible = visible[submitted_in_time]

    # Collapse the submission-date dimension
    totals = visible.groupby(["date", "location", "variant"])["sequences"].sum()
    counts = totals.reset_index().sort_values(["location", "variant", "date"])

    # Remove entries with no observed sequences
    return counts[counts["sequences"] > 0]

In [10]:
# Testing observation dates
obs_seq = observe_sequence_counts(delayed, obs_date="2022-06-30")
obs_seq.date.max()

Timestamp('2022-06-26 00:00:00')

In [11]:
def make_interval_observation_dates(obs_interval=30):
    """Return evenly spaced observation dates between ``start_date`` and ``end_date``.

    The first date is 104 (90 + 14) days after ``start_date``; later dates
    follow every ``obs_interval`` days, and ``end_date`` itself is appended
    when the spacing does not land on it. Dates are "YYYY-MM-DD" strings.

    This function previously shared its name with ``make_observation_dates``
    below and was silently shadowed by it; it has a distinct name so both
    remain callable.
    """
    first_day = pd.to_datetime(start_date)
    total_interval = (pd.to_datetime(end_date) - first_day).days
    ndays = np.arange(90 + 14, total_interval, obs_interval).astype(int)
    if total_interval not in ndays:
        ndays = np.append(ndays, total_interval)

    return [
        (first_day + pd.Timedelta(days=int(nday))).strftime("%Y-%m-%d")
        for nday in ndays
    ]


def make_observation_dates():
    """Return the first day of every month of 2022 as "YYYY-MM-DD" strings."""
    return [f"2022-{month:02d}-01" for month in range(1, 13)]
    

In [12]:
# Observation dates for which time-stamped data sets are produced
obs_dates = make_observation_dates()
obs_dates

['2022-01-01',
 '2022-02-01',
 '2022-03-01',
 '2022-04-01',
 '2022-05-01',
 '2022-06-01',
 '2022-07-01',
 '2022-08-01',
 '2022-09-01',
 '2022-10-01',
 '2022-11-01',
 '2022-12-01']

In [13]:
# "https://covid.ourworldindata.org/data/owid-covid-data.csv"
# Build the case table in a single non-mutating chain. The previous
# `raw_cases["location"].replace(..., inplace=True)` was a chained in-place
# call on a column selection; under pandas Copy-on-Write that does not update
# the parent frame, which would leave "United States" unmapped and drop it in
# the location filter.
raw_cases = (
    pd.read_csv(
        "https://covid.ourworldindata.org/data/owid-covid-data.csv",
        # Only these three columns are used downstream
        usecols=["date", "location", "new_cases"],
    )
    # Map United States to USA so it matches the names in `locations`
    .assign(location=lambda cases: cases["location"].replace({"United States": "USA"}))
    .loc[lambda cases: cases["location"].isin(locations), ["date", "location", "new_cases"]]
    .rename(columns={"new_cases": "cases"})
    # Days without a reported value are treated as zero cases
    .assign(cases=lambda cases: cases["cases"].fillna(0))
)

In [14]:
import os

# Now we just loop over observation dates.
# Case dates are stored as text; parse them once here rather than on every
# iteration, and use the parsed values for both window bounds.
case_dates = pd.to_datetime(raw_cases["date"])

for date in obs_dates:
    obs_ts = pd.to_datetime(date)
    # Window start: 104 (90 + 14) days before the observation date (exclusive)
    min_date = obs_ts - pd.Timedelta(90 + 14, "d")
    # Window end for sequences: the most recent 14 days are removed due to bias
    max_date = obs_ts - pd.Timedelta(14, "d")

    # Observe sequences up to this date, then restrict to the window
    obs_seq = observe_sequence_counts(delayed, obs_date=date)
    obs_seq = obs_seq[(obs_seq["date"] > min_date) & (obs_seq["date"] <= max_date)]

    # Cases from the window start up to and including the observation date
    obs_cases = raw_cases[(case_dates > min_date) & (case_dates <= obs_ts)]

    # Make sure we have the folder (no separate existence check needed)
    path = "../data/time_stamped/" + date
    os.makedirs(path, exist_ok=True)

    # Export files
    obs_seq.to_csv(f"{path}/seq_counts_{date}.tsv", sep="\t", index=False)
    obs_cases.to_csv(f"{path}/case_counts_{date}.tsv", sep="\t", index=False)

In [15]:
# Get final truth sets: every sequence regardless of submission date, and the
# full case table.
obs_seq = observe_sequence_counts(delayed, obs_date=None)
obs_cases = raw_cases

# Make sure we have the folder; exist_ok avoids a separate check-then-create step
path = "../data/time_stamped/truth"
os.makedirs(path, exist_ok=True)

obs_seq.to_csv(f"{path}/seq_counts_truth.tsv", sep="\t", index=False)
obs_cases.to_csv(f"{path}/case_counts_truth.tsv", sep="\t", index=False)