In [1]:
import pandas as pd

## Making sequence count data sets from pruned GISAID metadata

The `gisaid_metadata_pruned.tsv` file is generated from the Nextstrain-curated GISAID metadata. Details on how it is produced can be found in `../data/README.md`.

In [4]:
# https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz
# Both metadata files are read with identical settings, so the settings live in one place.
METADATA_COLUMNS = (
    "strain",
    "date",
    "date_submitted",
    "country",
    "division",
    "Nextstrain_clade",
    "QC_overall_status",
)
METADATA_DTYPES = {
    "country": "category",
    "division": "category",
    "Nextstrain_clade": "category",
}
METADATA_CHUNKSIZE = 100000


def open_metadata_reader(path):
    """Open a tab-separated metadata file as a chunked pandas reader.

    Only the columns in ``METADATA_COLUMNS`` are loaded. Because ``chunksize``
    is set, ``pd.read_csv`` returns an iterator of DataFrames rather than a
    single DataFrame.
    """
    return pd.read_csv(
        path,
        sep="\t",
        usecols=METADATA_COLUMNS,
        dtype=METADATA_DTYPES,
        chunksize=METADATA_CHUNKSIZE,
    )


raw_metadata_reader = open_metadata_reader("../data/gisaid_metadata_pruned.tsv")
raw_metadata_reader_past = open_metadata_reader("../data/metadata_2022-10-12.tsv.gz")

In [5]:
# Countries of interest
locations = ["USA", "Japan", "United Kingdom", "Australia", "Brazil", "South Africa"]
# NOTE: "Hong Kong" was previously misspelled "Hong King", which matches no rows in the
# `isin(locations)` filters and therefore silently dropped Hong Kong from every output.
locations += ["Egypt", "Panama", "Trinidad and Tobago", "Costa Rica", "Hong Kong", "Vietnam"]

# Earliest collection date kept by clean_metadata
start_date = "2021-09-01"
# Not referenced by the code visible in this notebook section
end_date = "2022-12-31"
# Latest collection date kept by clean_metadata
end_forecast_period = "2023-01-31"

# Borrowing from John here
def clean_metadata(metadata):
    """Filter one chunk of sequence metadata down to the records used downstream.

    Reads the module-level settings ``locations``, ``start_date`` and
    ``end_forecast_period``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Metadata chunk with at least the columns ``country``, ``date``,
        ``date_submitted``, ``Nextstrain_clade`` and ``strain``.

    Returns
    -------
    pandas.DataFrame
        Rows whose country is in ``locations``, whose collection date is a
        complete date between ``start_date`` and ``end_forecast_period``
        (inclusive), whose clade is not null, and whose strain name does not
        start with ``"Japan/IC-"``. ``date`` and ``date_submitted`` are
        converted to datetimes.
    """
    # Filter to countries
    metadata = metadata[metadata["country"].isin(locations)]

    # Remove ambiguous dates: keep only values with two "-" separators that are
    # neither "?" nor contain an "X" placeholder. Missing dates are dropped too
    # (na=True marks them as ambiguous).
    date_text = metadata["date"]
    has_full_date = (date_text != "?") & (date_text.str.count("-") == 2)
    has_placeholder = date_text.str.contains("X", na=True)
    metadata = metadata[has_full_date & ~has_placeholder].copy()

    # Filter to date range
    metadata["date"] = pd.to_datetime(metadata["date"])
    metadata["date_submitted"] = pd.to_datetime(metadata["date_submitted"])

    date_since_start_date = metadata["date"] >= start_date
    date_before_end_date = metadata["date"] <= end_forecast_period
    metadata = metadata[date_since_start_date & date_before_end_date].copy()

    # Filter records without NS clade
    metadata = metadata[metadata["Nextstrain_clade"].notna()].copy()

    # Remove airport strain.
    # na=False treats a missing strain name as "not an airport strain"; without it
    # the mask holds NaN for such rows and the `~` below raises a TypeError.
    is_airport_strain = metadata["strain"].str.startswith("Japan/IC-", na=False)
    return metadata[~is_airport_strain]

# Map clade to variant: every Delta clade collapses into a single "Delta" label,
# while each Omicron clade keeps its own label.
clade_to_variant = dict(
    [
        ("21A (Delta)", "Delta"),
        ("21I (Delta)", "Delta"),
        ("21J (Delta)", "Delta"),
        ("21K (Omicron)", "Omicron 21K"),
        ("21L (Omicron)", "Omicron 21L"),
        ("22A (Omicron)", "Omicron 22A"),
        ("22B (Omicron)", "Omicron 22B"),
        ("22C (Omicron)", "Omicron 22C"),
        ("22D (Omicron)", "Omicron 22D"),
        ("22E (Omicron)", "Omicron 22E"),
        ("23A (Omicron)", "Omicron 23A"),
    ]
)

def prep_metadata(raw_metadata_reader):
    """Clean every chunk from a metadata reader and attach a ``variant`` column.

    Parameters
    ----------
    raw_metadata_reader : iterable of pandas.DataFrame
        Chunks of raw metadata; it is iterated once from start to finish.

    Returns
    -------
    pandas.DataFrame
        All cleaned chunks concatenated with a fresh index, plus a ``variant``
        column looked up from ``clade_to_variant``; clades missing from that
        mapping are labelled ``"other"``.
    """
    cleaned_chunks = []
    for chunk in raw_metadata_reader:
        cleaned_chunks.append(clean_metadata(chunk))
    combined = pd.concat(cleaned_chunks, ignore_index=True)

    variant_labels = combined["Nextstrain_clade"].map(clade_to_variant)
    combined["variant"] = variant_labels.fillna("other")
    return combined

# Clean and label both metadata files. prep_metadata iterates over every chunk
# of the reader it is given.
metadata = prep_metadata(raw_metadata_reader)
metadata_past = prep_metadata(raw_metadata_reader_past)

## Creating data sets as of given observation dates

In [6]:
def count_sequences_with_submission_date(metadata):
    """Count sequences per collection date, country, variant and submission date.

    Parameters
    ----------
    metadata : pandas.DataFrame
        One row per sequence with the columns ``date``, ``country``,
        ``variant`` and ``date_submitted``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``location``, ``variant``, ``date_submitted`` and
        ``sequences``, where ``sequences`` is the number of input rows sharing
        the other four values. Rows are sorted by those four keys and carry a
        fresh integer index. Empty input yields an empty result.
    """
    group_keys = ["date", "country", "variant", "date_submitted"]

    # A single grouped size() replaces the per-group apply/value_counts: it gives
    # the same counts in the same order without relying on the auto-generated
    # "level_3" column name, and is much faster on large inputs.
    # observed=True keeps only key combinations that actually occur when any of
    # the keys is categorical.
    counts = (
        metadata
        .groupby(group_keys, observed=True)
        .size()
        .reset_index(name="sequences")
        .rename(columns={"country": "location"})
    )
    return counts

delayed = count_sequences_with_submission_date(metadata)
delayed_past = count_sequences_with_submission_date(metadata_past)

In [8]:
def observe_sequence_counts(delayed, obs_date=None):
    """Reconstruct the sequence counts that were available on an observation date.

    Parameters
    ----------
    delayed : pandas.DataFrame
        Counts with the columns ``date``, ``location``, ``variant``,
        ``date_submitted`` and ``sequences``. It is not modified.
    obs_date : str or datetime-like, optional
        Observation date. Only sequences submitted on or before this date are
        counted. When omitted (or falsy), all sequences are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``location``, ``variant`` and ``sequences`` (summed
        over submission dates), sorted by location, variant and date, with
        zero-count rows removed.
    """
    obs_seq = delayed

    # Filter to sequences submitted on or before date.
    # The comparison is inclusive (<=): a sequence submitted on the observation
    # date itself counts as available, matching the inclusive case-count cutoff.
    if obs_date:
        obs_seq = obs_seq[obs_seq["date_submitted"] <= obs_date]

    # Sum across remaining sequences
    obs_seq = (
        obs_seq
        .groupby(["date", "location", "variant"])["sequences"]
        .sum()
        .reset_index()
    )

    # Sort data
    obs_seq = obs_seq.sort_values(["location", "variant", "date"])

    # Remove entries with no observed sequences
    return obs_seq[obs_seq["sequences"] > 0]

In [9]:
# Observation dates ("YYYY-MM-DD" strings); one data snapshot is reconstructed for each
obs_dates = ["2022-09-01", "2022-09-15", "2022-10-01", "2022-10-15", "2022-11-01"]

In [10]:
# "https://covid.ourworldindata.org/data/owid-covid-data.csv"
raw_cases = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")

# Map United States to USA so the names match the sequence metadata.
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form acts on a temporary object and
# leaves the DataFrame unchanged under pandas Copy-on-Write.
raw_cases["location"] = raw_cases["location"].replace({"United States": "USA"})

# Keep only the countries of interest
raw_cases = raw_cases[raw_cases["location"].isin(locations)]

# Keep date/location/new_cases, rename new_cases -> cases and treat missing
# counts as 0. Using assign() avoids writing into a filtered slice.
raw_cases = (
    raw_cases[["date", "location", "new_cases"]]
    .rename(columns={"new_cases": "cases"})
    .assign(cases=lambda cases_df: cases_df["cases"].fillna(0))
)

In [11]:
import os

# Now we just loop over observation dates, writing one snapshot folder per date
for date in obs_dates:

    # Collection-date window for this snapshot:
    # later than (observation date - 104 days) and no later than (observation date - 14 days)
    min_date = pd.to_datetime(date) - pd.Timedelta(days=90 + 14)
    max_date = pd.to_datetime(date) - pd.Timedelta(days=14)

    # Observe sequences up to this date
    obs_seq = observe_sequence_counts(delayed, obs_date=date)
    obs_seq_past = observe_sequence_counts(delayed_past, obs_date=date)

    # Filter to most recent 104 days, and remove most recent 14 days due to bias
    obs_seq = obs_seq[(obs_seq["date"] > min_date) & (obs_seq["date"] <= max_date)]
    obs_seq_past = obs_seq_past[
        (obs_seq_past["date"] > min_date) & (obs_seq_past["date"] <= max_date)
    ]

    # Add cases up to present (the observation date itself is included),
    # starting from the same lower bound as the sequences
    obs_cases = raw_cases[raw_cases["date"] <= date]
    obs_cases = obs_cases[pd.to_datetime(obs_cases["date"]) > min_date]

    # Make sure we have the folder
    path = "../data/real_time_nextclade/" + date
    if not os.path.exists(path):
        os.makedirs(path)

    # Export files
    obs_seq.to_csv(f"{path}/seq_counts_{date}.tsv", sep="\t", index=False)
    obs_seq_past.to_csv(f"{path}/seq_counts_{date}_past.tsv", sep="\t", index=False)
    obs_cases.to_csv(f"{path}/case_counts_{date}.tsv", sep="\t", index=False)

In [12]:
# Get final truth sets: sequence counts with no submission-date cutoff, and all case counts

# Make sure we have the folder
path = "../data/real_time_nextclade/truth"
if not os.path.exists(path):
    os.makedirs(path)

obs_seq = observe_sequence_counts(delayed, obs_date=None)
obs_cases = raw_cases

obs_seq.to_csv(f"{path}/seq_counts_truth.tsv", sep="\t", index=False)
obs_cases.to_csv(f"{path}/case_counts_truth.tsv", sep="\t", index=False)