In [1]:
import pandas as pd
import numpy as np

## Exporting acknowledgements for sequences used in analysis

The `gisaid_metadata_pruned.tsv` file is generated from the Nextstrain-curated GISAID metadata. Details on how it is produced can be found in `../data/README.md`.

In [2]:
# https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz
# Open the tab-separated metadata lazily: with `chunksize` set, `read_csv`
# returns an iterator of DataFrames (100,000 rows each) instead of one frame.
# Only the columns named in `usecols` are loaded.
raw_metadata_reader = pd.read_csv(
    "../data/gisaid_metadata.tsv",
    sep="\t",
    usecols=[
        "strain",
        "date",
        "date_submitted",
        "country",
        "division",
        "Nextstrain_clade",
        "authors",
        "genbank_accession",
        "gisaid_epi_isl",
        "originating_lab",
        "submitting_lab",
    ],
    # These three columns are read as pandas categoricals.
    dtype=dict.fromkeys(["country", "division", "Nextstrain_clade"], "category"),
    chunksize=100_000,
)

In [3]:
# Countries of interest
locations = ["USA", "Japan", "United Kingdom", "Australia", "Brazil", "South Africa", "Denmark", "Germany"]
start_date = "2021-09-01"
# NOTE: `end_date` is not referenced by `clean_metadata`; the upper bound it
# applies to collection dates is `end_forecast_period`.
end_date = "2022-12-31"
end_forecast_period = "2023-01-31"


# Borrowing from John here
def clean_metadata(metadata):
    """Filter one metadata DataFrame down to the records kept for the analysis.

    Reads the module-level `locations`, `start_date` and `end_forecast_period`.

    Filters are applied in this order:
      1. keep rows whose "country" is in `locations`;
      2. drop rows whose "date" is missing, is "?", does not contain exactly
         two "-" separators, or contains an "X" placeholder;
      3. parse "date" and "date_submitted" as datetimes and keep rows with
         `start_date <= date <= end_forecast_period`;
      4. drop rows with a null "Nextstrain_clade";
      5. drop rows whose "strain" starts with "Japan/IC-".

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must contain the columns "country", "date", "date_submitted",
        "Nextstrain_clade" and "strain".

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with "date" and "date_submitted" converted to
        datetimes. The original index labels are kept (not reset).
    """
    # Filter to countries
    metadata = metadata[metadata["country"].isin(locations)]

    # Remove ambiguous dates: anything that is not a full YYYY-MM-DD style
    # value (wrong number of "-"), is "?", or has "X" placeholders.
    # Missing dates fail the `count == 2` comparison and are dropped as well.
    unambiguous_dates = (metadata["date"] != "?") & (metadata["date"].str.count("-") == 2)
    unambiguous_dates = unambiguous_dates & ~(metadata["date"].str.contains("X", na=True))
    metadata = metadata[unambiguous_dates].copy()

    # Filter to date range
    metadata["date"] = pd.to_datetime(metadata["date"])
    metadata["date_submitted"] = pd.to_datetime(metadata["date_submitted"])

    date_since_start_date = (metadata["date"] >= start_date)
    date_before_end_date = (metadata["date"] <= end_forecast_period)
    metadata = metadata[(date_since_start_date) & (date_before_end_date)].copy()

    # Filter records without NS clade
    metadata = metadata[~pd.isnull(metadata["Nextstrain_clade"])].copy()

    # Remove airport strain.
    # `na=False` makes the mask strictly boolean: without it, a missing strain
    # name yields NaN in the mask and the `~` negation raises a TypeError on
    # object-dtype columns. Rows with a missing strain name are therefore kept.
    is_airport_strain = metadata["strain"].str.startswith("Japan/IC-", na=False)
    metadata = metadata[~is_airport_strain]
    return metadata

In [4]:
# Run the cleaning filter over every chunk yielded by the reader.
# This consumes `raw_metadata_reader`, so the reader cell must be re-run
# before this cell can be executed again.
metadata_chunks = list(map(clean_metadata, raw_metadata_reader))

In [5]:
# Stack the cleaned chunks into one frame with a fresh 0..n-1 index.
metadata = pd.concat(objs=metadata_chunks, ignore_index=True)

In [6]:
# Generate acknowledgments: keep only the identifier and attribution columns.
acknowledgment_cols = [
    "strain",
    "gisaid_epi_isl",
    "genbank_accession",
    "authors",
    "originating_lab",
    "submitting_lab",
]
acknowledgements = metadata.loc[:, acknowledgment_cols]

In [7]:
# Write the acknowledgements table as a gzip-compressed TSV, without the index.
acknowledgements.to_csv(
    "./final_acknowledgements_gisaid.tsv.gz",
    sep="\t",
    index=False,
    compression="gzip",
)