In [16]:
import pandas as pd

# Study configuration: `locations[i]` is the country list that goes with the
# inclusive [start, end] collection-date window in `date_ranges[i]`
# (the two lists are zipped together when each chunk is cleaned).
locations = [
    # Countries for the first date window
    [
        "USA",
        "United Kingdom",
        "South Africa",
        "South Korea",
    ],
    # Countries for the second date window
    [
        "Australia",
        "Brazil",
        "Canada",
        "China",
        "France",
        "Germany",
        "Japan",
        "New Zealand",
        "Russia",
        "Singapore",
        "South Korea",
        "Sweden",
        "Thailand",
        "USA",
        "United Kingdom",
        "Ireland",
        "Spain",
        "Taiwan",
    ],
]
date_ranges = [
    ["2021-01-01", "2022-11-01"],
    ["2023-03-01", "2024-06-01"],
]

In [17]:
# Open the gzipped, tab-separated metadata table lazily: because `chunksize`
# is given, pandas hands back an iterator of DataFrames (100000 rows each)
# instead of loading the whole file at once. Only the listed columns are read,
# and the three repetitive text columns are stored as categoricals.
raw_metadata_reader = pd.read_csv(
    filepath_or_buffer="../data/metadata.tsv.gz",
    sep="\t",
    usecols=[
        "strain",
        "date",
        "date_submitted",
        "country",
        "division",
        "Nextstrain_clade",
        "authors",
        "genbank_accession",
        "gisaid_epi_isl",
        "originating_lab",
        "submitting_lab",
    ],
    dtype=dict.fromkeys(("country", "division", "Nextstrain_clade"), "category"),
    chunksize=100_000,
)

In [18]:
def clean_metadata(metadata, dates, locations):
    """Return the rows of ``metadata`` usable for one date window and country set.

    The input frame is not modified; filtering and date parsing happen on copies.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table with at least the columns "country", "date", "date_submitted",
        "Nextstrain_clade" and "strain".
    dates : sequence of length 2
        ``(start_date, end_date)``; both bounds are inclusive and are compared
        against the parsed "date" column.
    locations : list-like
        Values of "country" to keep.

    Returns
    -------
    pandas.DataFrame
        Rows that (1) belong to one of ``locations``, (2) have a complete
        collection date inside the window, (3) have a non-null
        "Nextstrain_clade" and (4) whose strain name does not start with
        "Japan/IC-". "date" and "date_submitted" are converted to datetimes.
    """
    start_date, end_date = dates

    # Filter to countries
    metadata = metadata[metadata["country"].isin(locations)]

    # Remove ambiguous dates: keep only values that are not "?", have exactly
    # two "-" separators and contain no "X" placeholder. Missing dates are
    # rejected as well (na=True marks them as ambiguous).
    unambiguous_dates = (metadata["date"] != "?") & (metadata["date"].str.count("-") == 2)
    unambiguous_dates = unambiguous_dates & ~(metadata["date"].str.contains("X", na=True))
    metadata = metadata[unambiguous_dates].copy()

    # Filter to date range (inclusive on both ends)
    metadata["date"] = pd.to_datetime(metadata["date"])
    metadata["date_submitted"] = pd.to_datetime(metadata["date_submitted"])

    date_since_start_date = (metadata["date"] >= start_date)
    date_before_end_date = (metadata["date"] <= end_date)
    metadata = metadata[(date_since_start_date) & (date_before_end_date)].copy()

    # Filter records without NS clade
    metadata = metadata[~pd.isnull(metadata["Nextstrain_clade"])].copy()

    # Remove airport strain. na=False makes a missing strain name count as
    # "not an airport strain"; without it the mask would contain NaN and the
    # unary ~ below would fail on it instead of producing a boolean mask.
    is_airport_strain = metadata["strain"].str.startswith("Japan/IC-", na=False)
    metadata = metadata[~is_airport_strain]
    return metadata

In [19]:
def clean_chunk(chunk):
    """Clean one raw metadata chunk against every configured study window.

    Each (date window, country list) pair taken position-wise from the
    module-level ``date_ranges`` and ``locations`` is applied to its own copy
    of ``chunk`` via ``clean_metadata``; the per-window results are stacked
    and exact duplicate rows are dropped.
    """
    per_window = [
        clean_metadata(chunk.copy(), window, countries)
        for window, countries in zip(date_ranges, locations)
    ]
    stacked = pd.concat(per_window)
    return stacked.drop_duplicates()

# Drain the chunked reader, cleaning each chunk as it is read.
metadata_chunks = list(map(clean_chunk, raw_metadata_reader))

In [20]:
# Stitch the cleaned chunks into one table; ignore_index discards the
# per-chunk row labels and renumbers the rows from 0.
metadata = pd.concat(objs=metadata_chunks, ignore_index=True)

In [21]:
# Show (n_rows, n_columns) of the combined, cleaned metadata table.
metadata.shape

(7989725, 11)

In [23]:
# Columns written to the acknowledgements table: sequence identifiers
# followed by the author and lab fields.
acknowledgment_cols = [
    "strain",
    "gisaid_epi_isl",
    "genbank_accession",
    "authors",
    "originating_lab",
    "submitting_lab",
]
acknowledgements = metadata.loc[:, acknowledgment_cols]

# Write as an xz-compressed TSV without the row index.
acknowledgements.to_csv(
    "../data/final_acknowledgements_gisaid.tsv.xz",
    sep="\t",
    compression="xz",
    index=False,
)