In [1]:
import argparse
import datetime
import logging
import math
import pickle
import re
from collections import Counter, defaultdict

import pandas as pd
import torch
import tqdm

from pyrocov.geo import get_canonical_location_generator, gisaid_normalize
from pyrocov.mutrans import START_DATE
from pyrocov.sarscov2 import nuc_mutations_to_aa_mutations
from pyrocov.usher import (
    FineToMeso,
    load_mutation_tree,
    load_proto,
    prune_mutation_tree,
    refine_mutation_tree,
)
from pyrocov.util import gzip_open_tqdm

logger = logging.getLogger(__name__)
logging.basicConfig(format="%(relativeCreated) 9d %(message)s", level=logging.INFO)

DATE_FORMATS = {7: "%Y-%m", 10: "%Y-%m-%d"}

In [2]:
parser = argparse.ArgumentParser(description="Preprocess pangolin mutations")
parser.add_argument(
    "--usher-metadata-file-in", default="results/usher/metadata.tsv"
)
parser.add_argument(
    "--nextstrain-metadata-file-in", default="results/nextstrain/metadata.tsv"
)
parser.add_argument("--gisaid-metadata-file-in", default="")
parser.add_argument("--tree-file-in", default="results/usher/all.masked.pb")
parser.add_argument("--tree-file-out", default="results/lineageTree.fine.pb")
parser.add_argument("--stats-file-out", default="results/stats.pkl")
parser.add_argument("--recover-missing-usa-state", action="store_true")
parser.add_argument("-s", "--max-skippage", type=float, default=1e7)
parser.add_argument("-c", "--max-num-clades", default="2000,3000,5000,10000")
parser.add_argument("--start-date", default=START_DATE)
args = parser.parse_args()
args.start_date = try_parse_date(args.start_date)

usage: ipykernel_launcher.py [-h]
                             [--usher-metadata-file-in USHER_METADATA_FILE_IN]
                             [--nextstrain-metadata-file-in NEXTSTRAIN_METADATA_FILE_IN]
                             [--gisaid-metadata-file-in GISAID_METADATA_FILE_IN]
                             [--tree-file-in TREE_FILE_IN]
                             [--tree-file-out TREE_FILE_OUT]
                             [--stats-file-out STATS_FILE_OUT]
                             [--recover-missing-usa-state] [-s MAX_SKIPPAGE]
                             [-c MAX_NUM_CLADES] [--start-date START_DATE]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/kotzen/.local/share/jupyter/runtime/kernel-2cfce288-f99d-4dbc-b49a-11fb6dff9660.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
import gzip

from pyrocov.usher import load_proto
from pyrocov.usher import parsimony_pb2

In [2]:
filename = "results/gisaid/gisaidAndPublic.2021-03-26.masked.pb.gz"
proto, tree = load_proto(filename)

In [3]:
clades = list(tree.find_clades())

In [4]:
print(len(proto.node_mutations))
print(len(proto.metadata))
print(len(clades))

689770
689770
1177


In [15]:
clades[:10]

[Clade(branch_length=0.0),
 Clade(branch_length=1.0),
 Clade(branch_length=1.0, name='England/BRIS-1853249/2020|20-04-02'),
 Clade(branch_length=1.0, name='Chile/RM-ISPCH-50/2020|EPI_ISL_445329|2020-03-23'),
 Clade(branch_length=1.0, name='Ireland/CO-20G33632/2020|EPI_ISL_848106|2020-03-18'),
 Clade(branch_length=0.0, name='node_2_condensed_6_leaves'),
 Clade(branch_length=1.0),
 Clade(branch_length=1.0, name='Wales/PHWC-25B04/2020|20-03-24'),
 Clade(branch_length=1.0, name='Wales/PHWC-2414F/2020|20-03-16'),
 Clade(branch_length=1.0)]

In [13]:
open_ = gzip.open if filename.endswith(".gz") else open
with open_(filename, "rb") as f:
    proto = parsimony_pb2.data.FromString(f.read())  # type: ignore

In [41]:
mutations=list(proto.node_mutations)
metadata = list(proto.metadata)
type(metadata[0])

parsimony_pb2.node_metadata

In [18]:
len(clades)

1177