# Simplified versions of subplots for XA, XN/XAU, and XZ/XAC/XAD/XAE/XAP

These are used to produce the node positions and topology for Fig. 2. See supp_pdf-subgraphs-PangoX.ipynb for subgraphs of all the Pango X lineages.

In [1]:
import sc2ts.info
import nb_utils
import numpy as np
from IPython.display import HTML
import tszip


# Load the tszip-compressed tree sequence used throughout this notebook
TREES_PATH = "../data/sc2ts_viridian_v1.1.trees.tsz"
ts = tszip.load(TREES_PATH)

# Per-node data table for the tree sequence, re-indexed by sample ID
df = sc2ts.node_data(ts).set_index("sample_id")

# Build the visualizer from the ARG and its node data (this can take a few minutes)
arg = nb_utils.D3ARG_viz(ts, df, pangolin_field="pango")

Edges:   0%|          | 0/2745566 [00:00<?, ?it/s]

Sites:   0%|          | 0/29893 [00:00<?, ?it/s]

Nodes:   0%|          | 0/2744713 [00:00<?, ?it/s]

In [2]:
arg.set_sc2ts_node_labels()

# Hand-written labels for specific nodes, applied on top of the default labels
origin_labels = {
    200039: "DELTA-origin",
    822854: "BA.2-origin",
    1189192: "BA.5-origin",
}
nodes = arg.d3arg.nodes
for node_id, label in origin_labels.items():
    nodes.loc[nodes.id == node_id, 'label'] = label

arg.set_sc2ts_node_styles()

Setting all labels:   0%|          | 0/2744713 [00:00<?, ?it/s]

In [3]:
import collections

def num_sample_desc(local_root, subtract=None, name=None):
    """
    Print and return the number of sample descendants of a node, per Pango lineage.

    A sample is counted once if it descends from ``local_root`` in at least one
    local tree of the notebook-level tree sequence ``ts``. The printed counts
    are the "hidden" ones: the full counts minus everything in ``subtract``,
    and minus the node itself when ``local_root`` is a sample.

    Parameters
    ----------
    local_root : int
        ID of the node whose sample descendants are counted.
    subtract : iterable of dict, optional
        Mappings of Pango lineage -> count to subtract from the printed counts
        (typically the return values of earlier calls for nodes below this one,
        or a literal such as ``{"XA": 2}``).
    name : str, optional
        Label printed in parentheses after the node ID.

    Returns
    -------
    dict
        Pango lineage -> number of distinct sample descendants, *before* any
        subtraction, so it can be passed as ``subtract`` for a node higher up.

    Raises
    ------
    KeyError
        If ``subtract`` mentions a lineage that has no sample under ``local_root``.
    """
    # Gather the distinct sample IDs first. A dict is used as an insertion-ordered
    # set so lineages appear in the output in first-encounter order.
    sample_ids = {}
    for tree in ts.trees():
        for u in tree.samples(local_root):
            sample_ids[u] = None

    # Look up metadata once per distinct sample rather than once per (tree, sample).
    full_counts = dict(
        collections.Counter(ts.node(u).metadata['pango'] for u in sample_ids)
    )

    counts = full_counts.copy()
    if subtract is not None:
        for subtract_dict in subtract:
            for k, v in subtract_dict.items():
                counts[k] -= v

    info = f"Hidden sample descendants of {local_root} "
    if name:
        info += f"({name}) "

    root = ts.node(local_root)
    if root.is_sample():
        # The node is itself drawn in the plot, so it is not a hidden descendant.
        root_pango = root.metadata['pango']
        counts[root_pango] -= 1
        print(info + f"= {counts} (excluding top {root_pango} node)")
    else:
        print(info + f"= {counts}")
    return full_counts

In [4]:
# Only these XA samples are drawn; every other XA sample is excluded from the plot
keep_xa = ["ERR5308556", "ERR5414941", "ERR5651144"]

is_xa_sample = np.logical_and(df.pango == "XA", df.is_sample)
dfXA = df[is_xa_sample]
not_kept = np.isin(dfXA.index, keep_xa, invert=True)
exclude = dfXA.node_id[not_kept].values

xa_plot_options = dict(
    height=500,
    exclude=exclude,
    y_axis_scale="rank",
    oldest_y_label="2020-12",
    child_levels=0,
    parent_levels=2,
    positions_file="layout_data/XA-min.json",
    highlight_nodes={'yellow': dfXA.node_id},
)
arg.plot_pango_subgraph("XA", **xa_plot_options)

In [5]:
# Count the sample descendants that the simplified XA plot hides.
# Each num_sample_desc call prints the per-lineage counts under a node after
# subtracting the listed dicts, and returns the node's full (unsubtracted)
# counts so they can be subtracted in turn from a node further up.
xa1 = num_sample_desc(190315)
xa2 = num_sample_desc(2704421, subtract=[xa1])
xa3 = num_sample_desc(183949, subtract=[xa2])
print("== RE node == ")
print("The following should print zero samples of all types")
# NOTE(review): the literal {"XA": 2} is presumably the XA samples drawn directly
# below this node in the plot - confirm against the figure.
rt = num_sample_desc(122444, subtract=[xa3, {"XA": 2}], name="RE node")
# Sanity check: the number of XA samples below node 122444 equals the total
# number of XA samples in the node table.
assert len(df[np.logical_and(df.pango=="XA", df.is_sample)]) == rt["XA"]
print("== Parents == ")
num_sample_desc(99722, subtract=[rt])
# Trailing semicolon suppresses display of the returned dict as cell output
num_sample_desc(78638, subtract=[rt]);

Hidden sample descendants of 190315 = {'XA': 11} (excluding top XA node)
Hidden sample descendants of 2704421 = {'XA': 1}
Hidden sample descendants of 183949 = {'XA': 24}
== RE node == 
The following should print zero samples of all types
Hidden sample descendants of 122444 (RE node)) = {'XA': 0}
== Parents == 
Hidden sample descendants of 99722 = {'B.1.1.7': 130, 'XA': 0} (excluding top B.1.1.7 node)
Hidden sample descendants of 78638 = {'B.1.177.18': 15, 'XA': 0} (excluding top B.1.177.18 node)


In [6]:
# Sample rows for each of the two recombinant lineages
dfXN = df[np.logical_and(df.pango == "XN", df.is_sample)]
dfXAU = df[np.logical_and(df.pango == "XAU", df.is_sample)]

# XN / XAU samples that stay in the plot, and extra samples that are explicitly added to it
keepXN_XAU = ["SRR20786472", "ERR9626571", "ERR9627436", "ERR9827586"]
ba2_sample_ids = ["ERR9502469", "ERR9794107", "SRR20777279", "ERR9794253", "ERR9969775", "ERR9653725"]
inclBA2 = df.loc[ba2_sample_ids, 'node_id']

# Every XN sample, then every XAU sample, that is not in the keep list
exclude = [
    node_id
    for lineage_df in (dfXN, dfXAU)
    for node_id in lineage_df.node_id[np.isin(lineage_df.index, keepXN_XAU, invert=True)]
]

xn_xau_plot_options = dict(
    height=500,
    exclude=exclude,
    include=inclBA2,
    y_axis_scale="time",
    oldest_y_label="2022-01",
    child_levels=0,
    parent_levels=2,
    positions_file="layout_data/XN-XAU-min.json",
    highlight_nodes={'purple': dfXN.node_id, 'cyan': dfXAU.node_id},
)
arg.plot_pango_subgraph(["XN", "XAU"], **xn_xau_plot_options)

In [7]:
# Count the sample descendants that the simplified XN/XAU plot hides.
# Each num_sample_desc call prints the counts under a node after subtracting
# the listed dicts, and returns that node's full counts for reuse further up.
print("== XAU == ")
xau1 = num_sample_desc(1253246)
xau2 = num_sample_desc(1231548, subtract=[xau1])
# Sanity check: after subtracting xau2 and three BA.2 samples, nothing should remain.
# NOTE(review): the literal {"BA.2": 3} is presumably the BA.2 samples drawn in the plot - confirm.
print("Check - this BA.2 should print zeros for all samples")
xxx = num_sample_desc(1232295, subtract=[xau2, {"BA.2": 3}])

print("== XN == ")
xn1 = num_sample_desc(1202936)
xn2 = num_sample_desc(1201839)
xn3 = num_sample_desc(1161431, subtract=[xn2])
xn4 = num_sample_desc(1233036)
xn5 = num_sample_desc(1125088, subtract=[xn4])
# Subtracts the counts of the three sub-clades above (xn1, xn3, xn5) and one BA.2 sample
xn6 = num_sample_desc(1061700, subtract=[xn1, xn3, xn5, {"BA.2": 1}])

print("== Root == ")
# Subtracts the XN counts (xn6) and the XAU-side counts (xxx); rebinds the notebook-level name `rt`
rt = num_sample_desc(2691001, subtract=[xn6, xxx, {"BA.2": 2}])

print("== Parent == ")
parent = num_sample_desc(1137492, subtract=[rt])

== XAU == 
Hidden sample descendants of 1253246 = {'XAU': 1} (excluding top XAU node)
Hidden sample descendants of 1231548 = {'XAU': 5} (excluding top XAU node)
Check - this BA.2 should print zeros for all samples
Hidden sample descendants of 1232295 = {'BA.2': 0, 'XAU': 0}
== XN == 
Hidden sample descendants of 1202936 = {'XN': 6} (excluding top XN node)
Hidden sample descendants of 1201839 = {'XN': 4} (excluding top XN node)
Hidden sample descendants of 1161431 = {'XN': 3} (excluding top XN node)
Hidden sample descendants of 1233036 = {'XN': 2} (excluding top XN node)
Hidden sample descendants of 1125088 = {'XN': 66} (excluding top XN node)
Hidden sample descendants of 1061700 = {'XN': 33, 'BA.2': 0} (excluding top XN node)
== Root == 
Hidden sample descendants of 2691001 = {'XN': 0, 'BA.2': 2, 'XAU': 0}
== Parent == 
Hidden sample descendants of 1137492 = {'BA.2': 37, 'XN': 0, 'XAU': 0}


In [8]:
# One highlight colour per Pango X lineage, paired by position with pangoX
colours = ['#332288', '#88CCEE', '#44AA99', '#999933', '#DDCC77']  # from https://personal.sron.nl/~pault/
pangoX = ["XZ", "XAC", "XAD", "XAE", "XAP"]
# zip() below would silently drop lineages if the two lists ever differed in length
assert len(colours) == len(pangoX), "need exactly one colour per Pango X lineage"

# Pango X samples that stay in the plot; all other samples of these lineages are excluded.
# NOTE(review): "SRR19710932" was listed twice in the original list; the duplicate
# has been dropped. Confirm it was not a typo for a different sample ID.
keep = [
    "SRR19523737", "SRR20775418", "SRR20505514",
    "ERR9216823", "SRR19560135", "SRR19710932", "SRR20019588",
    "SRR18434609", "SRR21342521", "ERR9762269", "SRR20572879", "SRR20568702",
    "SRR19495684", "ERR9761341", "SRR19672153",
]

# Extra samples explicitly added to the plot.
# NOTE(review): "SRR19689888" was listed twice in the original lookup, which made
# df.loc return the same node twice; the duplicate has been dropped. Confirm it
# was not a typo for a different sample ID.
keep_ids = list(df.loc[["SRR19689888", "ERR8146303", "ERR8163061"], 'node_id'])

exclude = []  # node IDs of Pango X samples to hide
cmap = {}     # colour -> node IDs of all samples of the matching lineage
for c, pX in zip(colours, pangoX):
    df_tmp = df[np.logical_and(df.pango == pX, df.is_sample)]
    exclude += list(df_tmp.node_id[np.isin(df_tmp.index, keep, invert=True)])
    cmap[c] = list(df_tmp.node_id)

arg.plot_pango_subgraph(
    pangoX,
    exclude=exclude,
    y_axis_scale="rank",
    height=600,
    include=keep_ids,
    parent_levels=4,
    child_levels=0,
    highlight_nodes=cmap,
    oldest_y_label="2021-09",
    positions_file="layout_data/XZ-XAC-XAD-XAE-XAP-min.json",
)

In [9]:
# Count the sample descendants that the simplified XZ/XAC/XAD/XAE/XAP plot hides.
# Each num_sample_desc call prints the counts under a node after subtracting
# the listed dicts, and returns that node's full counts for reuse further up.
# NOTE(review): literal dicts such as {"XAC": 2} are presumably the samples drawn
# directly below that node in the plot - confirm against the figure.
print("== XAC ==")
xac1 = num_sample_desc(2695552, subtract=[{"XAC": 2}])
xac2 = num_sample_desc(1241084)
xac3 = num_sample_desc(1223586, subtract=[{"XAC": 1}])
# Sanity check: everything under this node should already be accounted for
print("Check that this prints out zero for all samples")
xac4 = num_sample_desc(1219046, subtract=[xac1, xac2, xac3, {"XAC": 2}])

print("= nodes above XAC =")
xxx1 = num_sample_desc(1169150, subtract=[xac4])
xxx2 = num_sample_desc(2699802, subtract=[xxx1])

print("== XAE ==")
xae1 = num_sample_desc(1180055)
xae2 = num_sample_desc(1201525, subtract=[xae1])
xae3 = num_sample_desc(1128856)
xae4 = num_sample_desc(1118099, subtract=[xae2, xae3], name="XAE root")

print("== XAP ==")
xap1 = num_sample_desc(1216577)
xap2 = num_sample_desc(1216836, subtract=[xap1, {"XAP": 1}], name="XAP root")

print("== XAD + ==")
xxx3 = num_sample_desc(2703959, subtract=[{"XAD": 1}])
xxx4 = num_sample_desc(1192387, subtract=[xxx3, xap2])

print("== XZ ==")
xz1 = num_sample_desc(1225255)
xz2 = num_sample_desc(1228738)
xz3 = num_sample_desc(1163537, subtract=[xz1, xz2, {"XZ": 1}], name="XZ root")

print("= nodes above XZ =")
xxx5 = num_sample_desc(1112147, subtract=[xz3, {"XAD": 1}])
# Subtracts the counts returned for the XZ-side, XAD/XAP-side, XAE and XAC-side nodes above
xxx6 = num_sample_desc(1040907, subtract=[xxx5, xxx4, {"BA.2": 1}, xae4, xxx2])

print("== RE node ==")
# Rebinds the notebook-level name `rt`; the lft_parent cell further down reads this value
rt = num_sample_desc(964555, subtract=[xxx6, {"BA.2": 2}])


print("== right parent ==")
rgt_parent = num_sample_desc(955784, subtract=[rt])



== XAC ==
Hidden sample descendants of 2695552 = {'XAC': 4}
Hidden sample descendants of 1241084 = {'XAC': 2} (excluding top XAC node)
Hidden sample descendants of 1223586 = {'XAC': 6}
Check that this prints out zero for all samples
Hidden sample descendants of 1219046 = {'XAC': 0}
= nodes above XAC =
Hidden sample descendants of 1169150 = {'BA.2': 16, 'XAC': 0}
Hidden sample descendants of 2699802 = {'BA.2': 3, 'XAC': 0}
== XAE ==
Hidden sample descendants of 1180055 = {'XAE': 1} (excluding top XAE node)
Hidden sample descendants of 1201525 = {'XAE': 3}
Hidden sample descendants of 1128856 = {'XAE': 1} (excluding top XAE node)
Hidden sample descendants of 1118099 (XAE root)) = {'XAE': 1} (excluding top XAE node)
== XAP ==
Hidden sample descendants of 1216577 = {'XAP': 2} (excluding top XAP node)
Hidden sample descendants of 1216836 (XAP root)) = {'XAP': 16}
== XAD + ==
Hidden sample descendants of 2703959 = {'BA.2': 7, 'XAD': 0}
Hidden sample descendants of 1192387 = {'BA.2': 49, 'XAP

In [None]:
# The left parent (node 863361) has a huge number of descendants, so only the
# distinct sample IDs are collected here instead of calling num_sample_desc.

from tqdm.auto import tqdm

lft_parent = 863361
samp = set()
for tree in tqdm(ts.trees()):
    # Add the samples below the left parent in this local tree
    samp.update(tree.samples(lft_parent))

# Samples below the RE node (`rt` from the cell above) are not counted as hidden here
n_hidden = len(samp) - sum(rt.values())
print(f"Hidden sample descendants of 863361 = {n_hidden}")

  0%|          | 0/317 [00:00<?, ?it/s]