# Investigating the major recombination nodes in the Viridian ARG

In [1]:
import collections

import sc2ts
import numpy as np
import tskit
import numpy as np
import tskit_arg_visualizer as argviz  # Install latest within a notebook cell using `!pip install git+https://github.com/kitchensjn/tskit_arg_visualizer`

import nb_utils

from IPython.display import HTML
# Inject CSS for elements matching `.progress .progress-bar`: a 🦠 glyph is appended
# (right-aligned) after the bar and the bar background is set to #BBBBFF.
# NOTE(review): presumably targets the tqdm notebook progress bars shown below — confirm.
HTML('<style type="text/css">.progress .progress-bar::after{content:"🦠";display:block;text-align:right;margin-top:-2px;}'
     '.progress .progress-bar {background-color: #BBBBFF}</style>')

In [2]:
# Get the Viridian ARG
ts = nb_utils.load()
# Optional preprocessing step, currently disabled:
#ts = nb_utils.remove_single_descendant_re_nodes(ts)
# TreeInfo wrapper around the ARG; `ti.node_report(...)` is used further down to
# obtain the copying pattern of each major recombinant.
ti = sc2ts.TreeInfo(ts)

Loaded 2021.5 megabyte SARS-CoV2 genealogy of 1229949 strains (348 trees, 1906055 mutations over 29904.0 basepairs). Last collection date is 2023-02-20


Counting descendants :   0%|          | 0/1436796 [00:00<?, ?it/s]

Indexing metadata    :   0%|          | 0/1436796 [00:00<?, ?it/s]

Classifying mutations:   0%|          | 0/1906055 [00:00<?, ?it/s]

In [3]:
# Per the progress-bar label this finds the oldest node for each imputed Pango lineage
# (see nb_utils.oldest_imputed for the exact return value).
# NOTE(review): `oldest_imputed` is not referenced again in the cells visible here —
# confirm it is still needed.
oldest_imputed = nb_utils.oldest_imputed(ts)

Find oldest node for imputed Pangos:   0%|          | 0/1436796 [00:00<?, ?it/s]

## Major recombination nodes (> 100 descendants)

We define "major" recombination nodes as those with more than 100 descendants. Here they are:

In [4]:
import string
# "Major" recombination nodes are those with more than this many descendants
# (the definition given in the text above; the next cell applies the same cut-off).
MIN_MAJOR_DESCENDANTS = 100

# Map each recombination node ID to the largest number of samples found beneath
# it in any of the local trees.
re_nodes = {u: 0 for u in np.where(ts.nodes_flags & sc2ts.NODE_IS_RECOMBINANT)[0]}
for tree in ts.trees():
    for u in re_nodes:
        re_nodes[u] = max(re_nodes[u], tree.num_samples(u))

# Print the major nodes, largest first, labelled (a), (b), ... so that later cells
# can refer to them by letter. Labels come from string.ascii_lowercase, so this
# assumes there are at most 26 major nodes.
for i, (u, c) in enumerate(sorted(re_nodes.items(), key=lambda x: -x[1])):
    if c <= MIN_MAJOR_DESCENDANTS:
        # Sorted by decreasing count, so every remaining node is below the threshold.
        break
    # Collect the Pango labels of the nodes up to three generations below `u`
    # (in any tree) as a rough description of what the recombinant gave rise to.
    children = np.unique(ts.edges_child[ts.edges_parent == u])
    grandchildren = np.unique(ts.edges_child[np.isin(ts.edges_parent, children)])
    greatgrandchildren = np.unique(ts.edges_child[np.isin(ts.edges_parent, grandchildren)])
    near_descendants = set(children) | set(grandchildren) | set(greatgrandchildren)
    pango = {ts.node(v).metadata["Imputed_Viridian_pangolin"] for v in near_descendants}
    pango -= {"Unknown", "Unknown (R)"}
    print(f"- ({string.ascii_lowercase[i]}) {u} has {c} descendants", str(nb_utils.date(ts, u)).split()[0], pango)

- (a) 200039 has 538258 descendants 2020-10-15 {'B.1.617.2'}
- (b) 822854 has 284834 descendants 2021-11-26 {'BA.2', 'BA.2.16', 'BA.4', 'XM', 'BA.5.2.1'}
- (c) 1189192 has 112781 descendants 2022-03-06 {'BA.5', 'BA.5.5.3', 'XAZ', 'BA.5.1.30', 'BA.5.2', 'BA.5.1', 'BA.5.10', 'BA.5.1.19', 'BA.5.6', 'BA.5.3.4', 'BA.5.1.16', 'BE.1', 'BA.5.1.2', 'BA.5.1.22', 'BA.5.1.26', 'BA.5.2.1', 'BA.5.1.35', 'BA.5.3.3', 'BA.5.1.15', 'BA.5.5.1', 'BA.5.5', 'BA.5.3.2', 'BA.5.3', 'BA.5.1.1', 'BA.5.3.1', 'BA.5.11', 'BA.5.1.3', 'BE.3', 'BA.5.1.17', 'BA.5.3.5', 'BA.5.1.6', 'BA.5.9', 'BF.8'}
- (d) 1030562 has 112298 descendants 2022-01-20 {'BA.2', 'BA.5.11', 'BA.5', 'BA.5.5', 'BA.5.9', 'BA.5.3', 'BA.5.2.1', 'BA.5.1'}
- (e) 1396207 has 5080 descendants 2022-07-29 {'XBB', 'XBB.2.8', 'XBB.2.11.1', 'XBB.1.3', 'XBB.4', 'XBB.2.4', 'XBB.2.7', 'XBB.2.6', 'XBB.9', 'XBB.8', 'XBB.2.5', 'XBB.2', 'XBB.1'}
- (f) 293131 has 1900 descendants 2021-06-14 {'AY.124', 'AY.116.1', 'AY.4.3', 'AY.120.1', 'AY.4', 'B.1.617.2', 'AY.114', 

### Identifying artifactual major recombinants

Some of these major recombinants are artifactual, with long runs of adjacent or near-adjacent changes (e.g. deletions or multi-site mutations) causing the HMM to wrongly infer recombination. These are often evident when one side of the recombination pattern is supported by only a few mutations (e.g. fewer than 8), many of which are adjacent or near-adjacent. We check them by hand below.

In [5]:
# Named container for the values returned by ti.node_report(), with field names
# taken from nb_utils.NODE_REPORT_KEYS.
NodeReport = collections.namedtuple("NodeReport", nb_utils.NODE_REPORT_KEYS)

# Node ID -> NodeReport for every recombinant with more than 100 descendants.
recombinant_data = {}
ranked_re_nodes = sorted(re_nodes.items(), key=lambda item: -item[1])
for rank, (node_id, num_descendants) in enumerate(ranked_re_nodes):
    if not num_descendants > 100:
        continue
    # Same letter labels as in the listing above: rank 0 -> "a", rank 1 -> "b", ...
    letter = string.ascii_lowercase[rank]
    display(HTML(f"<h3>{letter}. Node {node_id} with {num_descendants} descendants</h3>"))
    report = NodeReport(*ti.node_report(node_id))
    recombinant_data[node_id] = report
    display(report.copying_pattern)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
pos,210,3457,4965,5184,11201,17523,20396,21618,21895,22022,22917,22995,23012,23604,24410,24775,25276,25469,26256,26767,27638,27752,28461,28881,29402,29668,29742
ref,G,C,C,C,A,G,A,C,T,G,T,C,G,C,G,A,C,C,C,T,T,C,A,G,G,C,G
P0,G,C,C,T,A,G,A,C,T,G,T,C,G,C,G,A,C,C,T,T,T,C,A,G,G,T,G
C,T,C,C,T,A,G,A,G,T,G,G,A,G,G,A,A,C,T,C,C,C,T,G,T,T,C,T
P1,G,T,T,C,G,T,G,C,C,A,G,C,C,G,G,T,T,T,C,T,C,C,A,T,T,C,T
mut,G>T,,,,,,,C>G,,,T>G,C>A,,C>G,G>A,,,,,T>C,,C>T,A>G,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69
pos,670,2790,2832,3241,4184,4321,5386,5924,8393,9344,9424,9534,10029,10198,10447,10449,11537,12880,13195,15714,17410,18163,21618,21762,21846,22200,22578,22673,22674,22679,22686,22688,22775,22813,22992,22995,23013,23040,23048,23055,23063,23075,23202,23525,23599,23604,23854,24130,24424,24469,24503,25000,25584,26060,26270,26530,26577,26709,26858,27259,27382,27383,27807,28271,28311,28881,28882,28883,29510
ref,T,C,A,C,G,C,T,G,G,C,A,C,C,C,G,C,A,C,T,C,C,A,C,C,C,T,G,T,C,T,C,A,G,G,G,C,A,A,G,A,A,T,C,C,T,C,C,C,A,T,C,C,C,C,C,A,C,G,C,A,G,A,C,A,C,G,G,G,A
P0,T,C,A,C,G,C,T,G,G,T,A,C,C,C,G,C,A,C,T,C,C,A,C,C,C,T,G,T,C,T,C,A,G,G,G,C,A,A,G,A,A,T,C,C,T,C,C,C,A,T,C,C,C,C,C,A,C,G,C,A,G,A,C,A,C,G,G,G,A
C,G,T,A,C,A,T,T,G,G,T,G,T,T,T,A,A,A,T,T,T,T,G,T,C,C,G,A,T,T,C,T,G,A,T,A,A,C,G,G,G,T,C,C,T,G,A,A,C,T,A,C,T,T,T,T,A,G,A,T,C,C,T,T,T,T,A,A,C,C
P1,T,C,G,T,G,C,G,A,A,C,A,C,T,C,G,A,G,C,C,C,C,G,C,T,T,T,A,C,T,C,T,A,G,T,A,A,C,G,A,G,T,C,A,T,G,A,C,A,T,A,T,T,T,C,T,G,G,A,C,C,G,A,T,T,T,A,A,C,A
mut,T>G,C>T,,,G>A,C>T,,,,,A>G,C>T,C>T,C>T,G>A,C>A,,C>T,,C>T,C>T,A>G,C>T,,,T>G,G>A,,,,,A>G,G>A,,,,,,A>G,,,,A>C,,,,C>A,A>C,,,T>C,,,C>T,,G>A,,,C>T,,G>C,A>T,,,,,,,A>C


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
pos,9866,12160,14277,16342,22917,23018,23040,26529,26858,27259,27382,27383,27788,27889,28724,29754
ref,C,G,G,T,T,T,A,G,C,A,G,A,G,C,C,C
P0,C,A,G,T,G,G,A,G,T,C,C,T,T,C,T,C
C,C,A,G,T,G,G,A,A,C,A,G,A,G,T,C,T
P1,T,G,T,C,T,T,G,G,C,C,G,A,G,C,C,T
mut,,,,,,,,G>A,,C>A,,,,C>T,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
pos,44,670,2470,2790,2832,4184,4321,5386,7926,8393,9344,9424,9534,9866,10198,10447,11537,12880,13195,15714,16342,17410,19955,20055,21618,21762,21846,22195,22197,22198,22200,22202,22204,22673,22688,22775,22898,23048,23202,24130,24503,26060,26530,26858,27382,27383,29510,29754
ref,C,T,C,C,A,G,C,T,C,G,C,A,C,C,C,G,A,C,T,C,T,C,C,A,C,C,C,T,T,A,T,C,T,T,A,G,G,G,C,C,C,C,A,C,G,A,A,C
P0,T,G,C,T,A,A,T,T,C,G,T,G,T,T,T,A,A,T,T,T,C,T,T,G,T,C,C,T,T,A,G,C,T,T,G,A,G,G,C,C,C,T,A,T,C,T,C,C
C,T,G,C,T,A,A,T,T,C,G,T,G,T,T,T,A,A,T,T,T,C,T,T,G,T,C,C,T,T,A,G,C,T,T,G,A,G,G,C,C,C,T,A,T,G,A,A,T
P1,C,T,T,C,G,G,C,G,T,A,C,A,C,C,C,G,G,C,C,C,T,C,C,A,C,T,T,G,G,C,T,A,C,C,A,G,A,A,A,A,T,C,G,C,G,A,A,T
mut,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
pos,44,261,405,3796,3927,4586,5183,12444,15939,16342,17859,18583,21810,22000,22001,22016,22033,22109,22190,22200,22331,22577,22664,22895,22896,22898,22942,23019,23031,23040,25416,25700,26275
ref,C,G,A,C,C,C,C,A,T,T,T,G,T,C,A,T,C,C,A,T,G,G,C,G,T,G,T,T,T,A,C,C,A
P0,C,G,A,C,C,C,C,A,T,C,T,G,T,C,A,T,C,C,A,G,G,G,C,G,T,G,T,T,T,G,C,T,A
C,C,G,G,C,C,C,C,A,C,C,C,G,C,A,A,T,C,G,A,A,G,C,A,C,C,A,G,C,C,A,T,C,G
P1,T,A,A,T,T,T,T,G,T,T,T,A,T,C,G,C,A,C,G,G,A,C,C,G,T,A,G,C,C,A,T,C,G
mut,,,A>G,,,,,,T>C,,T>C,,T>C,C>A,,,,C>G,,G>A,,,C>A,G>C,T>C,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,826,884,4321,4720,10271,16914,19677,22027,22030,22031,22032,22033,22034,24325,27338
ref,T,C,C,G,G,G,G,T,G,T,T,C,A,A,A
P0,T,T,C,T,G,T,T,T,G,T,T,C,A,T,A
C,T,T,C,T,G,T,T,G,A,A,G,T,G,A,A
P1,C,C,T,G,A,G,G,G,A,A,G,T,G,A,T
mut,,,,,,,,,,,,,,,T>A


0,1,2,3,4,5,6,7,8,9,10,11,12,13
pos,884,4720,7851,14829,16914,17193,19677,22027,22030,22031,22032,22033,22034
ref,C,G,C,G,G,G,G,T,G,T,T,C,A
P0,C,G,T,T,G,T,G,T,G,T,T,C,A
C,C,G,T,T,G,T,G,G,A,A,G,T,G
P1,T,T,C,G,T,G,T,G,A,A,G,T,G
mut,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,912,2842,9559,17122,22026,22027,22030,22031,22032,22033,22034,22044,22639,27406,28007
ref,C,C,C,G,G,T,G,T,T,C,A,C,C,C,T
P0,A,C,C,G,T,G,A,A,G,T,G,C,T,T,T
C,C,C,T,T,T,G,A,A,G,T,G,A,C,C,C
P1,C,T,T,T,G,T,G,T,T,C,A,A,C,C,C
mut,A>C,,C>T,G>T,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55
pos,44,670,686,687,689,691,692,694,1627,2790,2832,4184,4321,5386,5826,6979,8393,9344,9424,9534,10198,10447,11537,12160,12880,13195,15714,17410,19955,20055,21618,21762,21846,22110,22200,22673,22688,22775,22898,22917,23018,23040,23048,23202,24130,24503,26060,26529,26530,27038,27259,27889,28330,29510,29754
ref,C,T,A,A,T,A,T,T,C,C,A,G,C,T,C,T,G,C,A,C,C,G,A,G,C,T,C,C,C,A,C,C,C,A,T,T,A,G,G,T,T,A,G,C,C,C,C,G,A,A,A,C,A,A,C
P0,C,T,G,C,C,G,C,A,C,C,G,G,C,G,C,T,A,C,A,C,C,G,G,G,C,C,C,C,C,A,C,T,T,A,T,C,A,G,A,T,T,G,A,A,A,T,C,G,G,A,C,C,A,A,C
C,C,G,G,C,C,G,C,A,T,T,A,A,T,T,T,G,G,T,G,T,T,A,A,A,T,T,T,T,T,G,T,C,C,T,G,T,G,A,G,G,G,A,G,C,C,C,T,A,A,G,A,T,G,C,T
P1,T,G,A,A,T,A,T,T,T,T,A,A,T,T,T,G,G,T,G,T,T,A,A,A,T,T,T,T,T,G,T,C,C,T,G,T,G,A,G,G,G,A,G,C,C,C,T,A,A,G,A,T,G,C,T
mut,,T>G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,44,686,687,689,691,692,694,1627,5826,6979,22110,27038,27438,28330,29666
ref,C,A,A,T,A,T,T,C,C,T,A,A,T,A,C
P0,C,G,C,C,G,C,A,T,T,G,T,G,T,G,C
C,C,G,C,C,G,C,A,C,C,T,A,A,C,A,T
P1,T,A,A,T,A,T,T,C,C,T,A,A,C,A,T
mut,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
pos,670,2790,2832,3241,4184,4321,5386,5924,8393,9344,9424,9534,10198,10447,11537,12880,13195,15714,17410,19955,20055,21618,21762,21846,22200,22224,22673,22688,22775,22813,23048,23202,23664,24130,24503,26060,26530,26858,27382,27383,29510
ref,T,C,A,C,G,C,T,G,G,C,A,C,C,G,A,C,T,C,C,C,A,C,C,C,T,C,T,A,G,G,G,C,C,C,C,C,A,C,G,A,A
P0,T,C,G,T,G,C,G,A,A,C,A,C,C,G,G,C,C,C,C,C,A,C,T,T,T,T,C,A,G,G,A,A,T,A,T,C,G,C,G,A,A
C,T,C,G,T,G,C,G,A,A,C,A,C,C,G,A,T,T,T,T,T,G,T,C,C,G,C,T,G,A,T,G,C,C,C,C,T,A,T,C,T,C
P1,G,T,A,C,A,T,T,G,G,T,G,T,T,A,A,T,T,T,T,T,G,T,C,C,G,C,T,G,A,T,G,C,C,C,C,T,A,T,C,T,C
mut,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
pos,44,686,687,689,691,692,694,7042,8140,14599,23535,26529,26858,27259,27382,27383,27438,27788,27889,28724,29666,29754
ref,C,A,A,T,A,T,T,G,C,C,A,G,C,A,G,A,T,G,C,C,C,C
P0,C,G,C,C,G,C,A,G,C,C,A,A,C,A,G,A,C,G,T,C,T,T
C,C,G,C,C,G,C,A,T,T,T,G,G,T,C,C,T,T,T,C,T,C,C
P1,T,A,A,T,A,T,T,T,T,T,G,G,T,C,C,T,T,T,C,T,C,C
mut,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
pos,5730,22195,22197,22198,22202,22204,22673,22674,22679,22686,22898,23599,23854,26709
ref,C,T,T,A,C,T,T,C,T,C,G,T,C,G
P0,C,T,T,A,C,T,C,T,C,T,G,T,C,G
C,C,T,T,A,C,T,T,C,T,C,A,G,A,A
P1,T,G,G,C,A,C,T,C,T,C,A,G,A,A
mut,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
pos,261,625,1627,3339,3796,3927,4586,5006,5183,6070,8692,9866,12160,12444,15451,18583,22001,22016,22033,22190,22331,22577,22898,22917,22942,23018,23019,23031,25416,26275,26529,26813,26858,27038,27259,27382,27383,27889,28330,29754
ref,G,G,C,T,C,C,C,A,C,C,C,C,G,A,G,G,A,T,C,A,G,G,G,T,T,T,T,T,C,A,G,T,C,A,A,G,A,C,A,C
P0,G,T,T,T,C,C,C,A,C,C,C,C,A,A,G,G,A,T,C,A,G,G,G,G,T,G,T,T,C,A,A,C,C,G,A,G,A,T,G,T
C,G,T,T,C,C,C,C,A,C,T,T,T,G,G,A,A,G,C,A,G,A,C,A,T,G,C,C,C,T,G,G,T,T,A,C,C,T,C,A,C
P1,A,G,C,T,T,T,T,G,T,C,C,T,G,G,A,A,G,C,A,G,A,C,A,T,G,C,C,C,T,G,G,T,T,A,C,C,T,C,A,C
mut,,,,T>C,,,,,,C>T,C>T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
pos,670,2453,2790,2832,3241,4184,4321,5386,5924,8393,9344,9424,9534,10198,10447,11537,12880,13195,15714,17410,19955,20055,21618,21762,21846,22200,22673,22688,22775,22898,23048,23202,23664,24130,24503,26060,26530,26858,27382,27383,29510
ref,T,C,C,A,C,G,C,T,G,G,C,A,C,C,G,A,C,T,C,C,C,A,C,C,C,T,T,A,G,G,G,C,C,C,C,C,A,C,G,A,A
P0,G,C,T,A,C,A,T,T,G,G,T,G,T,T,A,A,T,T,T,T,T,G,T,C,C,G,T,G,A,G,G,C,C,C,C,T,A,T,C,T,C
C,G,C,T,A,C,A,T,T,G,G,T,G,T,T,A,A,T,T,T,T,T,G,T,C,C,G,T,G,A,G,G,C,C,C,C,C,G,C,G,A,A
P1,T,T,C,G,T,G,C,G,A,A,C,A,C,C,G,G,C,C,C,C,C,A,C,T,T,T,C,A,G,A,A,A,T,A,T,C,G,C,G,A,A
mut,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12
pos,2110,7984,14120,18687,19390,21462,22264,27880,27881,27882,27883,28737
ref,C,T,C,C,C,T,C,A,C,G,C,C
P0,T,C,T,C,T,T,C,A,C,G,C,C
C,T,C,T,C,T,T,C,T,T,C,T,T
P1,C,T,C,T,C,C,T,T,T,C,T,C
mut,,,,,,,,,,,,C>T


### Detailed list of artifactual major recombinants.

From above, it appears that the following are artifactual:

In [6]:
# Node ID -> which side of the breakpoint ("left" or "right") is judged to be
# artifactual, or "?" when undecided. Used below to colour these nodes and to
# decide which parent edge to redirect.
bad_edges = {}


def _pattern(node_id):
    """Return the `.copying_pattern.data` string stored for `node_id`, for embedding in the HTML below."""
    return recombinant_data[node_id].copying_pattern.data


display(HTML(
    "<h2>Artifactual Recombination nodes</h2>"
    "<h3>Node 1030562 (d)</h3>"
    "This node, primarily associated with BA.2/BA.5 samples, only involves 4 mutations on the right, 2 of which are adjacent"
    f"{_pattern(1030562)}"
))
bad_edges[1030562] = "right"

display(HTML(
    "<h3>Nodes 293131, 295320, and 293126 (f, g, h)</h3>"
    "These are all descendants of Delta (AY.xx). They all share flip-flopping at positions between 22026 and 22034, "
    "which could be a deletion or sequencing/primer error. See https://github.com/jeromekelleher/sc2ts-paper/issues/258."
    "<table><tr><th>293131</th><th>295320</th><th>293126</th></tr>"
    "<tr><td>8 mutations on the right, 6 of which are the problem positions</td>"
    "<td>All right hand mutations are the problem positions</td>"
    "<td>Middle mutations are the problem positions. Far left and far right are not so bad, so possibly worth further investigation?</td></tr>"
    f"<tr><td>{_pattern(293131)}</td><td>{_pattern(295320)}</td><td>{_pattern(293126)}</td></tr></table>"
))
bad_edges[293131] = "right"
bad_edges[295320] = "right"
bad_edges[293126] = "?"  # We will skip this anyway

display(HTML(
    "<h3>Nodes 1253364, 1279026, and 1338895 (i, j, l)</h3>"
    "These nodes, primarily associated with BA.4.6 and BA.5.X samples, involve positions 686, 687, 689, 691, 692, 694 on the LHS"
    "<table><tr><th>1253364</th><th>1279026</th><th>1338895</th></tr>"
    "<tr><td>Occurs along the BA.5.2.1 lineage</td>"
    "<td>Occurs along the BA.5.2.1 lineage</td>"
    "<td>An immediate child of 1279026, and involves BA.4.6</td></tr>"
    f"<tr><td>{_pattern(1253364)}</td><td>{_pattern(1279026)}</td><td>{_pattern(1338895)}</td></tr></table>"
))
bad_edges[1253364] = "left"
bad_edges[1279026] = "left"
bad_edges[1338895] = "left"


display(HTML(
    "<h3>Node 871356 (m)</h3>This node, primarily associated with BA.1.1 samples, has 6 mutations on the left, 5 of which are adjacent (positions 22195, 22197, 22198, 22202, 22204)"
    f"{_pattern(871356)}"
))
bad_edges[871356] = "left"

# In the copying pattern for 179752 the child matches P0 at the seven leftmost
# sites and matches P1 only at 27880-27883, followed by one further site (28737).
# The weakly supported, adjacent-site side is therefore the right-hand one.
# NOTE(review): this was previously described and recorded as "left" — confirm.
display(HTML(
    "<h3>Node 179752 (p)</h3>This node, primarily associated with Alpha (B.1.1.7) samples, has 5 mutations on the right, 4 of which are adjacent (positions 27880, 27881, 27882, 27883)"
    f"{_pattern(179752)}"
))
bad_edges[179752] = "right"

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
pos,44,670,2470,2790,2832,4184,4321,5386,7926,8393,9344,9424,9534,9866,10198,10447,11537,12880,13195,15714,16342,17410,19955,20055,21618,21762,21846,22195,22197,22198,22200,22202,22204,22673,22688,22775,22898,23048,23202,24130,24503,26060,26530,26858,27382,27383,29510,29754
ref,C,T,C,C,A,G,C,T,C,G,C,A,C,C,C,G,A,C,T,C,T,C,C,A,C,C,C,T,T,A,T,C,T,T,A,G,G,G,C,C,C,C,A,C,G,A,A,C
P0,T,G,C,T,A,A,T,T,C,G,T,G,T,T,T,A,A,T,T,T,C,T,T,G,T,C,C,T,T,A,G,C,T,T,G,A,G,G,C,C,C,T,A,T,C,T,C,C
C,T,G,C,T,A,A,T,T,C,G,T,G,T,T,T,A,A,T,T,T,C,T,T,G,T,C,C,T,T,A,G,C,T,T,G,A,G,G,C,C,C,T,A,T,G,A,A,T
P1,C,T,T,C,G,G,C,G,T,A,C,A,C,C,C,G,G,C,C,C,T,C,C,A,C,T,T,G,G,C,T,A,C,C,A,G,A,A,A,A,T,C,G,C,G,A,A,T
mut,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


293131,295320,293126
"8 mutations on the right, 6 of which are the problem positions",All right hand mutations are the problem positions,"Middle mutations are the problem positions. Far left and far are not so bad, so possible worth further investigation?"
pos826884432147201027116914196772202722030220312203222033220342432527338refTCCGGGGTGTTCAAAP0TTCTGTTTGTTCATACTTCTGTTGAAGTGAAP1CCTGAGGGAAGTGATmutT>A,pos8844720785114829169141719319677220272203022031220322203322034refCGCGGGGTGTTCAP0CGTTGTGTGTTCACCGTTGTGGAAGTGP1TTCGTGTGAAGTGmut,pos91228429559171222202622027220302203122032220332203422044226392740628007refCCCGGTGTTCACCCTP0ACCGTGAAGTGCTTTCCCTTTGAAGTGACCCP1CTTTGTGTTCAACCCmutA>CC>TG>T

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,826,884,4321,4720,10271,16914,19677,22027,22030,22031,22032,22033,22034,24325,27338
ref,T,C,C,G,G,G,G,T,G,T,T,C,A,A,A
P0,T,T,C,T,G,T,T,T,G,T,T,C,A,T,A
C,T,T,C,T,G,T,T,G,A,A,G,T,G,A,A
P1,C,C,T,G,A,G,G,G,A,A,G,T,G,A,T
mut,,,,,,,,,,,,,,,T>A

0,1,2,3,4,5,6,7,8,9,10,11,12,13
pos,884,4720,7851,14829,16914,17193,19677,22027,22030,22031,22032,22033,22034
ref,C,G,C,G,G,G,G,T,G,T,T,C,A
P0,C,G,T,T,G,T,G,T,G,T,T,C,A
C,C,G,T,T,G,T,G,G,A,A,G,T,G
P1,T,T,C,G,T,G,T,G,A,A,G,T,G
mut,,,,,,,,,,,,,

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,912,2842,9559,17122,22026,22027,22030,22031,22032,22033,22034,22044,22639,27406,28007
ref,C,C,C,G,G,T,G,T,T,C,A,C,C,C,T
P0,A,C,C,G,T,G,A,A,G,T,G,C,T,T,T
C,C,C,T,T,T,G,A,A,G,T,G,A,C,C,C
P1,C,T,T,T,G,T,G,T,T,C,A,A,C,C,C
mut,A>C,,C>T,G>T,,,,,,,,,,,


1253364,1279026,1338895
Occurs along the BA.5.2.1 lineage,Occurs along the BA.5.2.1 lineage,"An immediate child of 1279026, and involves BA.4.6"
pos446706866876896916926941627279028324184432153865826697983939344942495341019810447115371216012880131951571417410199552005521618217622184622110222002267322688227752289822917230182304023048232022413024503260602652926530270382725927889283302951029754refCTAATATTCCAGCTCTGCACCGAGCTCCCACCCATTAGGTTAGCCCCGAAACAACP0CTGCCGCACCGGCGCTACACCGGGCCCCCACTTATCAGATTGAAATCGGACCAACCCGGCCGCATTAATTTGGTGTTAAATTTTTGTCCTGTGAGGGAGCCCTAAGATGCTP1TGAATATTTTAATTTGGTGTTAAATTTTTGTCCTGTGAGGGAGCCCTAAGATGCTmutT>G,pos446866876896916926941627582669792211027038274382833029666refCAATATTCCTAATACP0CGCCGCATTGTGTGCCCGCCGCACCTAACATP1TAATATTCCTAACATmut,pos446866876896916926947042814014599235352652926858272592738227383274382778827889287242966629754refCAATATTGCCAGCAGATGCCCCP0CGCCGCAGCCAACAGACGTCTTCCGCCGCATTTGGTCCTTTCTCCP1TAATATTTTTGGTCCTTTCTCCmut

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55
pos,44,670,686,687,689,691,692,694,1627,2790,2832,4184,4321,5386,5826,6979,8393,9344,9424,9534,10198,10447,11537,12160,12880,13195,15714,17410,19955,20055,21618,21762,21846,22110,22200,22673,22688,22775,22898,22917,23018,23040,23048,23202,24130,24503,26060,26529,26530,27038,27259,27889,28330,29510,29754
ref,C,T,A,A,T,A,T,T,C,C,A,G,C,T,C,T,G,C,A,C,C,G,A,G,C,T,C,C,C,A,C,C,C,A,T,T,A,G,G,T,T,A,G,C,C,C,C,G,A,A,A,C,A,A,C
P0,C,T,G,C,C,G,C,A,C,C,G,G,C,G,C,T,A,C,A,C,C,G,G,G,C,C,C,C,C,A,C,T,T,A,T,C,A,G,A,T,T,G,A,A,A,T,C,G,G,A,C,C,A,A,C
C,C,G,G,C,C,G,C,A,T,T,A,A,T,T,T,G,G,T,G,T,T,A,A,A,T,T,T,T,T,G,T,C,C,T,G,T,G,A,G,G,G,A,G,C,C,C,T,A,A,G,A,T,G,C,T
P1,T,G,A,A,T,A,T,T,T,T,A,A,T,T,T,G,G,T,G,T,T,A,A,A,T,T,T,T,T,G,T,C,C,T,G,T,G,A,G,G,G,A,G,C,C,C,T,A,A,G,A,T,G,C,T
mut,,T>G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
pos,44,686,687,689,691,692,694,1627,5826,6979,22110,27038,27438,28330,29666
ref,C,A,A,T,A,T,T,C,C,T,A,A,T,A,C
P0,C,G,C,C,G,C,A,T,T,G,T,G,T,G,C
C,C,G,C,C,G,C,A,C,C,T,A,A,C,A,T
P1,T,A,A,T,A,T,T,C,C,T,A,A,C,A,T
mut,,,,,,,,,,,,,,,

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
pos,44,686,687,689,691,692,694,7042,8140,14599,23535,26529,26858,27259,27382,27383,27438,27788,27889,28724,29666,29754
ref,C,A,A,T,A,T,T,G,C,C,A,G,C,A,G,A,T,G,C,C,C,C
P0,C,G,C,C,G,C,A,G,C,C,A,A,C,A,G,A,C,G,T,C,T,T
C,C,G,C,C,G,C,A,T,T,T,G,G,T,C,C,T,T,T,C,T,C,C
P1,T,A,A,T,A,T,T,T,T,T,G,G,T,C,C,T,T,T,C,T,C,C
mut,,,,,,,,,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
pos,5730,22195,22197,22198,22202,22204,22673,22674,22679,22686,22898,23599,23854,26709
ref,C,T,T,A,C,T,T,C,T,C,G,T,C,G
P0,C,T,T,A,C,T,C,T,C,T,G,T,C,G
C,C,T,T,A,C,T,T,C,T,C,A,G,A,A
P1,T,G,G,C,A,C,T,C,T,C,A,G,A,A
mut,,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12
pos,2110,7984,14120,18687,19390,21462,22264,27880,27881,27882,27883,28737
ref,C,T,C,C,C,T,C,A,C,G,C,C
P0,T,C,T,C,T,T,C,A,C,G,C,C
C,T,C,T,C,T,T,C,T,T,C,T,T
P1,C,T,C,T,C,C,T,T,T,C,T,C
mut,,,,,,,,,,,,C>T


## Subgraph viz

The cell below loads the full ARG into a `D3ARG` instance (this takes a minute or so), which is later used to plot subgraphs.

In [7]:
# Build the D3ARG once from the whole tree sequence; the plotting cells below
# reuse this object and modify its labels, node styles and edges in place.
d3arg = argviz.D3ARG.from_ts(ts, progress=True)
# Apply the notebook's standard labels and styles (defined in nb_utils).
nb_utils.set_sc2ts_labels_and_styles(d3arg, ts)

Edges:   0%|          | 0/1437724 [00:00<?, ?it/s]

Sites:   0%|          | 0/29803 [00:00<?, ?it/s]

Nodes:   0%|          | 0/1436796 [00:00<?, ?it/s]

Setting all labels:   0%|          | 0/1436796 [00:00<?, ?it/s]

In [8]:
# Uncomment the line below to look at individual subgraphs for artifactual nodes
#nb_utils.plot_sc2ts_subgraph(d3arg, 1338895, height=1500, child_levels=2)

## Plot the unedited recombinant backbone

To start, plot the recombinant nodes, skipping those that appear artifactual. Some of the artifactual nodes will still be plotted if they are parents of real recombination nodes (we colour these in red). Others, like the artifactual AY clusters, are not parents of other recombination nodes, and so will not appear in the subgraph.

In [9]:
from matplotlib import pyplot as plt

# Recombination nodes to highlight in the backbone plot: node ID -> label to draw.
# Commented-out entries are major recombinants deliberately left out.
key_re_nodes = {
    200039: "ORIG Delta",
    822854: "ORIG BA.2",
    1189192: "ORIG BA.5",
    # 1030562: "BA.2 ???",  # Appears to be artifactual
    1396207: "ORIG XBB",
    # 293131: "AY cluster 1",  # See https://github.com/jeromekelleher/sc2ts-paper/issues/279
    # 295320: "AY cluster 2",  # See https://github.com/jeromekelleher/sc2ts-paper/issues/279
    # 293126 has 827 descendants 2021-06-11 {'AY.100', 'B.1.617.2'}
    # 1253364 has 596 descendants 2022-05-20 {'BA.5.2.1', 'BA.4.1', 'BA.4', 'BA.5.1', 'BA.5.2'}
    # 1279026 has 462 descendants 2022-05-31 {'BA.4.6', 'BE.1', 'BA.4.6.3', 'BA.5.1'}
    965353: "ORIG XE/XH",
    # 1338895: "BA.4.6 ???",
    # 871356: "BA.1.1+ ??",
    1420385: "ORIG XBF",
    964555: "ORIG XZ++",
    # 179752: "B.1.1.7", # Alpha
}

d3arg.set_node_labels(key_re_nodes)

# Node sizes are applied in three passes, and the order matters: every node is
# made small, the highlighted nodes are enlarged, and finally any node listed in
# bad_edges is coloured red at an intermediate size.
d3arg.nodes.loc[:, "size"] = 50
d3arg.set_node_styles(
    [{"id": node_id, "size": 300} for node_id in key_re_nodes]
)
d3arg.set_node_styles(
    [{"id": node_id, "fill": "red", "size": 200} for node_id in bad_edges]
)

# Keep the set of nodes that end up in the plot; the next cell checks it for
# artifactual recombinants.
shown_nodes = nb_utils.plot_sc2ts_subgraph(
    d3arg,
    list(key_re_nodes),
    height=1500,
    child_levels=0,
    cmap=plt.cm.tab20,
    y_axis_scale="time",
    return_included_nodes=True,
)

Only a single artifactual (red) node is shown. We can remove it for visualization purposes by editing the d3arg so that the "bad" edge points to the other parent.

In [10]:
# For every plotted node that is both a recombinant and listed in bad_edges,
# print its parent edges and say which one would need to be redirected.
for u in shown_nodes:
    is_recombinant = ts.nodes_flags[u] & sc2ts.NODE_IS_RECOMBINANT
    if not is_recombinant or u not in bad_edges:
        continue
    print(f"Artifactual recombinant to remove is {u}, edges above this node to merge are")
    bad_side_is_right = bad_edges[u] == "right"
    for edge_id in np.where(ts.edges_child == u)[0]:
        edge = ts.edge(edge_id)
        # An edge whose interval starts at 0 is the left-hand one; it is left
        # alone exactly when the bad side is recorded as "right".
        starts_at_zero = edge.left == 0
        action = "leave" if bad_side_is_right == starts_at_zero else "adjust"
        print("*", action, edge)

Artifactual recombinant to remove is 1030562, edges above this node to merge are
* leave Edge(left=0.0, right=27382.0, parent=902972, child=1030562, metadata=b'', id=np.int64(436893))
* adjust Edge(left=27382.0, right=29904.0, parent=891294, child=1030562, metadata=b'', id=np.int64(448462))


In [11]:
# Edit the edge in the d3ARG: make both parent edges of node 1030562 come from
# the same parent, so the artifactual recombination no longer shows in the plot.
parent_edges = np.where(d3arg.edges.target == 1030562)[0]
assert len(parent_edges) == 2

# `use` indexes the edge whose bounds string starts with "0" (kept as is);
# `change` indexes the other edge, whose source gets overwritten.
# NOTE(review): presumably a bounds string starting with "0" marks the edge
# covering the left end of the genome — confirm against the D3ARG edge format.
first_starts_at_zero = d3arg.edges.loc[parent_edges[0]].bounds.startswith("0")
use = 0 if first_starts_at_zero else 1
change = 1 - use
assert d3arg.edges.loc[parent_edges[use]].bounds.startswith("0")
assert not d3arg.edges.loc[parent_edges[change]].bounds.startswith("0")

kept_source = d3arg.edges.loc[parent_edges[use], "source"]
d3arg.edges.loc[parent_edges[change], "source"] = kept_source

# Set this RE node back to the small size, but keep the colour so we can see it.
# The size is copied from the node row labelled 1.
small_size = d3arg.nodes.loc[1, "size"]
d3arg.set_node_styles([{"id": 1030562, "size": small_size}])


In [12]:
# Load x positions from the JSON file when it exists; otherwise fall back to
# clearing them (helper behaviour is defined in nb_utils).
try:
    d = nb_utils.set_x_01_from_json(d3arg, "Viridian-recombinant-backbone.json")
except FileNotFoundError:
    nb_utils.clear_x_01(d3arg)

# Edited backbone with the y axis on the "time" scale.
nb_utils.plot_sc2ts_subgraph(
    d3arg,
    list(key_re_nodes),
    height=1500,
    child_levels=0,
    cmap=plt.cm.tab20,
    y_axis_scale="time",
)

In [13]:
# Or the same on a ranked timescale.
# Load x positions from the JSON file when it exists; otherwise fall back to
# clearing them (helper behaviour is defined in nb_utils).
try:
    d = nb_utils.set_x_01_from_json(d3arg, "Viridian-recombinant-backbone.json")
except FileNotFoundError:
    nb_utils.clear_x_01(d3arg)

nb_utils.plot_sc2ts_subgraph(
    d3arg,
    list(key_re_nodes),
    height=1500,
    child_levels=0,
    cmap=plt.cm.tab20,
    y_axis_scale="rank",
)