In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

import tszip
import sc2ts

In [2]:
# Directory holding the input files read by the cells below.
data_dir = Path("..") / "data"

In [3]:
# Read the recombinants table from the data directory and preview its first row.
recombinants_path = data_dir / "recombinants.csv"
recomb_df = pd.read_csv(recombinants_path)
recomb_df.head(1)

Unnamed: 0,recombinant,sample_id,num_descendant_samples,num_samples,distinct_sample_pango,interval_left,interval_right,num_mutations,Viridian_amplicon_scheme,Artic_primer_version,...,parent_mrca_pango,parent_mrca_scorpio,parent_mrca_time,parent_mrca_date,is_rebar_recombinant,parent_pangonet_distance,net_min_supporting_loci_lft,net_min_supporting_loci_rgt,net_min_supporting_loci_lft_rgt_ge_4,k1000_muts
0,1280342,ERR9939974,1,1,1,695,958,1,COVID-ARTIC-V4.1,.,...,B.1.1.529,Probable Omicron (Unassigned),970.041636,2020-06-26,False,5,2,16,False,8


In [4]:
# Decompress the .trees.tsz file; `ts` is the object queried for variants below.
trees_path = data_dir / "sc2ts_viridian_v1.2.trees.tsz"
ts = tszip.decompress(trees_path)

In [5]:
# Sample IDs of the recombinants, one per row of recomb_df and in row order.
samples = recomb_df["sample_id"].tolist()

In [6]:
# Map each recombinant's sample ID to its node ID, keeping the SAME ORDER as
# the rows of recomb_df.
#
# The genotype arrays built further down have one column per entry of
# sample_nodes and are combined positionally with recomb_df columns. A plain
# `isin` filter would return node IDs in node_df row order instead, which is
# only correct if both tables happen to be sorted the same way.
node_df = sc2ts.node_data(ts)
node_id_by_sample = (
    node_df[node_df.sample_id.isin(samples)]
    .set_index("sample_id")
    .node_id
)
# .loc with a list of labels raises KeyError if any sample ID is absent from
# node_df, and returns every match if a sample ID maps to several nodes.
sample_nodes = node_id_by_sample.loc[samples].to_list()
assert len(sample_nodes) == len(recomb_df), (
    "expected exactly one node per row of recomb_df, "
    f"got {len(sample_nodes)} nodes for {len(recomb_df)} rows"
)

### MNS (A507T-T508C-G509A)

In [7]:
# Collect the genotypes of the recombinant samples at every site in the
# interval [coords[0], coords[1]) -- positions 507, 508 and 509.
coords = [507, 510]

len_region = coords[1] - coords[0]
# One row per site in the region, one column per row of recomb_df.
gt = np.zeros((len_region, len(recomb_df)), dtype=np.int32)

num_sites = 0
for i, var in enumerate(
    ts.variants(samples=sample_nodes, left=coords[0], right=coords[1])
):
    print(var.position, var.alleles)
    gt[i, :] = var.genotypes
    num_sites += 1

# Rows of gt are matched to positions purely by iteration order, so a site
# missing from the region would leave a silent all-zero row. Fail loudly instead.
assert num_sites == len_region, (
    f"expected {len_region} sites in {coords}, found {num_sites}"
)

507.0 ('A', 'G', 'T', 'C')
508.0 ('T', 'A', 'C', 'G')
509.0 ('G', 'A', 'T', 'C')


In [8]:
# Allele indices expected at positions 507, 508 and 509. Per the allele tuples
# printed by the previous cell these correspond to T, C and A respectively.
bases = [2, 2, 1]
# A sample carries the MNS only if it matches the expected allele at every site:
# compare each row of gt with its expected index, then AND down the rows.
expected_alleles = np.asarray(bases)[:len_region, np.newaxis]
is_mns = (gt == expected_alleles).all(axis=0)

In [9]:
# Number of recombination events associated with this MNS.
print(np.count_nonzero(is_mns))

9


In [10]:
# Number of MNS-associated events that failed QC, i.e. whose
# net_min_supporting_loci_lft_rgt_ge_4 flag is False.
failed_qc = ~recomb_df["net_min_supporting_loci_lft_rgt_ge_4"]
print(int((is_mns & failed_qc).sum()))

6


### Delta lineage-defining deletion

In [11]:
# Collect the genotypes of the recombinant samples at every site in the
# interval [coords[0], coords[1]) -- positions 22029 through 22034 -- together
# with the index of the deletion allele ('-') at each site.
coords = [22029, 22035]

len_region = coords[1] - coords[0]
# One row per site in the region, one column per row of recomb_df.
gt = np.zeros((len_region, len(recomb_df)), dtype=np.int32)
# index_del[i] is the position of '-' within the allele tuple of site i.
index_del = np.zeros(len_region, dtype=np.int32)

num_sites = 0
for i, var in enumerate(
    ts.variants(samples=sample_nodes, left=coords[0], right=coords[1])
):
    print(var.position, var.alleles)
    # Raises ValueError if the site has no deletion allele at all.
    index_del[i] = var.alleles.index('-')
    gt[i, :] = var.genotypes
    num_sites += 1

# A site missing from the region would leave both gt[i] and index_del[i] at
# zero, so it would "match" for every sample. Fail loudly instead.
assert num_sites == len_region, (
    f"expected {len_region} sites in {coords}, found {num_sites}"
)

index_del

22029.0 ('A', '-', 'G', 'C', 'T')
22030.0 ('G', '-', 'T', 'A', 'C')
22031.0 ('T', 'G', '-', 'C', 'A')
22032.0 ('T', 'C', '-', 'G', 'A')
22033.0 ('C', '-', 'A', 'T', 'G')
22034.0 ('A', '-', 'G', 'C')


array([1, 1, 2, 2, 1, 1], dtype=int32)

In [12]:
# A sample has the deletion only if it carries the '-' allele at every site in
# the region: compare each row of gt with that site's deletion index, then AND
# down the rows.
has_del = (gt == index_del[:len_region, np.newaxis]).all(axis=0)

In [13]:
# Number of recombination events associated with this deletion.
print(np.count_nonzero(has_del))

151


In [14]:
# Number of deletion-associated events that failed QC, i.e. whose
# net_min_supporting_loci_lft_rgt_ge_4 flag is False.
failed_qc = ~recomb_df["net_min_supporting_loci_lft_rgt_ge_4"]
num_events_qc_fail_del = int((has_del & failed_qc).sum())
num_events_qc_fail_del

71

In [15]:
# Total number of recombination events that failed QC, i.e. whose
# net_min_supporting_loci_lft_rgt_ge_4 flag is False.
passed_qc = recomb_df["net_min_supporting_loci_lft_rgt_ge_4"]
num_events_qc_fail = int((~passed_qc).sum())
num_events_qc_fail

501

In [16]:
# Fraction of all QC-failing recombination events that carry the deletion.
num_events_qc_fail_del / num_events_qc_fail

0.14171656686626746

### AmpliSeq

In [17]:
# Number of QC-failing recombination events whose Viridian amplicon scheme is
# COVID-AMPLISEQ-V1.
is_ampliseq = recomb_df["Viridian_amplicon_scheme"] == "COVID-AMPLISEQ-V1"
failed_qc = ~recomb_df["net_min_supporting_loci_lft_rgt_ge_4"]
num_events_qc_fail_ampliseq = int((is_ampliseq & failed_qc).sum())
num_events_qc_fail_ampliseq

335

In [18]:
# Fraction of all QC-failing recombination events that used the AmpliSeq scheme.
num_events_qc_fail_ampliseq / num_events_qc_fail

0.6686626746506986