### Get matching results for Pango X not in the ARG

https://github.com/jeromekelleher/sc2ts-paper/issues/366

In [1]:
from pathlib import Path
import re
import pandas as pd
import tszip
import sc2ts

In [2]:
# Directory holding the input files, given relative to the working directory.
data_dir = Path("../data")

In [3]:
# Gzipped, tab-separated run metadata (read with pandas below).
metadata_file = data_dir / "run_metadata.v05.tsv.gz"
# tszip-compressed tree sequence file (decompressed with tszip below).
ts_file = data_dir / "sc2ts_viridian_v1.2.trees.tsz"
# Tab-separated table of per-strain match results (read with pandas below).
matches_file = data_dir / "v1-beta1.matches.tsv"

In [4]:
# Load the run metadata and keep only rows that are flagged as part of the
# May 2024 preprint, passed Viridian, are in the Viridian tree, and whose
# Date_tree is neither the "." placeholder nor "2020/12/31".
metadata = pd.read_csv(metadata_file, sep="\t")
keep_row = (
    metadata["In_may_2024_preprint"].eq("T")
    & metadata["Viridian_result"].eq("PASS")
    & metadata["In_Viridian_tree"].eq("T")
    & metadata["Date_tree"].ne(".")
    & metadata["Date_tree"].ne("2020/12/31")
)
metadata = metadata[keep_row].reset_index(drop=True)
metadata.head(1)

  metadata = pd.read_csv(metadata_file, sep="\t")


Unnamed: 0,In_may_2024_preprint,Study,Sample,Experiment,Run,Run_count,Platform,Country,Region,Collection_date,...,Genbank_N,Viridian_pangolin,Viridian_scorpio,Genbank_pangolin,Genbank_scorpio,Genbank_tree_name,Viridian_cons_len,Viridian_cons_het,Viridian_pangolin_1.29,Viridian_scorpio_1.29
0,T,PRJEB47121,SAMEA9781395,ERX6172603,ERR6546375,1,ILLUMINA,Estonia,none,2021-08-02,...,.,AY.100,Delta (B.1.617.2-like),.,.,.,29810,0,AY.100,Delta (B.1.617.2-like)


In [5]:
# Decompress the tszip-compressed tree sequence file into `ts`.
ts = tszip.decompress(ts_file)
# Tabulate per-node data with sc2ts; the preview below shows columns such as
# pango, sample_id, node_id, is_sample and is_recombinant.
nodes_df = sc2ts.node_data(ts)
nodes_df.head(5)

Unnamed: 0,pango,sample_id,scorpio,node_id,is_sample,is_recombinant,num_mutations,max_descendant_samples,date
0,B,Vestigial_ignore,.,0,False,False,0,0,2019-11-23
1,B,Wuhan/Hu-1/2019,.,1,False,False,0,2482157,2019-12-26
2,A,SRR11772659,.,2,True,False,1,255,2020-01-19
3,B,SRR11397727,.,3,True,False,0,1,2020-01-24
4,B,SRR11397730,.,4,True,False,0,1,2020-01-24


In [6]:
# Load the tab-separated matches table; the preview below shows the columns
# strain, n_parents, n_mutations, parents and mutations.
matches = pd.read_csv(matches_file, sep="\t")
matches.head(5)

Unnamed: 0,strain,n_parents,n_mutations,parents,mutations
0,SRR14631544,1,24,"(0:29904, 1)","[C241T, C3037T, C3267T, C4683T, C5986T, C6471T..."
1,ERR4780180,1,15,"(0:29904, 1)","[G204T, C241T, T445C, C1959T, C3037T, C6286T, ..."
2,SRR11772659,1,3,"(0:29904, 1)","[C8782T, C18060T, T28144C]"
3,SRR11397727,1,1,"(0:29904, 1)",[G26144T]
4,SRR11397730,1,1,"(0:29904, 1)",[G26144T]


In [7]:
def parse_parents(s):
    """Extract copying-path segments from a ``parents`` string.

    Every substring of ``s`` shaped like ``(<start>:<end>, <parent>)`` (all
    three fields being runs of digits) is converted to a tuple of ints
    ``(start, end, parent)``. Tuples are returned in order of appearance;
    a string without any such substring yields an empty list.
    """
    segments = []
    for hit in re.finditer(r"\((\d+):(\d+), (\d+)\)", s):
        start_txt, end_txt, parent_txt = hit.groups()
        segments.append((int(start_txt), int(end_txt), int(parent_txt)))
    return segments


def get_pango_parent(node_id, df):
    """Return the ``pango`` value of the row at position ``node_id`` in ``df``.

    The lookup is positional (``iloc``), so ``node_id`` is treated as a row
    offset rather than an index label.
    NOTE(review): this presumes the rows of ``df`` are ordered by node id --
    confirm for any frame other than the unfiltered node table.
    """
    return df.iloc[node_id].pango


def get_info_for(pango):
    """Summarise the match results of every sample assigned to ``pango``.

    Reads the notebook-level ``metadata``, ``matches``, ``nodes_df`` and
    ``ts`` objects. Samples are the ``Run`` values of the ``metadata`` rows
    whose ``Viridian_pangolin`` equals ``pango``; match rows are the rows of
    ``matches`` whose ``strain`` is one of those samples. Both counts are
    printed.

    Returns a dict with the keys:

    * ``num_samples_viridian`` -- number of samples found in ``metadata``.
    * ``num_samples_matches`` -- number of match rows found.
    * ``num_breakpoints`` -- list of the distinct per-sample breakpoint
      counts, or ``[None]`` when there are no match rows.
    * ``table`` -- DataFrame with one row per match row.

    Table contents depend on ``n_parents`` of the match row:

    * 1 parent: zero breakpoints, parent/breakpoint fields set to ``"n/a"``.
    * 2 parents: one breakpoint; the Pango labels of the two parents, the
      start of the second segment as right breakpoint and the value of
      ``sc2ts.utils.compute_left_bound`` as left breakpoint.
    * otherwise: the first junction fills the ``left_*`` fields; the
      ``right_*`` fields are ``;``-joined strings listing, per junction, the
      right-hand parent's Pango label and either the right breakpoint (first
      junction) or ``<left>-<right>`` (later junctions).
    """

    def junction(seg_a, seg_b):
        # Describe the switch between two consecutive path segments, each a
        # (start, end, parent_id) tuple as produced by parse_parents.
        parent_a = seg_a[2]
        parent_b = seg_b[2]
        pango_a = get_pango_parent(parent_a, nodes_df)
        pango_b = get_pango_parent(parent_b, nodes_df)
        right_edge = seg_b[0]
        left_edge = sc2ts.utils.compute_left_bound(
            ts=ts,
            parents=[parent_a, parent_b],
            right=right_edge,
        )
        return pango_a, pango_b, left_edge, right_edge

    is_lineage = metadata["Viridian_pangolin"] == pango
    samples = metadata.loc[is_lineage, "Run"].to_list()
    hmm_hits = matches[matches["strain"].isin(samples)].reset_index(drop=True)
    print(f"Samples in Viridian: {len(samples)}")
    print(f"HMM results in matches: {len(hmm_hits)}")

    records = []
    for _, hit in hmm_hits.iterrows():
        n_parents = hit["n_parents"]
        if n_parents == 1:
            # No recombination: nothing to report about parents or breakpoints.
            n_bkpts = 0
            left_pango = right_pango = left_bkpt = right_bkpt = "n/a"
        elif n_parents == 2:
            segments = parse_parents(hit["parents"])
            n_bkpts = 1
            left_pango, right_pango, left_bkpt, right_bkpt = junction(
                segments[0], segments[1]
            )
        else:
            segments = parse_parents(hit["parents"])
            n_bkpts = n_parents - 1
            left_pango = ""
            left_bkpt = ""
            later_pangos = []
            later_intervals = []
            for idx in range(n_parents - 1):
                pango_a, pango_b, lo, hi = junction(segments[idx], segments[idx + 1])
                later_pangos.append(pango_b)
                if idx == 0:
                    # The first junction supplies the left-most parent and bound.
                    left_pango = pango_a
                    left_bkpt = lo
                    later_intervals.append(str(hi))
                else:
                    later_intervals.append(f"{str(lo)}-{str(hi)}")
            right_pango = ";".join(later_pangos)
            right_bkpt = ";".join(later_intervals)
        records.append(
            {
                "sample": hit["strain"],
                "pango": pango,
                "num_parents": n_parents,
                "num_breakpoints": n_bkpts,
                "left_pango_parent": left_pango,
                "right_pango_parent": right_pango,
                "left_breakpoint": left_bkpt,
                "right_breakpoint": right_bkpt,
                "num_mutations": hit["n_mutations"],
            }
        )

    table = pd.DataFrame(records)
    if len(table) > 0:
        distinct_counts = pd.unique(table.num_breakpoints).tolist()
    else:
        # Placeholder so callers can still concatenate and filter the lists.
        distinct_counts = [None]
    return {
        "num_samples_viridian": len(samples),
        "num_samples_matches": len(hmm_hits),
        "num_breakpoints": distinct_counts,
        "table": table,
    }

### Get matching results from matches file

#### XBJ

In [8]:
# https://github.com/jeromekelleher/sc2ts-paper/issues/362
# Look up the HMM match results for samples assigned to XBJ.2.
results_xbj2 = get_info_for(pango="XBJ.2")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbj2["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbj2["num_breakpoints"])
results_xbj2["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10839097,XBJ.2,2,1,BA.2.3.20,BA.5.2,23019,25810,6


In [9]:
# Look up the HMM match results for samples assigned to XBJ.4.
results_xbj4 = get_info_for(pango="XBJ.4")
# NOTE(review): get_info_for returns a list for "num_breakpoints" ([None]
# when no sample matched), so this check is always true and the cell prints
# "[None]" for a lineage without matches. Testing "num_samples_matches" > 0
# looks like the intended guard -- confirm before changing the output.
if results_xbj4["num_breakpoints"] is not None:
    print("Breakpoints: ", results_xbj4["num_breakpoints"])
results_xbj4["table"]

Samples in Viridian: 1
HMM results in matches: 0
Breakpoints:  [None]


In [10]:
# Pool the XBJ sub-lineage results (XBJ.2 and XBJ.4) into one set of totals.
xbj_results = (results_xbj2, results_xbj4)
num_samples_viridian_xbj = sum(res["num_samples_viridian"] for res in xbj_results)
num_samples_matches_xbj = sum(res["num_samples_matches"] for res in xbj_results)
# Flatten the per-lineage breakpoint counts, dropping the None placeholder
# reported for a lineage without any match rows.
breakpoints_xbj = [
    count
    for res in xbj_results
    for count in res["num_breakpoints"]
    if count is not None
]
print(f"XBJ: {num_samples_viridian_xbj} samples in Viridian, {num_samples_matches_xbj} matches")

XBJ: 2 samples in Viridian, 1 matches


#### XBP

In [11]:
# Pick more specific Pango label BQ.1.1.3 over BQ.1.5.
# Look up the HMM match results for samples assigned to XBP.
results_xbp = get_info_for(pango="XBP")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbp["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbp["num_breakpoints"])
results_xbp["table"]

Samples in Viridian: 8
HMM results in matches: 2
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10770729,XBP,2,1,BL.2,BQ.1.5,22191,22331,5
1,SRR23608970,XBP,2,1,BL.2,BQ.1.1.3,22191,22331,7


#### XBS

In [12]:
# Pick more specific Pango label BQ.1.1 over BQ.1.
# Look up the HMM match results for samples assigned to XBS.
results_xbs = get_info_for(pango="XBS")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbs["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbs["num_breakpoints"])
results_xbs["table"]

Samples in Viridian: 19
HMM results in matches: 15
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10749106,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,4
1,ERR10779088,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,6
2,ERR10771500,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,6
3,ERR10791804,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
4,ERR10791806,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
5,ERR10792087,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
6,ERR10797807,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
7,ERR10797810,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
8,ERR10797829,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5
9,ERR10797888,XBS,2,1,BA.2.75,BQ.1.1,22034,22190,5


#### XBW

In [13]:
# Look up the HMM match results for samples assigned to XBW.
results_xbw = get_info_for(pango="XBW")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbw["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbw["num_breakpoints"])
results_xbw["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,SRR23359879,XBW,2,1,XBB.1.5.107,BQ.1.14,25417,26275,5


#### XCA

In [14]:
# Identical results across samples
# Look up the HMM match results for samples assigned to XCA.
results_xca = get_info_for(pango="XCA")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xca["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xca["num_breakpoints"])
results_xca["table"]

Samples in Viridian: 11
HMM results in matches: 4
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10911716,XCA,2,1,BA.2.75.2,BQ.1.1.1,22748,22893,10
1,SRR23607969,XCA,2,1,BA.2.75.2,BQ.1.1.1,22748,22893,12
2,ERR10911045,XCA,2,1,BA.2.75.2,BQ.1.1.1,22748,22893,11
3,ERR10911252,XCA,2,1,BA.2.75.2,BQ.1.1.1,22748,22893,12


#### XAK (complex)

In [15]:
# Look up the HMM match results for samples assigned to XAK.
results_xak = get_info_for(pango="XAK")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xak["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xak["num_breakpoints"])
results_xak["table"]

Samples in Viridian: 2
HMM results in matches: 2
Breakpoints:  [1]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10020060,XAK,2,1,BA.2,BA.2,21642,21762,9
1,SRR20919952,XAK,2,1,BA.2,BA.2,21642,21762,10


#### XAY (complex)

In [16]:
# Look up the HMM match results for samples assigned to XAY.
results_xay = get_info_for(pango="XAY")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xay["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xay["num_breakpoints"])
results_xay["table"]

Samples in Viridian: 4
HMM results in matches: 2
Breakpoints:  [6]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,SRR22375991,XAY,7,6,BA.2.9,AY.5;BA.2;AY.4;BA.5.5;AY.4;BA.2,6403,6428;9867-10198;12881-15451;22035-22118;24470-...,12
1,SRR22243770,XAY,7,6,BA.2.9,AY.98;BA.2.10;AY.4;BA.5.1;AY.4;BA.2,6403,7124;9867-10198;12881-15026;21847-22017;24470-...,11


In [17]:
# Look up the HMM match results for samples assigned to XAY.1.
results_xay1 = get_info_for(pango="XAY.1")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xay1["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xay1["num_breakpoints"])
results_xay1["table"]

Samples in Viridian: 10
HMM results in matches: 8
Breakpoints:  [6]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,SRR22743619,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,18
1,SRR22972722,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,19
2,ERR10707172,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2.12.1,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,18
3,SRR23200104,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,21
4,ERR10770411,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2.12.1,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,18
5,SRR23489537,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,21
6,SRR23489545,XAY.1,7,6,BA.2,B.1.617.2;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,23
7,SRR23497420,XAY.1,7,6,BA.2,AY.98;BA.2.16;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9535-10198;12881-15451;22035-22118;24470-...,22


In [18]:
# Look up the HMM match results for samples assigned to XAY.3.
results_xay3 = get_info_for(pango="XAY.3")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xay3["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xay3["num_breakpoints"])
results_xay3["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [6]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,SRR22243763,XAY.3,7,6,BA.2,AY.98;BA.2;AY.4;BA.5.5;AY.4;BA.2,6403,7124;9867-10198;12881-15451;22035-22118;24470-...,11


In [19]:
# Pool the XAY sub-lineage results (XAY, XAY.1 and XAY.3) into one set of totals.
xay_results = (results_xay, results_xay1, results_xay3)
num_samples_viridian_xay = sum(res["num_samples_viridian"] for res in xay_results)
num_samples_matches_xay = sum(res["num_samples_matches"] for res in xay_results)
# Flatten the per-lineage breakpoint counts, dropping the None placeholder
# reported for a lineage without any match rows.
breakpoints_xay = [
    count
    for res in xay_results
    for count in res["num_breakpoints"]
    if count is not None
]
print(f"XAY: {num_samples_viridian_xay} samples in Viridian, {num_samples_matches_xay} matches")

XAY: 15 samples in Viridian, 11 matches


#### XBC (complex)

In [20]:
# Look up the HMM match results for samples assigned to XBC.1.
results_xbc1 = get_info_for(pango="XBC.1")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc1["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc1["num_breakpoints"])
results_xbc1["table"]

Samples in Viridian: 23
HMM results in matches: 12
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,SRR22255420,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,26
1,SRR22237817,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22579-22591;25470-25584,26
2,SRR22240800,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
3,SRR22239747,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
4,SRR22239750,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
5,SRR22344843,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
6,ERR10489431,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
7,ERR10559811,XBC.1,4,3,BA.2,AY.26;BM.1.1;AY.9,2912,3955;22600-22674;25470-25584,27
8,ERR10740112,XBC.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,28
9,ERR10769965,XBC.1,4,3,BA.2,AY.64;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,28


In [21]:
# Look up the HMM match results for samples assigned to XBC.1.1.
results_xbc11 = get_info_for(pango="XBC.1.1")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc11["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc11["num_breakpoints"])
results_xbc11["table"]

Samples in Viridian: 3
HMM results in matches: 3
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10706446,XBC.1.1,4,3,BA.2,AY.26;BM.1.1;AY.9,2912,4184;22600-22674;25470-25584,31
1,ERR10706969,XBC.1.1,4,3,BA.2,AY.26;BM.1.1;AY.9,2912,4184;22600-22674;25470-25584,32
2,ERR10804802,XBC.1.1,4,3,BA.2,AY.26;BM.1.1;AY.9,2912,4184;22600-22674;25470-25584,35


In [22]:
# Look up the HMM match results for samples assigned to XBC.1.1.1.
results_xbc111 = get_info_for(pango="XBC.1.1.1")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc111["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc111["num_breakpoints"])
results_xbc111["table"]

Samples in Viridian: 2
HMM results in matches: 1
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10797950,XBC.1.1.1,4,3,BA.2,AY.26;BM.1.1;AY.9,2912,4184;22600-22674;25470-25584,33


In [23]:
# Look up the HMM match results for samples assigned to XBC.1.2.
results_xbc12 = get_info_for(pango="XBC.1.2")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc12["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc12["num_breakpoints"])
results_xbc12["table"]

Samples in Viridian: 2
HMM results in matches: 1
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10932110,XBC.1.2,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29


In [24]:
# Look up the HMM match results for samples assigned to XBC.1.2.1.
results_xbc121 = get_info_for(pango="XBC.1.2.1")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc121["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc121["num_breakpoints"])
results_xbc121["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10805347,XBC.1.2.1,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,31


In [25]:
# Look up the HMM match results for samples assigned to XBC.1.5.
results_xbc15 = get_info_for(pango="XBC.1.5")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc15["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc15["num_breakpoints"])
results_xbc15["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10667261,XBC.1.5,4,3,BA.2,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,27


In [26]:
# Look up the HMM match results for samples assigned to XBC.2.
results_xbc2 = get_info_for(pango="XBC.2")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbc2["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbc2["num_breakpoints"])
results_xbc2["table"]

Samples in Viridian: 4
HMM results in matches: 2
Breakpoints:  [3]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10366676,XBC.2,4,3,BA.2.10,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,29
1,ERR10383977,XBC.2,4,3,BA.2.10,B.1.617.2;BM.1.1;AY.9,2791,4184;22600-22674;25470-25584,28


In [27]:
# Pool the XBC sub-lineage results into one set of totals.
xbc_results = (
    results_xbc1,
    results_xbc11,
    results_xbc111,
    results_xbc12,
    results_xbc121,
    results_xbc15,
    results_xbc2,
)
num_samples_viridian_xbc = sum(res["num_samples_viridian"] for res in xbc_results)
num_samples_matches_xbc = sum(res["num_samples_matches"] for res in xbc_results)
# Flatten the per-lineage breakpoint counts, dropping the None placeholder
# reported for a lineage without any match rows.
breakpoints_xbc = [
    count
    for res in xbc_results
    for count in res["num_breakpoints"]
    if count is not None
]
print(f"XBC: {num_samples_viridian_xbc} samples in Viridian, {num_samples_matches_xbc} matches")

XBC: 36 samples in Viridian, 21 matches


#### XBL (complex)

In [28]:
# Look up the HMM match results for samples assigned to XBL.3.
results_xbl3 = get_info_for(pango="XBL.3")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbl3["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbl3["num_breakpoints"])
results_xbl3["table"]

Samples in Viridian: 2
HMM results in matches: 2
Breakpoints:  [0]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10910492,XBL.3,1,0,,,,,9
1,ERR10937973,XBL.3,1,0,,,,,10


#### XBT (complex)

In [29]:
# Look up the HMM match results for samples assigned to XBT.
results_xbt = get_info_for(pango="XBT")
# "num_breakpoints" is always a list ([None] when nothing matched), so gate
# the printout on the match count instead of an always-true None check.
if results_xbt["num_samples_matches"] > 0:
    print("Breakpoints: ", results_xbt["num_breakpoints"])
results_xbt["table"]

Samples in Viridian: 1
HMM results in matches: 1
Breakpoints:  [2]


Unnamed: 0,sample,pango,num_parents,num_breakpoints,left_pango_parent,right_pango_parent,left_breakpoint,right_breakpoint,num_mutations
0,ERR10751378,XBT,3,2,BA.5.2.34,BA.2.75;BA.5.2.34,5184,9866;22600-22898,4


### Print to latex table

In [30]:
# Each table row below reports a single breakpoint count, so every row's
# source list must hold exactly one distinct value. Rows that pool several
# sub-lineages (XBJ, XAY, XBC) are checked on their pooled lists.
assert len(set(breakpoints_xbj)) == 1
assert len(set(results_xbp["num_breakpoints"])) == 1
assert len(set(results_xbs["num_breakpoints"])) == 1
assert len(set(results_xbw["num_breakpoints"])) == 1
assert len(set(results_xca["num_breakpoints"])) == 1
assert len(set(results_xak["num_breakpoints"])) == 1
# The XAY row is built from XAY, XAY.1 and XAY.3 together, so check the
# pooled list rather than the XAY-only result.
assert len(set(breakpoints_xay)) == 1
assert len(set(breakpoints_xbc)) == 1
assert len(set(results_xbl3["num_breakpoints"])) == 1
assert len(set(results_xbt["num_breakpoints"])) == 1

In [31]:
# Build the LaTeX table from (label, samples in Viridian, samples with match
# results, breakpoint count) rows. Rows are formatted from plain variables so
# no f-string has to reuse its own quote character inside a replacement
# field, which is only accepted from Python 3.12 onwards.


def _lineage_row(label, res):
    """Return the table row for a lineage backed by one get_info_for result."""
    return (
        label,
        res["num_samples_viridian"],
        res["num_samples_matches"],
        res["num_breakpoints"][0],
    )


latex_rows = [
    ("XBJ", num_samples_viridian_xbj, num_samples_matches_xbj, breakpoints_xbj[0]),
    _lineage_row("XBP", results_xbp),
    _lineage_row("XBS", results_xbs),
    _lineage_row("XBW", results_xbw),
    _lineage_row("XCA", results_xca),
    _lineage_row("XAK", results_xak),
    ("XAY", num_samples_viridian_xay, num_samples_matches_xay, breakpoints_xay[0]),
    ("XBC", num_samples_viridian_xbc, num_samples_matches_xbc, breakpoints_xbc[0]),
    _lineage_row("XBL", results_xbl3),
    _lineage_row("XBT", results_xbt),
]

latex_header = [
    r"\begin{table}",
    r"\centering",
    r"\caption{",
    r"Sc2ts recombination detection results for Pango X lineages not added to the ARG.",
    r"}",
    r"\label{tab:absent_pango_x}",
    r"\begin{tabular}{crrr}",
    r"\toprule",
    r"Pango & Samples in Viridian & Samples passing QC & Breakpoints \\",
    r"\midrule",
]
latex_body = [
    f"{label} & {n_viridian} & {n_matches} & {n_breakpoints}" + r"\\"
    for label, n_viridian, n_matches, n_breakpoints in latex_rows
]
latex_footer = [
    r"\bottomrule",
    r"\end{tabular}",
    r"\end{table}",
]

# Every line, including the last, is newline-terminated.
latex_tab = "\n".join(latex_header + latex_body + latex_footer) + "\n"

print(latex_tab)

\begin{table}
\centering
\caption{
Sc2ts recombination detection results for Pango X lineages not added to the ARG.
}
\label{tab:absent_pango_x}
\begin{tabular}{crrr}
\toprule
Pango & Samples in Viridian & Samples passing QC & Breakpoints \\
\midrule
XBJ & 2 & 1 & 1\\
XBP & 8 & 2 & 1\\
XBS & 19 & 15 & 1\\
XBW & 1 & 1 & 1\\
XCA & 11 & 4 & 1\\
XAK & 2 & 2 & 1\\
XAY & 15 & 11 & 6\\
XBC & 36 & 21 & 3\\
XBL & 2 & 2 & 0\\
XBT & 1 & 1 & 2\\
\bottomrule
\end{tabular}
\end{table}

