In [11]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

sns.set_context("paper")
import matplotlib.ticker as ticker
import matplotlib.patches as mpatches

from tqdm import tqdm # progress bars :)
tqdm.pandas()

from sourmash.lca import lca_utils
from sourmash.tax import tax_utils

In [4]:
a85_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvani_85.txt"
a90_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvani_90.txt"
a95_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvani_95.txt"

c85_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvcont_85.txt"
c90_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvcont_90.txt"
c95_file = "big_file_cont_ani_85-90-95/gtdb-rs202.nucleotide-k31-scaled1000.recalc-ani.cut.csvcont_95.txt"

In [62]:
#a95_clusters = [x.strip().split(',') for x in open(a95_file, 'r')]
#a95_clusters[1]
a95 = pd.read_csv(a95_file, sep='\t', header=None, index_col=False, names = ["cluster_idents"])
a95_head = a95.head()
a95_head

Unnamed: 0,cluster_idents
0,"GCA_000006155,GCF_002565765,GCF_001941885,GCF_..."
1,"GCF_000178895,GCA_000007325,GCA_001296185,GCF_..."
2,"GCF_003299955,GCA_000007385,GCF_003300055,GCF_..."
3,GCA_000008085
4,"GCA_002554195,GCA_000009845,GCF_000803325,GCF_..."


In [38]:
# read in lineages
taxonomy_csv = "gtdb-rs202.taxonomy.v2.csv"
tax = pd.read_csv(taxonomy_csv)
tax['lineage'] = tax["superkingdom"] + ',' + tax["phylum"] + ',' + tax["class"] + ',' + tax["order"] + ',' + tax["family"] + ',' + tax["genus"] + ',' + tax["species"]
tax['smash_lin'] = tax['lineage'].apply(lambda x: lca_utils.make_lineage(x))
tax['split_ident'] = tax['ident'].str.split('.', expand=True)[0]
tax.head()

Unnamed: 0,ident,superkingdom,phylum,class,order,family,genus,species,lineage,smash_lin,split_ident
0,GCF_014075335.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia flexneri,"d__Bacteria,p__Proteobacteria,c__Gammaproteoba...","((superkingdom, d__Bacteria), (phylum, p__Prot...",GCF_014075335
1,GCF_002310555.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia flexneri,"d__Bacteria,p__Proteobacteria,c__Gammaproteoba...","((superkingdom, d__Bacteria), (phylum, p__Prot...",GCF_002310555
2,GCF_900013275.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia flexneri,"d__Bacteria,p__Proteobacteria,c__Gammaproteoba...","((superkingdom, d__Bacteria), (phylum, p__Prot...",GCF_900013275
3,GCF_000168095.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia flexneri,"d__Bacteria,p__Proteobacteria,c__Gammaproteoba...","((superkingdom, d__Bacteria), (phylum, p__Prot...",GCF_000168095
4,GCF_002459845.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia flexneri,"d__Bacteria,p__Proteobacteria,c__Gammaproteoba...","((superkingdom, d__Bacteria), (phylum, p__Prot...",GCF_002459845


In [39]:
taxD = tax.set_index('split_ident').to_dict()['smash_lin']

In [67]:
def count_and_find_lca_test(row, lineages=taxD):
    all_idents = row['cluster_idents']
    ident_list = all_idents.split(',')
    row['cluster_len'] = len(ident_list)
    all_lineages=[]
    for ident in ident_list:
        lineage = taxD[ident]
        all_lineages.append(lineage)
    lca_tree = lca_utils.build_tree(all_lineages)
    lca = lca_utils.find_lca(lca_tree)
    row['cluster_lca'] = lca
    if lca[1] > 0:
        print("LIN:", lca[0])
        row['cluster_lca_pretty'] = 'multiple'
    else:
        row['cluster_lca_pretty'] = lca_utils.display_lineage(lca[0])
        row['lca_rank'] = lca[0][-1].rank
    print(lca[0])
    print(lca[1])
    print(row['cluster_lca_pretty'])
    return row

In [68]:
a95_head.progress_apply(count_and_find_lca_test, axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 318.21it/s]

LIN: (LineagePair(rank='superkingdom', name='d__Bacteria'), LineagePair(rank='phylum', name='p__Firmicutes'), LineagePair(rank='class', name='c__Bacilli'), LineagePair(rank='order', name='o__Bacillales'), LineagePair(rank='family', name='f__Bacillaceae_G'), LineagePair(rank='genus', name='g__Bacillus_A'))
(LineagePair(rank='superkingdom', name='d__Bacteria'), LineagePair(rank='phylum', name='p__Firmicutes'), LineagePair(rank='class', name='c__Bacilli'), LineagePair(rank='order', name='o__Bacillales'), LineagePair(rank='family', name='f__Bacillaceae_G'), LineagePair(rank='genus', name='g__Bacillus_A'))
12
multiple
(LineagePair(rank='superkingdom', name='d__Bacteria'), LineagePair(rank='phylum', name='p__Fusobacteriota'), LineagePair(rank='class', name='c__Fusobacteriia'), LineagePair(rank='order', name='o__Fusobacteriales'), LineagePair(rank='family', name='f__Fusobacteriaceae'), LineagePair(rank='genus', name='g__Fusobacterium'), LineagePair(rank='species', name='s__Fusobacterium nucle




Unnamed: 0,cluster_idents,cluster_len,cluster_lca,cluster_lca_pretty
0,"GCA_000006155,GCF_002565765,GCF_001941885,GCF_...",961,"(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple
1,"GCF_000178895,GCA_000007325,GCA_001296185,GCF_...",12,"(((superkingdom, d__Bacteria), (phylum, p__Fus...",d__Bacteria;p__Fusobacteriota;c__Fusobacteriia...
2,"GCF_003299955,GCA_000007385,GCF_003300055,GCF_...",360,"(((superkingdom, d__Bacteria), (phylum, p__Pro...",d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
3,GCA_000008085,1,"(((superkingdom, d__Archaea), (phylum, p__Nano...",d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o_...
4,"GCA_002554195,GCA_000009845,GCF_000803325,GCF_...",11,"(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple


In [77]:
def count_and_find_lca(row, lineages=taxD):
    all_idents = row['cluster_idents']
    ident_list = all_idents.split(',')
    row['cluster_len'] = len(ident_list)
    all_lineages=[]
    for ident in ident_list:
        lineage = taxD[ident]
        all_lineages.append(lineage)
    lca_tree = lca_utils.build_tree(all_lineages)
    lca = lca_utils.find_lca(lca_tree)
    row['cluster_lca'] = lca
    if lca[1] > 0:
        row['cluster_lca_pretty'] = 'multiple'
        row['lca_rank'] = np.nan
    else:
        row['cluster_lca_pretty'] = lca_utils.display_lineage(lca[0])
        row['lca_rank'] = lca[0][-1].rank
    return row

In [78]:
a95 = a95.progress_apply(count_and_find_lca, axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43805/43805 [00:30<00:00, 1424.11it/s]


In [82]:
a95.shape

(43805, 5)

In [79]:
a95.head()

Unnamed: 0,cluster_idents,cluster_lca,cluster_lca_pretty,cluster_len,lca_rank
0,"GCA_000006155,GCF_002565765,GCF_001941885,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,961,
1,"GCF_000178895,GCA_000007325,GCA_001296185,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fus...",d__Bacteria;p__Fusobacteriota;c__Fusobacteriia...,12,species
2,"GCF_003299955,GCA_000007385,GCF_003300055,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Pro...",d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,360,species
3,GCA_000008085,"(((superkingdom, d__Archaea), (phylum, p__Nano...",d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o_...,1,species
4,"GCA_002554195,GCA_000009845,GCF_000803325,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,11,


In [80]:
a95[a95['cluster_lca_pretty'] == 'multiple']

Unnamed: 0,cluster_idents,cluster_lca,cluster_lca_pretty,cluster_len,lca_rank
0,"GCA_000006155,GCF_002565765,GCF_001941885,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,961,
4,"GCA_002554195,GCA_000009845,GCF_000803325,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,11,
6,"GCF_900489705,GCF_900489715,GCF_900489725,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,76,
11,"GCF_900052225,GCF_900052235,GCA_000014325,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,1308,
12,"GCA_000015425,GCA_000069205,GCA_000184515,GCA_...","(((superkingdom, d__Bacteria), (phylum, p__Pro...",multiple,5006,
...,...,...,...,...,...
43407,"GCF_900772435,GCF_900768345","(((superkingdom, d__Bacteria), (phylum, p__Fir...",multiple,2,
43553,"GCF_902158745,GCF_902158735","(((superkingdom, d__Archaea), (phylum, p__Halo...",multiple,2,
43630,"GCF_902535955,GCF_902565895","(((superkingdom, d__Bacteria), (phylum, p__Pro...",multiple,2,
43667,"GCF_902705845,GCF_902705865,GCF_902705835","(((superkingdom, d__Bacteria), (phylum, p__Pro...",multiple,3,


In [83]:
a95[a95['lca_rank'] == 'species']

Unnamed: 0,cluster_idents,cluster_lca,cluster_lca_pretty,cluster_len,lca_rank
1,"GCF_000178895,GCA_000007325,GCA_001296185,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fus...",d__Bacteria;p__Fusobacteriota;c__Fusobacteriia...,12,species
2,"GCF_003299955,GCA_000007385,GCF_003300055,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Pro...",d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,360,species
3,GCA_000008085,"(((superkingdom, d__Archaea), (phylum, p__Nano...",d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o_...,1,species
5,"GCA_013178385,GCA_000010565","(((superkingdom, d__Bacteria), (phylum, p__Fir...",d__Bacteria;p__Firmicutes_B;c__Desulfotomaculi...,2,species
7,"GCA_000013525,GCF_014050235,GCA_000167435,GCF_...","(((superkingdom, d__Bacteria), (phylum, p__Fir...",d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,2136,species
...,...,...,...,...,...
43800,"GCF_903986915,GCF_903986855","(((superkingdom, d__Bacteria), (phylum, p__Pro...",d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,2,species
43801,GCF_903994035,"(((superkingdom, d__Bacteria), (phylum, p__Fir...",d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphy...,1,species
43802,GCF_903994045,"(((superkingdom, d__Bacteria), (phylum, p__Fir...",d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphy...,1,species
43803,GCF_904061905,"(((superkingdom, d__Bacteria), (phylum, p__Pro...",d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,1,species
