### Comparison between NCBI Specified and GTDB-Tk Identified Cyanobacterial Taxonomies at Each Hierarchical Level

In [2]:
import os
import pandas as pd
import plotly.express as px
# Repository root used as a plain string prefix (REPO + 'Data/...').
# The current working directory is taken and a trailing 'Stats_Analyses'
# component, if present, is stripped, leaving '<repo>/'.
REPO = os.path.abspath('').removesuffix('Stats_Analyses')
# If the notebook was started from a directory that does not end in
# 'Stats_Analyses', nothing is stripped and the prefix has no trailing
# separator; add one so that REPO + 'Data/Obj2.xlsx' is still a valid path.
if not REPO.endswith(os.sep):
    REPO += os.sep

# Load the NCBI-specified taxonomy sheet and convert it to a list of rows.
dfncbiTax = pd.read_excel(REPO + 'Data/Obj2.xlsx', sheet_name='NCBITaxa')
ncbiTaxlist = dfncbiTax.values.tolist()
# Filled by the parsing loop below: first column value -> {rank name: taxon name}.
ncbiTax = {}

# Processing NCBI specified taxa
# Each row holds an identifier in column 0 and a lineage string in column 1.
# The lineage is a ';'-separated list of 'Rank: Name' entries; the result is
# stored as ncbiTax[identifier] = {rank: name}, ignoring Subclass and Strain.
for encbitax in ncbiTaxlist:
    sra = encbitax[0]
    lineage = encbitax[1]
    # An empty spreadsheet cell is read as NaN, which has no .split();
    # skip such rows, mirroring the null check used for the GTDB-Tk sheet.
    if pd.isnull(lineage):
        continue
    sampletaxlist = {}
    for level in lineage.split(";"):
        # partition() splits on the first ':' only, so a taxon name that itself
        # contains ':' is kept whole, and a segment without ':' is detectable.
        levelName, separator, levelValue = level.partition(":")
        if not separator:
            # Segment without 'Rank: Name' form (e.g. the empty piece left by
            # a trailing ';') - nothing to record.
            continue
        levelName = levelName.strip()
        if levelName not in ("Subclass", "Strain"):
            sampletaxlist[levelName] = levelValue.strip()
    ncbiTax[sra] = sampletaxlist

# Processing gtdbtk identified taxa
# Build gtdbtkTax[identifier] = {rank: name} from the 'gtdbtkTaxa' sheet.
dfgtdbtkTax = pd.read_excel(REPO + 'Data/Obj2.xlsx', sheet_name='gtdbtkTaxa')
gtdbtkTaxlist = dfgtdbtkTax.values.tolist()
gtdbtkTax = {}
# Rank names assigned, in order, to positions 1..6 of the ';'-split lineage
# (position 0 is not stored).
taxnamelist = ["Phylum", "Class", "Order", "Family", "Genus", "Species"]
for egtdbtktax in gtdbtkTaxlist:
    sra1 = egtdbtktax[0]
    lineage1 = egtdbtktax[1]
    # Skip rows with no lineage and rows whose cell spans more than one line.
    if pd.isnull(lineage1) or lineage1.count("\n") != 0:
        continue
    taxlevel1 = lineage1.split(";")
    sampletaxlist1 = {}
    for position, rankName in enumerate(taxnamelist, start=1):
        sampletaxlist1[rankName] = taxlevel1[position].strip()
    gtdbtkTax[sra1] = sampletaxlist1

## Comparing Order, Family, Genus, Species level taxa
## Phylum and Class excluded as they just used different nomenclature
ordertable = []
familytable = []
genustable = []
speciestable = []
# Maps each compared rank to the table that collects its rows;
# ranks that are not listed here are not recorded.
tableByRank = {
    "Order": ordertable,
    "Family": familytable,
    "Genus": genustable,
    "Species": speciestable,
}
for key, gtdbtksampletax in gtdbtkTax.items():
    # Only samples present in both sources can be compared.
    if key not in ncbiTax:
        continue
    ncbisampletax = ncbiTax[key]
    for key1, gtdbtkval in gtdbtksampletax.items():
        if key1 not in tableByRank:
            continue
        # A rank missing on the NCBI side is compared as an empty string.
        ncbival = ncbisampletax.get(key1, "")
        tableByRank[key1].append([key, ncbival, gtdbtkval, gtdbtkval == ncbival])

# One frame per rank: sample id, NCBI name, GTDB-Tk name, exact-match flag.
dfOrder = pd.DataFrame(
    ordertable, columns=['SRA', 'NCBI_Order', "gtdbtk_Order", "SameOrNot"]
)
dfFamily = pd.DataFrame(
    familytable, columns=['SRA', 'NCBI_Family', "gtdbtk_Family", "SameOrNot"]
)
dfGenus = pd.DataFrame(
    genustable, columns=['SRA', 'NCBI_Genus', "gtdbtk_Genus", "SameOrNot"]
)
dfSpecies = pd.DataFrame(
    speciestable, columns=['SRA', 'NCBI_Species', "gtdbtk_Species", "SameOrNot"]
)

## The dataframes were transferred into Excel sheets for further comparisons

### Distribution of Cyanobacterial Isolates by the Level at Which Identified and Specified Taxonomies First Diverge

In [24]:
# Pie chart of isolate counts per first-divergence level, read from the
# 'DivAna' sheet of the workbook.
dfdivAna = pd.read_excel(REPO + 'Data/Obj2.xlsx', sheet_name='DivAna')
figdiv = px.pie(
    dfdivAna,
    values='Number of Isolates',
    names='FirstDivergence',
    color_discrete_sequence=px.colors.sequential.Magenta,
).update_traces(sort=False)  # sort=False: do not reorder the sectors by size
figdiv.show()

### Distribution of NCBI vs GTDB-Tk Cyanobacteria Taxonomies (Order Level)

In [5]:
# Order-level names from both sources; the NCBI list and its (still empty)
# counter are prepared here and consumed by the next cell.
odgtdbtklist = dfOrder["gtdbtk_Order"].tolist()
odNCBIlist = dfOrder["NCBI_Order"].tolist()
ordergtdbtk = {}
orderNCBI = {}
# Tally how many samples fall into each GTDB-Tk order (first-seen order kept).
for odg in odgtdbtklist:
    ordergtdbtk[odg] = ordergtdbtk.get(odg, 0) + 1
resultList = [[orderName, sampleCount] for orderName, sampleCount in ordergtdbtk.items()]
dfgdtbtk = pd.DataFrame(resultList, columns=['Order', 'Distribution'])
# Not the last expression of the cell, so this preview is not rendered.
dfgdtbtk.head()
figgtdbd = px.pie(dfgdtbtk, values='Distribution', names='Order')
figgtdbd.show()

In [6]:
# Tally how many samples fall into each NCBI order (first-seen order kept);
# odNCBIlist and orderNCBI come from the previous cell.
for odN in odNCBIlist:
    orderNCBI[odN] = orderNCBI.get(odN, 0) + 1
resultList2 = [[orderName, sampleCount] for orderName, sampleCount in orderNCBI.items()]
dfNCBI = pd.DataFrame(resultList2, columns=['Order', 'Distribution'])
# Not the last expression of the cell, so this preview is not rendered.
dfNCBI.head()
figNCBId = px.pie(dfNCBI, values='Distribution', names='Order')
figNCBId.show()

### Levels with Similar Taxonomies after Diverging from Order

In [14]:
# Bar chart of isolate counts per category, read from the 'DivordconvAna'
# sheet of the workbook.
dfordDif = pd.read_excel(REPO + 'Data/Obj2.xlsx', sheet_name='DivordconvAna')
figordDif = px.bar(
    dfordDif,
    x='Categorization of isolates where order not the same',
    y=['Number of Isolates'],
    color_discrete_map={'Number of Isolates': 'rgb(93,105,177)'},
)
# Drop the x-axis title and the legend; relabel the y axis.
figordDif.update_layout(xaxis_title=None, showlegend=False)
figordDif.update_yaxes(title='The Number Of Isolates')
figordDif.show()