### Number of Downloaded, Assembled, Binned, and Taxonomically Identified Cyanobacteria Isolates

In [14]:
# Adapted from Tony Liu's code
import plotly.graph_objects as go

def make_links(title: str, colours: list, vals: list[int], overwrites=None):
    """Build the Sankey link spec and node labels for one processing pipeline.

    The pipeline has a root node followed by four stages (Downloaded,
    Assembled, Binned, Classified).  Every stage also gets a matching
    "failed" node that receives the samples lost at that step.

    Args:
        title: Name of the root node; shown as ``"<title>: <vals[0]>"``.
        colours: Two colours; ``colours[0]`` is used for the links and
            ``colours[1]`` is returned as ``lab_colour`` for the nodes.
        vals: Sample counts.  ``vals[0]`` is the starting total and each
            following entry is the count that survived the next stage.
            At most five entries (root + four stages) are supported.
        overwrites: Optional iterable of ``(index, label)`` pairs that
            replace base labels before the counts are appended to them.

    Returns:
        A ``(link, labels)`` tuple.  ``link`` is a dict with parallel
        ``source`` / ``target`` / ``value`` lists plus ``color`` and
        ``lab_colour``.  ``labels`` lists the node labels, with the
        root and stage labels wrapped in ``<b>...</b>``.

    Raises:
        ValueError: If ``vals`` is empty or has more entries than there
            are nodes to attach them to.
    """
    print(title, vals)

    stages = ["Downloaded", "Assembled", "Binned", "Classified"]
    fails = ["Download failed", "Assembly failed", "Binning failed", "Classification failed"]

    if not vals:
        raise ValueError("vals must contain at least the starting count")
    if len(vals) > len(stages) + 1:
        # Extra counts would otherwise be written onto the "failed" labels
        # and point at node indices that do not exist.
        raise ValueError(
            f"vals has {len(vals)} entries; at most {len(stages) + 1} are supported"
        )

    total = vals[0]
    # Build the list directly (instead of formatting and splitting a string)
    # so a title that itself contains ", " stays a single label.
    labels = [f"{title}: {total}"] + stages
    foff = len(labels)  # index of the first "failed" node
    labels += fails

    if overwrites is not None:
        for i, new in overwrites:
            labels[i] = new

    src, lnk, val = [], [], []
    for i in range(1, len(vals)):
        previous, current = vals[i - 1], vals[i]

        # Samples that made it through this stage.
        src.append(i - 1)
        lnk.append(i)
        val.append(current)

        # Samples lost at this stage go to the matching "failed" node.
        src.append(i - 1)
        lnk.append(foff - 1 + i)
        val.append(previous - current)

        # Percentage of the starting total, to one decimal place; an empty
        # pipeline is reported as 0.0% instead of dividing by zero.
        percent = round(current * 1000 / total) / 10 if total else 0.0
        labels[i] += f': {current} ({percent}%)'

    return dict(
        source = src,
        target = lnk,
        value = val,
        color = colours[0],
        lab_colour = colours[1]
    ), [f"<b>{text}</b>" if i < foff else text for i, text in enumerate(labels)]

def sankey(links: list[tuple[dict, list]]):
    """Merge several ``(link, labels)`` pairs into one Sankey figure and show it.

    Each pair is expected in the shape returned by ``make_links``: a dict
    with ``source``, ``target``, ``value``, ``color`` and ``lab_colour``
    entries, plus the list of node labels those indices refer to.  Node
    indices of every pair are shifted by the number of labels already
    collected so that all pairs can share a single node list.
    """
    node_labels = []
    node_colours = []
    merged = {}

    for link_spec, labels in links:
        # Indices in this spec are local to `labels`; shift them past the
        # nodes contributed by the earlier specs.
        shift = len(node_labels)
        node_labels.extend(labels)

        merged.setdefault('source', []).extend(idx + shift for idx in link_spec['source'])
        merged.setdefault('target', []).extend(idx + shift for idx in link_spec['target'])
        merged.setdefault('value', []).extend(link_spec['value'])
        # One colour entry per link / per node of this spec.
        merged.setdefault('color', []).extend([link_spec['color']] * len(link_spec['value']))
        node_colours.extend([link_spec['lab_colour']] * len(labels))

    node_style = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0),
        label = node_labels,
        color = node_colours
    )
    fig = go.Figure(data=[go.Sankey(node = node_style, link = merged)])

    # paper_bgcolor="white" is intentionally left unset.
    fig.update_layout(
        autosize=False,
        width=2200,
        height=900,
        margin=dict(l=25, r=25, b=25, t=70, pad=4),
        font_family="Times New Roman",
        font_color="black",
        font_size=27
    )
    fig.show()

# Draw both sequencing-platform pipelines in a single Sankey diagram.
# Counts are ordered: starting total, downloaded, assembled, binned, classified.
sankey([
    make_links(
        title="SRA/Illumina Isolates",
        colours=["#A6CCFF", "#3374CC"],
        vals=[144, 142, 139, 138, 137],
    ),
    make_links(
        title="SRA/Pacbio Isolates",
        colours=["#FFB5A6", "#E65639"],
        vals=[61, 61, 60, 52, 52],
    ),
])

SRA/Illumina Isolates [144, 142, 139, 138, 137]
SRA/Pacbio Isolates [61, 61, 60, 52, 52]


### Illumina vs PacBio Cyanobacteria Isolates (NCBI, Processed, Threshold-Filtered)

In [10]:
import os
import pandas as pd
import plotly.express as px

# Base path for data files: the working directory with a trailing
# 'Stats_Analyses' removed (left as-is when that suffix is absent).
REPO = os.path.abspath('').removesuffix('Stats_Analyses')

dfsampleAna = pd.read_excel(
    REPO + 'Data/Processed.xlsx',
    sheet_name='PIsolateAna',
)

# One bar series per sequencing platform, each with a fixed colour.
figsa = px.bar(
    dfsampleAna,
    x='Title',
    y=['Illumina isolates', 'Pacbio isolates'],
    color_discrete_map={
        'Illumina isolates': 'rgb(128,177,211)',
        'Pacbio isolates': '#FF9DA6',
    },
)
figsa.update_layout(xaxis_title=None)
figsa.update_yaxes(title='The Number Of isolates')
figsa.show()

### Completeness vs. Contamination Distribution for Cyanobacterial and Non-cyanobacterial MAGs

In [15]:
# Adapted from Tony Liu's code
import plotly.subplots as sp

def dict_update(d1: dict, d2: dict, no_copy=False):
    """Return ``d1`` updated with the entries of ``d2``.

    By default the update is applied to a shallow copy, leaving ``d1``
    untouched.  With ``no_copy=True`` the update is applied to ``d1``
    itself, and that same object is returned.
    """
    target = d1 if no_copy else d1.copy()
    target.update(d2)
    return target

# Load the tab-separated records table.
df = pd.read_csv(REPO + 'Data/records.tsv', sep='\t')

def comp(df):
    """Return the ``'completeness'`` entry (column) of ``df``."""
    completeness = df['completeness']
    return completeness

def cont(df):
    """Return the ``'contamination'`` entry (column) of ``df``."""
    contamination = df['contamination']
    return contamination


# Split the records by lineage: rows labelled 'Cyanobacteria' vs. all remaining rows.
cy = df[df['lineage'].eq('Cyanobacteria')]
oth = df[df['lineage'].ne('Cyanobacteria')]

# Show the last rows as a quick check of the loaded table.
df.tail()

Unnamed: 0,sample,bin,completeness,contamination,lineage
344,SRR6232652,bin.0,67.31,4.111,Cyanobacteria
345,SRR5830105,bin.0,90.93,0.109,Cyanobacteria
346,SRR6008331,bin.0,99.56,0.222,Cyanobacteria
347,SRR6232650,bin.1,68.19,4.718,Cyanobacteria
348,SRR6232660,bin.1,63.8,3.829,Cyanobacteria


In [16]:
# settings

# Faint grey for grid/zero lines; fully transparent for the axis line itself.
axis_col = 'rgba(0, 0, 0, 0.15)'
no_col = 'rgba(0, 0, 0, 0)'

# Axis styling shared (as the same dict object) by every axis in `layout`.
axis_desc: dict = {
    'linecolor': no_col,
    'gridcolor': axis_col,
    'zerolinecolor': axis_col,
    'zerolinewidth': 1,
}

# Base figure layout; paper_bgcolor="white" is intentionally left unset.
layout = {
    'autosize': False,
    'width': 1300,
    'height': 650,
    'margin': {'l': 25, 'r': 25, 'b': 25, 't': 25, 'pad': 5},
    'font_family': "Times New Roman",
    'font_color': "black",
    'font_size': 20,
    'plot_bgcolor': 'white',
    'xaxis': axis_desc,
    'yaxis': axis_desc,
    'xaxis2': axis_desc,
    'yaxis2': axis_desc,
}

In [19]:
def make_traces():
    """Return two marker-only scatter traces: other lineages first, then Cyanobacteria.

    Both traces plot contamination (y) against completeness (x), read from
    the module-level ``oth`` and ``cy`` frames, and share marker size and
    opacity; only the colour and legend name differ.
    """
    marker_size, marker_opacity = 7, 0.3
    trace_specs = (
        (oth, '#FF5500', "Co-occurring Microorganisms"),
        (cy, '#3ACC33', "Cyanobacteria"),
    )
    return [
        go.Scatter(
            y=cont(frame),
            x=comp(frame),
            mode='markers',
            marker=dict(size=marker_size, color=colour, opacity=marker_opacity),
            name=legend_name
        )
        for frame, colour, legend_name in trace_specs
    ]
# Two side-by-side panels sharing both axes, with one common x-axis title.
fig = sp.make_subplots(
    rows=1,
    cols=2,
    shared_xaxes=True,
    shared_yaxes=True,
    horizontal_spacing=0.02,
    x_title="% Completeness",
)

# Place each trace in its own column of the single row.
for i, tr in enumerate(make_traces()):
    fig.add_trace(tr, row=1, col=i+1)

# Start from a copy of the shared layout and override the axis titles;
# only the left-hand y axis carries a title.
_layout = dict_update(layout, {
    'xaxis2': dict_update(axis_desc, {'title': ""}),
    'xaxis': dict_update(axis_desc, {'title': ""}),
    'yaxis': dict_update(axis_desc, {'title': "% Contamination"}),
})

fig.update_annotations(font_size=24)
fig.update_layout(go.Layout(**_layout))
fig.show()

### Illumina vs PacBio MAGs (Processed, Threshold-Filtered)

In [12]:
dfbinAna = pd.read_excel(
    REPO + 'Data/Processed.xlsx',
    sheet_name='PMAGAna',
)

# One bar series per sequencing platform, each with a fixed colour.
figba = px.bar(
    dfbinAna,
    x='Title',
    y=['MAGs from Illumina isolates', 'MAGs from Pacbio isolates'],
    color_discrete_map={
        'MAGs from Illumina isolates': '#FEAF16',
        'MAGs from Pacbio isolates': 'rgb(17, 165, 121)',
    },
)
figba.update_layout(xaxis_title=None)
figba.update_yaxes(title='The Number Of MAGs')
figba.show()