In [5]:
# %%
import os

import altair as alt
import pandas as pd

# Read in data
# The location of the batch-summary CSV can be overridden with the
# BUSCO_SUMMARY_CSV environment variable so the notebook is not tied to a
# single machine; the original local path is kept as the fallback.
SUMMARY_CSV = os.environ.get(
    "BUSCO_SUMMARY_CSV",
    "/Users/santiago/Downloads/all_batch_summeries (2).csv",
)

# Only columns 1-13 are loaded; column 0 is skipped
# (presumably a saved row index -- TODO confirm against the CSV).
df = pd.read_csv(SUMMARY_CSV, usecols=range(1, 14))

# Clean column names: spaces become underscores, every character that is not
# alphanumeric or an underscore is dropped, and the result is lower-cased.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('[^a-zA-Z0-9_]', '', regex=True)
    .str.lower()
)

# Make field for mag id
# Strip only the final extension. Cutting at the first dot would truncate
# names that contain dots themselves (e.g. "bin.1.fa" -> "bin") and make
# distinct MAGs collapse onto the same id.
df["mag_id"] = df["input_file"].str.rsplit('.', n=1).str[0]

# Percent strings such as "1.5%" -> 1.5 (float)
df["percent_gaps"] = df["percent_gaps"].str.rstrip('%').astype(float)

# Pivot long: the four BUSCO category columns are stacked into
# (category, BUSCO_percentage) pairs, one row per MAG and category, while the
# identifier and assembly-statistic columns are repeated on every row.
melt_id_columns = [
    "sample_id",
    "mag_id",
    "dataset",
    "n_markers",
    'scaffold_n50',
    'contigs_n50',
    'percent_gaps',
    'number_of_scaffolds',
]
melt_value_columns = ["single", "duplicated", "fragmented", "missing"]

df2 = df.melt(
    id_vars=melt_id_columns,
    value_vars=melt_value_columns,
    var_name="category",
    value_name="BUSCO_percentage",
)

# Wide table with one row per MAG: identifiers plus the assembly statistics.
assembly_columns = [
    "sample_id",
    "mag_id",
    'scaffold_n50',
    'contigs_n50',
    'percent_gaps',
    'number_of_scaffolds',
]
df3 = df.loc[:, assembly_columns]

# Specify order: numeric rank per category, used to order the stacked bar segments
mapping = {"single": 1, "duplicated": 2, "fragmented": 3, "missing": 4}
df2["order"] = df2["category"].map(mapping)

# Approximate marker count per category, rendered as "~count/total".
# The total is taken from each row's own n_markers instead of a hard-coded
# 124, so the label stays correct for lineage datasets of a different size.
approx_marker_counts = (
    (df2["BUSCO_percentage"] * df2["n_markers"] / 100).round().astype(int)
)
df2["fracc_markers"] = (
    "~"
    + approx_marker_counts.astype(str)
    + "/"
    + df2["n_markers"].astype(int).astype(str)
)
df2


# %%
# Plot
# Normalized stacked bars of the BUSCO categories: one bar per MAG, one facet
# row per sample. Colours are pinned to the categories through domain/range_.
domain = ['single', 'duplicated', 'fragmented', 'missing']
range_ = ['#1E90FF', '#87CEFA', '#FFA500', '#FF7F50']

busco_bars = alt.Chart(df2).mark_bar().encode(
    x=alt.X(
        'sum(BUSCO_percentage)',
        stack="normalize",
        title="BUSCO fraction"
    ),
    y=alt.Y('mag_id', axis=alt.Axis(title='MAG ID')),
    color=alt.Color(
        'category',
        scale=alt.Scale(domain=domain, range=range_),
        legend=alt.Legend(title="BUSCO Category", orient="top")
    ),
    # Stack the segments in the single -> duplicated -> fragmented -> missing order
    order=alt.Order('order', sort='ascending'),
    tooltip=[
        alt.Tooltip("sample_id", title="Sample ID"),
        alt.Tooltip("mag_id", title="MAG ID"),
        alt.Tooltip("dataset", title="Lineage dataset"),
        alt.Tooltip("fracc_markers", title="Approx. number of markers in this category"),
        alt.Tooltip("BUSCO_percentage", title="Percentage [%]"),
    ],
    opacity=alt.value(0.85),
).properties(width=600, height=18*9)

# Independent y scales so each sample's row lists only its own MAGs.
# Font sizes are not configured here: configure_* is applied once to the
# combined chart, because a chart carrying its own config cannot be concatenated.
output_plot = busco_bars.facet(
    row=alt.Row("sample_id", title='Sample ID')
).resolve_scale(y="independent")

output_plot

# %%
# Plot
# The BUSCO colour domain/range are already defined with the first chart and
# are not used in this section, so they are not redefined here.

# Drop down menu listing the assembly-statistic columns that can be shown
dropdown = alt.binding_select(
    options=[
        'scaffold_n50',
        'contigs_n50',
        'percent_gaps',
        'number_of_scaffolds',
    ],
    name="Assembly Statistics: "
)

# Parameter holding the name of the selected column; starts on scaffold_n50
xcol_param = alt.param(
    value='scaffold_n50',
    bind=dropdown
)

# Bar chart of the assembly statistic currently selected in the drop-down:
# one bar per MAG, one facet row per sample.
assembly_bars = alt.Chart(df3).mark_bar().encode(
    x=alt.X('x:Q').title('Assembly Statistic'),
    # The y axis is hidden; presumably the MAG labels are read off the BUSCO
    # chart placed to the left -- both charts use the default mag_id ordering.
    y=alt.Y('mag_id', axis=None),
    tooltip=[
        alt.Tooltip('x:Q', title="value"),
    ],
    opacity=alt.value(0.85)
).transform_calculate(
    # Look up the column named by the parameter's current value on each row
    x=f'datum[{xcol_param.name}]'
).add_params(
    xcol_param
).properties(
    width=600,
    height=18*9
)

# Facet headers are suppressed here (no title, label font size 0)
output_plot2 = assembly_bars.facet(
    row=alt.Row("sample_id", title=None, header=alt.Header(labelFontSize=0))
).resolve_scale(
    y="independent"
)

# Put the BUSCO chart and the assembly-statistic chart next to each other and
# apply the shared font sizes once, on the combined top-level chart.
out3 = (
    alt.hconcat(output_plot, output_plot2, spacing=3)  # Adjust spacing as needed
    .configure_axis(labelFontSize=17, titleFontSize=20)
    .configure_legend(labelFontSize=17, titleFontSize=20)
    .configure_header(labelFontSize=17, titleFontSize=20)
)
out3



In [3]:
# Display the cleaned summary table as the cell output.
# NOTE(review): this cell's execution count (In [3]) is lower than that of the
# cell above (In [5]), so the output shown may predate the latest run of the
# loading code -- confirm with Restart Kernel -> Run All.
df

Unnamed: 0,input_file,dataset,complete,single,duplicated,fragmented,missing,n_markers,scaffold_n50,contigs_n50,percent_gaps,number_of_scaffolds,sample_id,mag_id
0,67392c6c-9f45-4c84-85f5-ae0bfc668892.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,170295,170295,0.0,27,sample1,67392c6c-9f45-4c84-85f5-ae0bfc668892
1,67123d05-b5ae-4a53-873b-727952881899.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,109922,109922,0.0,65,sample1,67123d05-b5ae-4a53-873b-727952881899
2,311112c9-7f8b-460c-9cad-3864af3148c2.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,111744,111744,0.0,127,sample1,311112c9-7f8b-460c-9cad-3864af3148c2
3,fa0a025e-3fac-4e6b-8736-69a38c33b3f5.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114741,114741,0.0,63,sample2,fa0a025e-3fac-4e6b-8736-69a38c33b3f5
4,12f0542c-8375-4ec2-b25a-515962533a7a.fasta,bacteria_odb10,96.0,95.2,0.8,0.8,3.2,124,171535,171535,0.0,26,sample2,12f0542c-8375-4ec2-b25a-515962533a7a
5,971335f0-18b9-422e-81a7-5862c49b1e3d.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,104568,104568,0.0,127,sample2,971335f0-18b9-422e-81a7-5862c49b1e3d
6,040d5cd4-4230-4533-a0a9-f03e3263c338.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,89358,89358,0.0,132,sample3,040d5cd4-4230-4533-a0a9-f03e3263c338
7,e08a46f3-6fbe-4415-b10f-f11d0d187d17.fasta,bacteria_odb10,97.6,96.0,1.6,1.6,0.8,124,114888,114888,0.0,60,sample3,e08a46f3-6fbe-4415-b10f-f11d0d187d17
8,e24225c7-6455-49b1-ad8d-86f95dfbfa2e.fasta,bacteria_odb10,97.6,96.8,0.8,0.8,1.6,124,171506,171506,0.0,27,sample3,e24225c7-6455-49b1-ad8d-86f95dfbfa2e
