# Imports

In [1]:
import os

import pandas as pd
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show
from bokeh.transform import linear_cmap

import wrangling_funcs

# Reading MGE-cluster results

In [2]:
# Parse the MGE-cluster results file into one list per column and assemble
# the lists into `cluster_df`. Fields are read by position, not by header
# name; in the recorded run the header line was
# tsne1D,tsne2D,Standard_Cluster,Membership_Probability,Sample_Name.
x = []
y = []
cluster = []
mem_prob = []
name = []
accession = []

with open("shigella-sonnei_results.csv") as file:
    # The first line is the header; it is printed for reference only.
    header = file.readline()
    print(header)
    for raw_line in file:
        # A blank line (for example a trailing empty line at the end of the
        # file) has no fields to convert and would make float('') raise a
        # ValueError, so it is skipped.
        if not raw_line.strip():
            continue
        fields = raw_line.strip('\n').split(',')
        # Rows whose first field is "-" are skipped (presumably a placeholder
        # for a missing coordinate -- TODO confirm against the producer).
        if fields[0] == "-":
            continue
        sample_name = fields[4]
        x.append(float(fields[0]))
        y.append(float(fields[1]))
        cluster.append(int(fields[2]))
        mem_prob.append(float(fields[3]))
        name.append(sample_name)
        # The run accession is the part of the sample name before the first "_".
        accession.append(sample_name.split("_")[0])

cluster_df = pd.DataFrame({
        'x' : x,
        'y' : y,
        'cluster' : cluster,
        'mem_prob' : mem_prob,
        'name' : name,
        'run_accession' : accession
})

tsne1D,tsne2D,Standard_Cluster,Membership_Probability,Sample_Name



In [3]:
# Display the assembled cluster table as this cell's output.
cluster_df

Unnamed: 0,x,y,cluster,mem_prob,name,run_accession
0,15.277864,9.125345,16,1.000000,ERR10074377_bin_1,ERR10074377
1,-12.562531,6.045441,4,0.538177,ERR10074377_bin_2,ERR10074377
2,28.140576,-20.610980,5,1.000000,ERR10074377_bin_Isolated_1,ERR10074377
3,-15.224413,11.998804,-1,0.000000,ERR10074377_bin_Unbinned,ERR10074377
4,7.700862,0.403415,25,0.638178,ERR10074378_bin_1,ERR10074378
...,...,...,...,...,...,...
2345,24.678694,-21.191344,5,1.000000,ERR11597012_bin_4,ERR11597012
2346,-3.456516,24.190710,3,0.997902,ERR11597012_bin_5,ERR11597012
2347,1.578662,33.310482,2,1.000000,ERR11597012_bin_6,ERR11597012
2348,0.233269,-32.905337,21,0.340918,ERR11597012_bin_7,ERR11597012


# Reading metadata

In [4]:
# Location of the metadata table. The METADATA_CSV environment variable can
# override it, so the notebook also runs on machines where the original
# author's absolute path does not exist; without the variable the original
# path is used unchanged.
METADATA_CSV = os.environ.get(
    "METADATA_CSV",
    "/home/bayraktar/PycharmProjects/reconstruct_plasmids_snakemake/metadata.csv",
)

# Metadata columns kept for the merge and the plots, in display order.
METADATA_COLUMNS = [
    "run_accession", "scientific_name", "strain", "inferred_collection_year",
    "inferred_source", "inferred_country", "inferred_city", "study_accession",
    "platform_parameters",
]

# Load the full table, then keep only the columns of interest
# (a missing column raises KeyError here).
metadata_df = pd.read_csv(METADATA_CSV, sep=",")
metadata_selection = metadata_df[METADATA_COLUMNS]

In [5]:
# Display the selected metadata columns as this cell's output.
metadata_selection

Unnamed: 0,run_accession,scientific_name,strain,inferred_collection_year,inferred_source,inferred_country,inferred_city,study_accession,platform_parameters
0,ERR10074420,Shigella sonnei,,2012,Stool culture,South Africa,Gauteng,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
1,ERR10074650,Shigella sonnei,,2015,Stool culture,South Africa,Gauteng,ERP140054,INSTRUMENT_MODEL: Illumina HiSeq 4000
2,ERR10074714,Shigella sonnei,,2013,Stool culture,South Africa,Western Cape,ERP140054,INSTRUMENT_MODEL: Illumina HiSeq 4000
3,ERR10074515,Shigella sonnei,,2014,Stool culture,South Africa,KwaZulu-Natal,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
4,ERR10074523,Shigella sonnei,,2014,Stool culture,South Africa,Gauteng,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
...,...,...,...,...,...,...,...,...,...
321,ERR11596999,Shigella sonnei,CL-042,2010,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
322,ERR11596994,Shigella sonnei,CL-037,2012,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
323,ERR11596983,Shigella sonnei,CL-024,2012,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
324,ERR11596972,Shigella sonnei,CL-012,2011,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500


# Merge results with metadata

In [6]:
# Attach the per-run metadata to every cluster row. The inner join keeps only
# rows whose run_accession is present in both tables.
df = cluster_df.merge(metadata_selection, how="inner", on="run_accession")

In [7]:
# Display the merged cluster + metadata table as this cell's output.
df

Unnamed: 0,x,y,cluster,mem_prob,name,run_accession,scientific_name,strain,inferred_collection_year,inferred_source,inferred_country,inferred_city,study_accession,platform_parameters
0,15.277864,9.125345,16,1.000000,ERR10074377_bin_1,ERR10074377,Shigella sonnei,,2015,Stool culture,South Africa,Western Cape,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
1,-12.562531,6.045441,4,0.538177,ERR10074377_bin_2,ERR10074377,Shigella sonnei,,2015,Stool culture,South Africa,Western Cape,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
2,28.140576,-20.610980,5,1.000000,ERR10074377_bin_Isolated_1,ERR10074377,Shigella sonnei,,2015,Stool culture,South Africa,Western Cape,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
3,-15.224413,11.998804,-1,0.000000,ERR10074377_bin_Unbinned,ERR10074377,Shigella sonnei,,2015,Stool culture,South Africa,Western Cape,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
4,7.700862,0.403415,25,0.638178,ERR10074378_bin_1,ERR10074378,Shigella sonnei,,2013,Stool culture,South Africa,Gauteng,ERP140054,INSTRUMENT_MODEL: Illumina NovaSeq 6000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345,24.678694,-21.191344,5,1.000000,ERR11597012_bin_4,ERR11597012,Shigella sonnei,Cl-059,2016,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
2346,-3.456516,24.190710,3,0.997902,ERR11597012_bin_5,ERR11597012,Shigella sonnei,Cl-059,2016,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
2347,1.578662,33.310482,2,1.000000,ERR11597012_bin_6,ERR11597012,Shigella sonnei,Cl-059,2016,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500
2348,0.233269,-32.905337,21,0.340918,ERR11597012_bin_7,ERR11597012,Shigella sonnei,Cl-059,2016,stool,Lebanon,,ERP148544,INSTRUMENT_MODEL: NextSeq 500


# Plotting

In [8]:
# Count the rows of `df` for each (country, source) pair ...
source_counts = df.groupby("inferred_country")["inferred_source"].value_counts()
# ... then pivot the sources into columns and turn the country index back into
# a regular column. Pairs that never occur show up as NaN.
by_country = source_counts.unstack().reset_index()

In [9]:
# Display the per-country source counts as this cell's output.
by_country

inferred_source,inferred_country,Blood culture,Stool culture,Urine culture,feces,stool
0,Belgium,,,,531.0,
1,Colombia,,,,13.0,
2,Lebanon,,,,,170.0
3,South Africa,12.0,1618.0,6.0,,


In [10]:
# Source labels that are merged into the single 'feces_cmb' column.
# Kept as a tuple and copied on each use so neither call can alter the other's
# argument.
STOOL_SOURCES = ("Stool culture", "feces", "stool")

# combine_columns is a project helper that is not visible here; it is expected
# to return a frame carrying the new 'feces_cmb' column -- TODO confirm.
stool_df = wrangling_funcs.combine_columns(by_country, list(STOOL_SOURCES), 'feces_cmb')

# Remove the now-redundant source columns by rebinding rather than with
# inplace=True: if the helper hands back `by_country` itself, an in-place drop
# would also strip these columns from `by_country` and break a re-run of this
# cell.
stool_df = stool_df.drop(columns=list(STOOL_SOURCES))

In [11]:
# Display the combined table; missing counts are still NaN at this point.
stool_df
# Note: if a country has NaN in any of the plotted columns, its bar is not
# drawn, so the NaN values have to be replaced before plotting.

inferred_source,inferred_country,Blood culture,Urine culture,feces_cmb
0,Belgium,,,531.0
1,Colombia,,,13.0
2,Lebanon,,,170.0
3,South Africa,12.0,6.0,1618.0


In [12]:
# Replace missing counts with 0 so every country has a value in each column.
# The result is rebound instead of using inplace=True, which leaves any other
# reference to the previous frame untouched; the int() wrapper around the
# literal 0 was redundant and is dropped.
stool_df = stool_df.fillna(0)
stool_df

inferred_source,inferred_country,Blood culture,Urine culture,feces_cmb
0,Belgium,0.0,0.0,531.0
1,Colombia,0.0,0.0,13.0
2,Lebanon,0.0,0.0,170.0
3,South Africa,12.0,6.0,1618.0


In [14]:
# Route Bokeh output to the notebook.
output_notebook()

# Columns stacked within each country's bar, and the fill color of each layer.
# The same names double as the legend labels.
stack_columns = ["Blood culture", "Urine culture", "feces_cmb"]
stack_colors = ["#0000CC", "#CC6600", "red"]

# Categorical x-axis with one position per country; the title is left empty.
p = figure(title="", height=600, width=700, x_range=stool_df["inferred_country"])

# Stacked vertical bars, read straight from the DataFrame; the white outline
# visually separates neighbouring layers.
p.vbar_stack(
    stack_columns,
    x="inferred_country",
    source=stool_df,
    width=0.9,
    line_width=2.5,
    line_color="white",
    color=stack_colors,
    legend_label=stack_columns,
)

# Legend in the upper-left corner with entries listed vertically.
p.legend.orientation = "vertical"
p.legend.location = "top_left"

# No plot outline and no grid lines.
p.outline_line_color = None
p.grid.grid_line_color = None

# No x-axis line and no outward tick marks.
p.xaxis.major_tick_out = 0
p.xaxis.axis_line_width = 0
p.xaxis.axis_line_color = None

show(p)