In [1]:
import csv

import pandas as pd
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show
from bokeh.transform import linear_cmap

In [2]:
# Load the clustering results CSV into parallel column lists and then into
# `cluster_df`. The lists stay at notebook scope because later cells read them
# (the plotting cell uses `cluster`).
#
# Field order read below: x, y, cluster, membership probability, sample name.
# NOTE(review): assumes the file's columns are in exactly this order — confirm
# against the header line printed by this cell.
x = []
y = []
cluster = []
mem_prob = []
name = []
accession = []


# newline="" lets the csv module handle line endings itself, so CRLF files do
# not leave a stray "\r" on the last field (the sample name), and quoted fields
# containing commas are parsed correctly.
with open("shigella-sonnei_results.csv", newline="") as file:
    header = file.readline()
    print(header.rstrip("\r\n"))
    for row in csv.reader(file):
        # Skip blank lines (e.g. a trailing empty line) and placeholder rows
        # whose first field is "-".
        if not row or row[0] == "-":
            continue
        x.append(float(row[0]))
        y.append(float(row[1]))
        cluster.append(int(row[2]))
        mem_prob.append(float(row[3]))
        name.append(row[4])
        # The text before the first "_" of the sample name is stored as
        # `run_accession`, the key used to join with the metadata table.
        accession.append(row[4].split("_")[0])

cluster_df = pd.DataFrame({
        'x' : x,
        'y' : y,
        'cluster' : cluster,
        'mem_prob' : mem_prob,
        'name' : name,
        'run_accession' : accession
})

tsne1D,tsne2D,Standard_Cluster,Membership_Probability,Sample_Name



In [3]:
# Read the per-run metadata table and keep only the join key (`run_accession`)
# plus the descriptive columns shown in the hover tooltips.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made configurable.
metadata_columns = [
    "run_accession",
    "scientific_name",
    "strain",
    "inferred_collection_year",
    "inferred_source",
    "inferred_country",
    "inferred_city",
    "study_accession",
    "platform_parameters",
]
metadata_df = pd.read_csv(
    "/home/bayraktar/PycharmProjects/reconstruct_plasmids_snakemake/metadata.csv"
)
metadata_selection = metadata_df[metadata_columns]

In [4]:
# Inner join on `run_accession`: only rows present in both the cluster table
# and the metadata selection are kept.
merged_df = cluster_df.merge(metadata_selection, how="inner", on="run_accession")

In [5]:
# Wrap the merged table in a Bokeh data source; glyphs and hover tooltips
# refer to its columns by name.
source = ColumnDataSource(data=merged_df)

In [6]:
# source = ColumnDataSource(
#     data=dict(
#         x = x,
#         y = y,
#         cluster = cluster,
#         mem_prob = mem_prob,
#         name = name,
#         accession = accession
#     )
# )

In [7]:
# Render Bokeh output inline in the notebook.
output_notebook()

# Hover tool: each entry is (label shown to the user, "@column" looked up in
# `source` for the point under the cursor).
tooltips = HoverTool(tooltips=[
    ('X', '@x'),
    ('Y', '@y'),
    ('cluster', '@cluster'),
    ('mem_prob', '@mem_prob'),
    ('name', '@name'),
    ('run_accession', '@run_accession'),
    ('scientific_name', '@scientific_name'),
    ('strain', '@strain'),
    ('year', '@inferred_collection_year'),
    ('source', '@inferred_source'),
    ('country', '@inferred_country'),
    ('city', '@inferred_city'),
    ('study', '@study_accession'),
    ('platform', '@platform_parameters'),
])

# Take the colour range from the clusters that are actually plotted (the data
# in `source`), not from the pre-merge list: rows dropped by the join would
# otherwise stretch the scale. `default=0` keeps an empty source from raising.
plotted_clusters = source.data["cluster"]
color_mapper = linear_cmap(
    field_name='cluster',
    palette='Viridis256',
    low=int(min(plotted_clusters, default=0)),
    high=int(max(plotted_clusters, default=0)),
)

p = figure(
    width=700,
    height=700,
    tools=['pan', 'wheel_zoom', 'reset'],
    title="MGE-cluster clusters",
    x_axis_label='tsne1D',
    y_axis_label='tsne2D',
)
# `scatter` draws circle markers by default; `figure.circle(size=...)` is
# deprecated in recent Bokeh releases.
p.scatter(x="x", y="y", size=10, alpha=0.5, source=source, color=color_mapper)
p.add_tools(tooltips)

show(p)

