<a href="https://colab.research.google.com/github/aglucaci/Sankey_Kraken2/blob/main/CreateSankeyForKraken2Output.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gather Data

In [1]:
!wget https://raw.githubusercontent.com/aglucaci/Sankey_Kraken2/refs/heads/main/kreport.tsv

--2025-05-29 19:21:22--  https://raw.githubusercontent.com/aglucaci/Sankey_Kraken2/refs/heads/main/kreport.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1032303 (1008K) [text/plain]
Saving to: ‘kreport.tsv’


2025-05-29 19:21:22 (9.74 MB/s) - ‘kreport.tsv’ saved [1032303/1032303]



# Visualize

In [2]:
import pandas as pd
import plotly.graph_objects as go

In [3]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [9]:
# Path of the report file saved by the wget cell.
kreport_path = "kreport.tsv"

# The file is read with header=None, so column names are supplied explicitly,
# in file order.
kreport_columns = [
    "percent",
    "reads_clade",
    "reads_direct",
    "rank",
    "ncbi_tax_id",
    "indented_name",
]

# Read TSV file
df = pd.read_csv(kreport_path, sep="\t", names=kreport_columns, header=None)

In [6]:
# Display the parsed report to check that the columns were assigned as expected.
df

Unnamed: 0,percent,reads_clade,reads_direct,rank,ncbi_tax_id,indented_name
0,4.28,1136204,1136204,U,0,unclassified
1,95.72,25411548,4530,R,1,root
2,95.70,25405878,555556,R1,131567,cellular organisms
3,93.45,24809864,500540,D,2,Bacteria
4,87.71,23285181,198281,K,3379134,Pseudomonadati
...,...,...,...,...,...,...
15888,0.00,1,0,F,10811,Geminiviridae
15889,0.00,1,0,G,10814,Begomovirus
15890,0.00,1,1,S,371401,Siegesbeckia yellow vein virus
15891,0.00,1,0,R1,2787854,other entries


In [10]:
#import pandas as pd
##import plotly.graph_objects as go

# === Step 1: Load kreport.tsv ===
#df = pd.read_csv("kreport.tsv", sep="\t", header=None,
#                 names=["percent", "reads_clade", "reads_direct", "rank", "taxid", "indented_name"])

# === Step 2: Extract taxonomic level depth and clean names ===
def parse_depth_and_name(indented_name):
    """Split an indented taxon name into its nesting depth and bare name.

    Parameters
    ----------
    indented_name : str
        Name as stored in the report's ``indented_name`` column, i.e. prefixed
        with leading whitespace.

    Returns
    -------
    pd.Series
        Two elements: the depth (number of leading whitespace characters
        floor-divided by 2) and the name with leading whitespace removed.
        Trailing whitespace is left untouched.
    """
    bare_name = indented_name.lstrip()
    leading = len(indented_name) - len(bare_name)
    # Kraken indents by 2 spaces per level
    return pd.Series([leading // 2, bare_name])
# end def

# Add the parsed depth and the whitespace-stripped name as two new columns.
df[['depth', 'name']] = df['indented_name'].apply(parse_depth_and_name)

# === Step 3: Keep only high-read clades ===
# Rows whose clade read count is at or below this threshold are dropped
# (presumably to keep the diagram readable); lower it to show more taxa.
MIN_CLADE_READS = 100000

# Boolean indexing returns a new frame, so no explicit copy or in-place reset is needed.
df = df[df['reads_clade'] > MIN_CLADE_READS].reset_index(drop=True)

# === Step 4: Build node list and Sankey links ===
node_labels = []  # display label of each node, indexed by node id
node_map = {}     # (depth, name) -> index into node_labels
sources, targets, values = [], [], []

# Most recently seen node key at each depth
parent_node = {}

for _, record in df.iterrows():
    level = record['depth']
    taxon = record['name']
    key = (level, taxon)

    # Register the node the first time its key is seen
    if key not in node_map:
        node_map[key] = len(node_labels)
        node_labels.append(taxon)

    # Link from the latest node seen one level up, when there is one;
    # the link width is this row's clade read count.
    above = level - 1
    if level > 0 and above in parent_node:
        sources.append(node_map[parent_node[above]])
        targets.append(node_map[key])
        values.append(record['reads_clade'])

    # This row becomes the candidate parent for following rows one level deeper
    parent_node[level] = key
# end for

# === Step 5: Plot the Sankey ===
# Nodes are labelled from node_labels; each link i runs from sources[i] to
# targets[i] with width values[i].
fig = go.Figure(data=[go.Sankey(
    arrangement="snap",
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
    ),
)])

# One layout call holds every setting. font_size=18 is the value that was actually
# in effect: a font_size=4 setting used to be applied first and was always overridden.
fig.update_layout(
    title_text="Kraken Read Classification Sankey",
    font_size=18,
    width=1400 * 3,   # increase overall figure size
    height=800 * 4,   # adjust height to allow vertical taxonomic levels
    margin=dict(l=50, r=50, t=50, b=50),
)

# Optional interactive preview / HTML export:
#fig.show()
#fig.write_html("kraken_sankey.html")

# Static export to SVG and PNG in the working directory.
fig.write_image("kraken_sankey.svg")
fig.write_image("kraken_sankey.png")