In [1]:
import pandas as pd
import plotly.graph_objects as go

#load the article table; later cells read its "Substrate", "Organisms" and
#"Product" columns and split each of them on ", "
df = pd.read_csv("defined_articles.csv")

In [2]:
import plotly.express as px
import random

def pick_colour(name):
    """Return a colour from Plotly's qualitative palette for ``name``.

    The colour is derived from a CRC-32 checksum of ``name`` instead of
    ``random.choice``, so a given name is mapped to the same colour on every
    run and the exported figures are reproducible. Different names can still
    end up with the same colour because the palette is finite.

    Parameters
    ----------
    name : str
        Label to colour (non-string values are converted with ``str``).

    Returns
    -------
    str
        One entry of ``px.colors.qualitative.Plotly``.
    """
    import zlib  #local import so the cell does not depend on a new top-level import

    color_wheel = px.colors.qualitative.Plotly
    #crc32 is stable between interpreter sessions, unlike the built-in hash() of a str
    checksum = zlib.crc32(str(name).encode("utf-8"))
    return color_wheel[checksum % len(color_wheel)]

In [3]:
#creating a new dataframe to use in building the sankey chart

#one colour per organism, assigned the first time the organism is seen
organism_colouring = dict()
colour_counter = 0  #not read by any visible cell; kept so the cell still defines it

#the links are collected in a plain list and turned into a dataframe once at
#the end, instead of appending to the dataframe one row at a time
link_rows = []

#iterate over the rows of the dataframe
for index, row in df.iterrows():

    #get lists of substrates, products, and organisms
    substrates = row["Substrate"].split(", ")
    organisms = row["Organisms"].split(", ")
    products = row["Product"].split(", ")

    #adds colour to each organism using the pick_colour function
    for organism in organisms:
        if organism not in organism_colouring:
            organism_colouring[organism] = pick_colour(organism)

    #add links from each substrate to each organism
    for substrate in substrates:
        for organism in organisms:
            link_rows.append([substrate, organism, 1, organism_colouring[organism]])

    #add links from each organism to each product
    for organism in organisms:
        for product in products:
            link_rows.append([organism, product, 1, organism_colouring[organism]])

#one row per link: source label, target label, link weight, link colour
sankey_data = pd.DataFrame(link_rows, columns=['Source', 'Target', 'Value', 'Color'])

In [4]:
#needed modification for building the plot

#get each unique node label (every source and target) and a mapping to its index;
#the Color column is deliberately left out so colour strings do not end up in
#the node list as extra, unlinked nodes
unique_source_target = list(pd.unique(sankey_data[['Source', 'Target']].values.ravel('K')))
mapping_dict = {label: position for position, label in enumerate(unique_source_target)}

#NOTE: Source/Target are overwritten in place with node indices, so re-running
#this cell without rebuilding sankey_data first maps the indices to NaN
sankey_data['Source'] = sankey_data['Source'].map(mapping_dict)
sankey_data['Target'] = sankey_data['Target'].map(mapping_dict)
sankey_dict = sankey_data.to_dict(orient='list')

#setting colours for the nodes: organisms get their assigned colour,
#every other node (substrates and products) is grey
node_colours = [organism_colouring.get(i, "grey") for i in unique_source_target]

In [5]:
#create diagram
import plotly.express as px

#node appearance: labels come from unique_source_target, colours from node_colours
node_settings = dict(
    pad = 15,
    thickness = 20,
    line = dict(color = "black", width = 0.5),
    label = unique_source_target,
    color = node_colours,
)

#link endpoints, weights and colours, taken column by column from sankey_dict
link_settings = {
    "source": sankey_dict["Source"],
    "target": sankey_dict["Target"],
    "value": sankey_dict["Value"],
    "color": sankey_dict["Color"],
}

sankey_trace = go.Sankey(orientation = "h", node = node_settings, link = link_settings)
fig = go.Figure(data=[sankey_trace])

#figure-level settings: title, height and font size
layout = {
    "title": "Sankey Diagram for consortia uses",
    "height": 850,
    "font": {"size": 8},
}

#show figure
fig.update_layout(layout)
fig.show()

In [11]:
#save the figure as a .png file (writes whatever `fig` currently holds)
fig.write_image("plots/sankey_diagram.png")

In [10]:
#new sankey chart for genus only

#creating a new dataframe to use in building the sankey chart

#one colour per genus, assigned the first time the genus is seen
organism_colouring = dict()
colour_counter = 0  #not read by any visible cell; kept so the cell still defines it

#the links are collected in a plain list and turned into a dataframe once at
#the end, instead of appending to the dataframe one row at a time
link_rows = []

#iterate over the rows of the dataframe
for index, row in df.iterrows():

    #get lists of substrates, products, and organisms
    substrates = row["Substrate"].split(", ")
    organisms = row["Organisms"].split(", ")
    products = row["Product"].split(", ")

    #taking the first word to get the genus instead of the organisms
    genuses = [organism.split(' ')[0] for organism in organisms]

    #adds colour to each genus using the pick_colour function
    for genus in genuses:
        if genus not in organism_colouring:
            organism_colouring[genus] = pick_colour(genus)

    #add links from each substrate to each genus
    for substrate in substrates:
        for genus in genuses:
            link_rows.append([substrate, genus, 1, organism_colouring[genus]])

    #add links from each genus to each product; this loops over genuses (one
    #entry per organism) so every genus gets its own product links, instead of
    #reusing the `genus` value left over from the previous loop
    for genus in genuses:
        for product in products:
            link_rows.append([genus, product, 1, organism_colouring[genus]])

#one row per link: source label, target label, link weight, link colour
sankey_data = pd.DataFrame(link_rows, columns=['Source', 'Target', 'Value', 'Color'])


#needed modification for building the plot

#get each unique node label (every source and target) and a mapping to its index;
#the Color column is deliberately left out so colour strings do not end up in
#the node list as extra, unlinked nodes
unique_source_target = list(pd.unique(sankey_data[['Source', 'Target']].values.ravel('K')))
mapping_dict = {label: position for position, label in enumerate(unique_source_target)}
sankey_data['Source'] = sankey_data['Source'].map(mapping_dict)
sankey_data['Target'] = sankey_data['Target'].map(mapping_dict)
sankey_dict = sankey_data.to_dict(orient='list')

#setting colours for the nodes: genuses get their assigned colour,
#every other node (substrates and products) is grey
node_colours = [organism_colouring.get(i, "grey") for i in unique_source_target]

#create diagram
import plotly.express as px

fig = go.Figure(data=[go.Sankey(
    orientation = "h",
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = unique_source_target,
      color = node_colours
    ),
    link = dict(
      source = sankey_dict["Source"],
      target = sankey_dict["Target"],
      value = sankey_dict["Value"],
      color = sankey_dict["Color"]
  ))])

layout = dict(
    title = "Sankey Diagram for consortia uses",
    height = 850,
    font = dict(
      size = 8),)

#show figure and save it as a .pdf file
fig.update_layout(layout)
fig.show()
fig.write_image("plots/sankey_diagram_genus.pdf")