In [93]:
import pandas as pd
import duckdb
# NOTE(review): hvplot.pandas and DeltaTable are not referenced by name in the
# cells visible here; presumably kept for import side effects or later cells — confirm.
import hvplot.pandas
from deltalake import DeltaTable

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
# Make "nteract" the default renderer used when Plotly figures are shown.
pio.renderers.default = "nteract"

# Used by prepare_sankey to turn palette entries into RGB tuples.
import matplotlib.colors


In [94]:
# Load the daily ridership table.
df = pd.read_parquet('./data/ridership/passengers_daily.parquet')

# Total passengers per (origin, destination) pair; the query's `FROM df`
# refers to the DataFrame loaded on the line above.
df_sankey = duckdb.sql("""
            SELECT station_name_origin, station_name_destination, sum(passengers) as passengers
           FROM df
           GROUP BY 1,2
           """).df()

# Despite the `top10` suffix, this keeps the 20 busiest pairs.
df_sankey_top10 = (
    df_sankey
    .sort_values(by='passengers', ascending=False)
    .head(20)
)
df_sankey_top10.head()

Unnamed: 0,station_name_origin,station_name_destination,passengers
11573,Tun Razak Exchange (origin),Bukit Bintang (destination),191976.0
8382,KLCC (origin),Ampang Park (destination),147679.0
7252,Bukit Bintang (origin),Tun Razak Exchange (destination),147415.0
2372,Ampang Park (origin),KLCC (destination),125545.0
12069,KL Sentral (origin),KLCC (destination),121270.0


In [95]:
# Rebuild the 20 busiest origin/destination pairs from the aggregated table,
# so this cell starts from a frame with the original column names.
df_sankey_top10 = df_sankey.sort_values(by='passengers', ascending=False).head(20)

def prepare_sankey(dataframe:pd.DataFrame, source:"pd.Series | str", target:"pd.Series | str", opacity:float = 0.5, color_tone = px.colors.qualitative.G10):
    """
    Convert an edge-list dataframe into the pieces a Plotly Sankey chart needs.

    Sample table input:

    |  source  |  target  | value  |  
    |------------------------------|
    |    a1     |    b1     |   1  |
    |------------------------------|
    |    a2     |    b1     |   3  |
    |------------------------------|

    Parameters
    ----------
    dataframe:
        One row per link. It is copied, never modified.
    source, target:
        The source / target column of ``dataframe``, given either as the
        Series itself (only its ``.name`` is used) or as the column name.
    opacity:
        Alpha component written into every link colour.
    color_tone:
        Sequence of colours understood by ``matplotlib.colors.to_rgb``.
        Nodes cycle through it in order of first appearance.
        Palettes can be previewed with:
            # https://plotly.com/python/discrete-color/
            fig = px.colors.qualitative.swatches()
            fig.show()

    Returns
    -------
    links_dict:
        ``{column name: list of values}`` for the copied frame, where the
        source/target columns hold integer node indices and an extra
        ``"link_colors"`` key holds one ``rgba(...)`` string per row.
    unique_source_target:
        Node labels; a label's position is its node index.
    label_colors:
        One ``rgb(...)`` string per node label, in the same order.

    Raises
    ------
    ValueError
        If there are nodes to colour but ``color_tone`` is empty.
    """
    # Accept the column itself (original calling convention) or just its name.
    source = source if isinstance(source, str) else source.name
    target = target if isinstance(target, str) else target.name

    data = dataframe.copy()

    # Every distinct label found in either column; list position == node index.
    unique_source_target = list(pd.unique(data[[source, target]].values.ravel('K')))

    # matplotlib returns each colour as a tuple of 0-1 floats.
    palette = [matplotlib.colors.to_rgb(color) for color in color_tone]
    if unique_source_target and not palette:
        raise ValueError("color_tone must contain at least one color")

    label_colors = []
    # Placeholder kept for any row whose target never matches a label.
    data["link_colors"] = str(0)

    for position, label in enumerate(unique_source_target):
        # Plotly's documented rgb()/rgba() strings use 0-255 channels, so scale
        # explicitly rather than emitting 0-1 fractions and relying on the
        # renderer to guess the intended range.
        red, green, blue = (
            int(round(channel * 255))
            for channel in palette[position % len(palette)]  # wrap around the palette
        )
        label_colors.append(f"rgb({red}, {green}, {blue})")

        # Links take the colour of their target node, made translucent so the
        # link is lighter than the node itself.
        link_color = f"rgba({red}, {green}, {blue}, {opacity})"
        data.loc[data[target] == label, "link_colors"] = link_color
        # data.loc[data[source] == label, "link_colors"] = link_color  # uncomment to colour links by their source node instead

    # Plotly addresses nodes by integer index, so replace labels with positions.
    mapping_dict = {label: position for position, label in enumerate(unique_source_target)}
    data[source] = data[source].map(mapping_dict)
    data[target] = data[target].map(mapping_dict)

    links_dict = data.to_dict(orient='list')

    return links_dict, unique_source_target, label_colors


# Build the link table, node labels and node colours for the busiest pairs.
links, unique_source_target, label_colors = prepare_sankey(
    dataframe=df_sankey_top10,
    source=df_sankey_top10['station_name_origin'],
    target=df_sankey_top10['station_name_destination'],
)

In [96]:
# Plotly Sankey figure: node labels/colours and link indices/values/colours
# all come from the prepare_sankey() outputs.
fig = go.Figure(data=[go.Sankey(
    arrangement="snap",
    node={
        "pad": 15,
        "thickness": 15,
        "line": {"color": "black", "width": 0.7},
        "label": unique_source_target,
        "color": label_colors,
    },
    link={
        "arrowlen": 35,
        "source": links["station_name_origin"],
        "target": links["station_name_destination"],
        "value": links["passengers"],
        "color": links["link_colors"],
    },
)])

fig.update_layout(
    title_text="Train Passengers To-From Stations",
    autosize=False,
    width=750,
    height=1000,
)

# How to color the links: https://plotly.com/python/sankey-diagram/
fig.show()

## Using HoloViz

In [97]:
import holoviews as hv
from holoviews import opts

# Load HoloViews' bokeh plotting extension for the charts built below.
hv.extension('bokeh')

In [98]:
def fmt(tick):
    """
    Format a number compactly with a 'k' (thousands) or 'm' (millions) suffix.

    The unit is chosen from the magnitude of ``tick``, so negative values are
    abbreviated the same way as positive ones (e.g. -5000 -> '-5.0k').
    The scaled number is rounded to two decimals.

    Parameters
    ----------
    tick:
        The numeric value to format.

    Returns
    -------
    str
        The rounded number followed by '', 'k' or 'm'.
    """
    magnitude = abs(tick)
    if magnitude < 1e3:
        # Below one thousand: no scaling, no suffix.
        return f'{round(tick, 2)}'
    if magnitude < 1e6:
        return f'{round(tick / 1e3, 2)}k'
    return f'{round(tick / 1e6, 2)}m'



# Rename columns on a copy. `edges = df_sankey_top10` would only alias the frame,
# and assigning `.columns` would then rename `df_sankey_top10` itself, which
# other cells index by 'station_name_origin' / 'station_name_destination'.
edges = df_sankey_top10.copy()
edges.columns = ['source', 'target', 'value']

# Sankey keyed on (source, target); link values are displayed through fmt().
sankey = hv.Sankey(data=edges, kdims=['source', 'target'], 
                   vdims=[hv.Dimension("value", value_format=fmt)], 
                   label='Rail Diagram')

# Options: https://holoviews.org/reference_manual/holoviews.plotting.bokeh.html
sankey.opts(label_position='left', edge_color='target', node_color='index', cmap='tab20', width = 1200, height= 1000, node_width = 30, fontsize= 14)
# sankey.opts(fontsize={'title': 35, 'label_text_font_size': 35})

In [99]:
# Save the HoloViews Sankey through the bokeh renderer, using base name 'rrr' and HTML format.
hv.renderer('bokeh').save(sankey, 'rrr', fmt='html')
# Need to save as HTML first, then open it in a browser and download the picture from there.

# fmt values noted by the author: ['html', 'auto', 'png', 'widgets', 'scrubber', 'gif', 'auto', None]