## Homework 5 - Alessandro Leggio Yepez
Sankey Plot

1. Look at the [chapter on interactive graphics](https://smart-stats.github.io/ds4bio_book/book/_build/html/interactive.html) and, specifically, the code to display a subject's MRICloud data as a sunburst plot. Do the following. Display this subject's data as a [Sankey diagram](https://plotly.com/python/sankey-diagram/). Display as many levels as you can (at least 3) for Type = 1, starting from the intracranial volume. Put this in a file called hw4.ipynb.

In [34]:
#import libraries
import pandas as pd
import plotly.express as px
import numpy as np

## Hierarchy lookup table: one row per roi with its parent label at each coarser level
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
data = (
    pd.read_csv(url, sep="\t")
    .drop(columns=["Level5"])
    # give the generically named lookup columns the names used in the rest of the notebook
    .rename(columns={
        "modify": "roi",
        "modify.1": "level4",
        "modify.2": "level3",
        "modify.3": "level2",
        "modify.4": "level1",
    })
    # keep only the hierarchy columns, ordered from finest (roi) to coarsest (level1)
    .loc[:, ["roi", "level4", "level3", "level2", "level1"]]
)

## Subject data: type 1, level 5 volumes for a single subject
id = 127  # subject id
subject = (
    pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
    # rows for this subject at type 1 / level 5; only roi and volume are needed
    .loc[lambda d: (d.type == 1) & (d.level == 5) & (d.id == id), ["roi", "volume"]]
    # attach the hierarchy; pandas merges default to an inner join, so rois
    # missing from the lookup table are dropped
    .merge(data, on="roi")
    # constant root label so every row shares a single top-level node
    .assign(icv="ICV")
    # each roi's share of the summed volume
    .assign(comp=lambda d: d.volume / np.sum(d.volume))
)
subject

Unnamed: 0,roi,volume,level4,level3,level2,level1,icv,comp
0,SFG_L,12926,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.009350
1,SFG_R,10050,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R,ICV,0.007270
2,SFG_PFC_L,12783,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.009247
3,SFG_PFC_R,11507,SFG_R,Frontal_R,CerebralCortex_R,Telencephalon_R,ICV,0.008324
4,SFG_pole_L,3078,SFG_L,Frontal_L,CerebralCortex_L,Telencephalon_L,ICV,0.002227
...,...,...,...,...,...,...,...,...
275,Chroid_LVetc_L,444,AnteriorLateralVentricle_L,LateralVentricle_L,Ventricle,CSF,ICV,0.000321
276,Chroid_LVetc_R,371,AnteriorLateralVentricle_R,LateralVentricle_R,Ventricle,CSF,ICV,0.000268
277,IV_ventricle,2700,IV_ventricle,IV_ventricle,Ventricle,CSF,ICV,0.001953
278,ECCL_L,292,inf_DPWM_L,InferiorWM_L,WhiteMatter_L,Telencephalon_L,ICV,0.000211


In [37]:
#import libraries
import numpy as np

#generate sankey chart data first!

#define function that takes in dataframe, columns for labels/nodes/targets, and composition as line weight

def gensankey(df: pd.DataFrame, columns: list, sankey_weight: str):
    """Build the node labels and weighted links for a plotly Sankey diagram.

    Each entry of ``columns`` is one stage of the diagram, ordered from left
    to right. Every unique value of a column becomes a node, and every row of
    ``df`` contributes its weight to the link between its values in each pair
    of adjacent columns.

    Parameters
    ----------
    df : pd.DataFrame
        One row per observation; must contain ``columns`` and ``sankey_weight``.
    columns : list
        Column names used as diagram stages, in left-to-right order.
    sankey_weight : str
        Name of the numeric column whose values are summed into link weights.

    Returns
    -------
    labels : list
        Node labels; a node's position in this list is its index.
    sources : list of int
        Source node index of each link.
    targets : list of int
        Target node index of each link.
    weights : pd.Series
        Summed weight of each link, aligned with ``sources`` / ``targets``.

    Notes
    -----
    Node indices are assigned per column, so a value that appears in two
    different columns produces two distinct nodes that share a label.
    """
    # Number the nodes column by column, in order of first appearance, so the
    # index stored in the mapping is always the node's position in `labels`.
    labels = []
    link_mappings = {}
    for col in columns:
        uniques = list(pd.unique(df[col]))
        first_index = len(labels)
        link_mappings[col] = {
            node: first_index + offset for offset, node in enumerate(uniques)
        }
        labels.extend(uniques)

    # One (source index, target index, weight) record per row and per pair of
    # adjacent columns.
    link_rows = [
        (link_mappings[source_col][v1], link_mappings[target_col][v2], link_weight)
        for source_col, target_col in zip(columns[:-1], columns[1:])
        for v1, v2, link_weight in zip(
            df[source_col], df[target_col], df[sankey_weight]
        )
    ]
    dflinks = pd.DataFrame(link_rows, columns=["source", "target", "weight"])

    # Collapse repeated links into one link carrying the total weight. The
    # named groupby reduction is used instead of passing the builtin `sum`,
    # which recent pandas versions deprecate with a FutureWarning.
    dflinks = dflinks.groupby(["source", "target"], as_index=False)["weight"].sum()

    sources = dflinks["source"].tolist()
    targets = dflinks["target"].tolist()
    weights = dflinks["weight"]

    return labels, sources, targets, weights

In [39]:
#lets call the function and plot!

# `go` is used below but was never imported in this notebook, so the cell
# failed with a NameError on a fresh kernel (Restart & Run All).
import plotly.graph_objects as go

#specify columns we want to plot, ordered from the root (icv) to the finest level (roi)
columns = ['icv', 'level1', 'level2', 'level3', 'level4', 'roi']

#call sankey function for subject df, with columns and composition as weight
labels, sources, targets, weights = gensankey(subject, columns, 'comp')

#sankey figure: node labels come from `labels`; each link i runs from
#node sources[i] to node targets[i] with thickness weights[i]
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
    ),
    link = dict(
      source = sources,
      target = targets,
      value = weights,
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()