<a href="https://www.kaggle.com/code/carlosmorenogarcia/sankey?scriptVersionId=274245896" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Creating Sankey Diagrams using R

* Code source: https://rpubs.com/YJ_Choi/FPDynamicsData
* Adding title to Sankeys: https://stackoverflow.com/questions/50132459/how-to-add-title-to-a-networkd3-visualisation-when-saving-as-a-web-page


In [1]:
## Load libraries
library(tidyverse)
library(networkD3)
library(reshape2)
library(ggplot2)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     


── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors



Attaching package: ‘reshape2’




The following object is masked from ‘package:tidyr’:

    smiths




In [2]:
## Read the survey table
# The CSV lives in the Kaggle input directory attached to this notebook
data <- read.csv(file = "/kaggle/input/sankey-db-csv/sankey.csv")

## Sankey Diagram with Two Columns

In [3]:
## Collect the distinct labels of the two columns, in sorted order
# sort() also drops NA, so only observed labels remain
a <- sort(unique(data$Gender))
b <- sort(unique(data$Household_Income))

In [4]:
## Start from an all-zero count matrix
# Rows are the Gender labels, columns the Household_Income labels
# (a and b are already sorted, so they can be used as names directly)
mat <- matrix(0, nrow = length(a), ncol = length(b))
dimnames(mat) <- list(a, b)

In [5]:
## Count the number of times one value is related to the other
# Both columns are turned into factors whose levels are the row and column
# names of `mat`, so every observation is matched to its cell by label.
# (Indexing mat[m, n] with a numeric or factor value would pick a cell by
# position instead of by name.)
# table() leaves out every pair in which either value is NA, and with zero
# rows in `data` there is simply nothing to add -- no 1:nrow(data) pitfall.
pair_counts <- table(
  factor(data$Gender, levels = rownames(mat)),
  factor(data$Household_Income, levels = colnames(mat))
)
# Adding the counts as a plain vector keeps the shape and names of `mat`
mat <- mat + as.vector(pair_counts)

In [6]:
## Turn the count matrix into a data frame
# The labels of the first column stay available as row names
mat <- as.data.frame(mat)

In [7]:
## Go from the wide count table to one row per (source, target) pair
# gather() stacks the columns one after another; that row order is kept
# because the node order further down is derived from it.
# Pairs that never occur (count 0) are dropped.
data_long <- mat %>%
  rownames_to_column() %>%
  gather(key = "key", value = "value", -rowname) %>%
  filter(value > 0) %>%
  rename(source = rowname, target = key)

In [8]:
## Build the node table from the flows
# One row per distinct label that appears as a source or a target,
# with the source labels listed first
nodes <- data.frame(
  name = unique(c(
    as.character(data_long$source),
    as.character(data_long$target)
  ))
)

In [9]:
## Translate the labels of every link into node ids
# networkD3 wants links expressed as positions in the node table.
# The ids are zero-based (JavaScript), hence the "- 1" after match().
data_long <- data_long %>%
  mutate(
    IDsource = match(source, nodes$name) - 1,
    IDtarget = match(target, nodes$name) - 1
  )

In [10]:
## Build the two-column Sankey widget
# iterations = 0 switches off the automatic layout pass, so the boxes keep
# the order in which the nodes were listed
sankey <- sankeyNetwork(
  Links = data_long,
  Nodes = nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE,
  nodeWidth = 40,
  fontSize = 13,
  nodePadding = 20,
  iterations = 0
)

In [11]:
## Save the widget as an HTML page and embed that page in the notebook
# Needs a front end that can render HTML output
# (e.g., RStudio or a Jupyter notebook with IRkernel)
htmlwidgets::saveWidget(
  widget = sankey,
  file = "sankey_plot.html",
  selfcontained = FALSE
)
IRdisplay::display_html(
  "<iframe src='sankey_plot.html' width='800' height='600'></iframe>"
)

## Sankey Diagram with Three Columns

In [12]:
## Distinct, sorted labels of the third column
# NOTE: the variable shares its name with base R's c(); calls such as
# c(1, 2) still reach the function, because R skips non-function objects
# when it looks up the name of a call.
c <- sort(unique(data$Would.you.stop.eating.meat.))

In [13]:
## All-zero count matrix for the second pair of columns
# Rows are the Household_Income labels, columns the third-column labels
mat2 <- matrix(0, nrow = length(b), ncol = length(c))
dimnames(mat2) <- list(b, c)

In [14]:
## Another count
# Household_Income against the "Would you stop eating meat?" answers.
# Both columns are turned into factors whose levels are the row and column
# names of `mat2`, so every observation is matched to its cell by label.
# (Indexing mat2[m, n] with a numeric or factor value would pick a cell by
# position instead of by name.)
# table() leaves out every pair in which either value is NA, and with zero
# rows in `data` there is simply nothing to add -- no 1:nrow(data) pitfall.
pair_counts2 <- table(
  factor(data$Household_Income, levels = rownames(mat2)),
  factor(data$Would.you.stop.eating.meat., levels = colnames(mat2))
)
# Adding the counts as a plain vector keeps the shape and names of `mat2`
mat2 <- mat2 + as.vector(pair_counts2)

In [15]:
## Turn the second count matrix into a data frame
# The Household_Income labels stay available as row names
mat2 <- as.data.frame(mat2)

In [16]:
## One row per (source, target) pair for the second pair of columns
# gather() stacks the columns one after another; pairs that never occur
# (count 0) are dropped
data_long2 <- mat2 %>%
  rownames_to_column() %>%
  gather(key = "key", value = "value", -rowname) %>%
  filter(value > 0) %>%
  rename(source = rowname, target = key)

In [17]:
## Node table for the second pair of columns
# One row per distinct label, source labels first
nodes2 <- data.frame(
  name = unique(c(
    as.character(data_long2$source),
    as.character(data_long2$target)
  ))
)

In [18]:
## Zero-based node ids for the links of the second pair of columns
data_long2 <- data_long2 %>%
  mutate(
    IDsource = match(source, nodes2$name) - 1,
    IDtarget = match(target, nodes2$name) - 1
  )

In [19]:
## Assemble nodes and links for the three-column diagram
# Nodes: everything from the two-column diagram plus the third-column labels
newnodes_col3 <- data.frame(name = unique(as.character(data_long2$target)))
nodes_3cols <- rbind(nodes, newnodes_col3)

# Links: both link tables stacked, without the ids computed earlier
# (those ids referred to the smaller node tables)
data_long_3cols <- rbind(data_long, data_long2)
data_long_3cols <- data_long_3cols[, c("source", "target", "value"), drop = FALSE]

# Fresh zero-based ids against the combined node table
data_long_3cols$IDsource <- match(data_long_3cols$source, nodes_3cols$name) - 1
data_long_3cols$IDtarget <- match(data_long_3cols$target, nodes_3cols$name) - 1

In [20]:
## Build the three-column Sankey widget
# Same settings as the two-column diagram; iterations = 0 keeps the boxes
# in the order in which the nodes were listed
sankey2 <- sankeyNetwork(
  Links = data_long_3cols,
  Nodes = nodes_3cols,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE,
  nodeWidth = 40,
  fontSize = 13,
  nodePadding = 20,
  iterations = 0
)

In [21]:
## Save the three-column widget as an HTML page and embed it in the notebook
# Needs a front end that can render HTML output
# (e.g., RStudio or a Jupyter notebook with IRkernel)
htmlwidgets::saveWidget(
  widget = sankey2,
  file = "sankey2_plot.html",
  selfcontained = FALSE
)
IRdisplay::display_html(
  "<iframe src='sankey2_plot.html' width='800' height='600'></iframe>"
)

The end