In [2]:
# libraries
import plotly
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Globally silence every FutureWarning for the rest of the session; this was added
# to hide the deprecation warning emitted by DataFrame.append().
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the grants table from a CSV file located next to the notebook
# (relative path, so the notebook must be run from that directory).
df = pd.read_csv('dff.csv')
# Preview the last five rows as the cell output.
df.tail()

Unnamed: 0,Titel,Bevilliget beløb,Modtager,Institution,Virkemidler,Område,År,Beskrivelse,Region
4385,Enabling Ultra Deep Hydrodesulphurization by N...,10781874,Ib Chorkendorff,Danmarks Tekniske Universitet,Øvrige forskningsprojekter,Teknologi og Produktion,2013,Alle olieprodukter renses i dag for svovl for ...,Region Hovedstaden
4386,Acute stroke research,717359,Hanne Krarup Christensen,"Bispebjerg Hospital, Neurologisk Afdeling",Delestillinger,Sundhed og Sygdom,2013,Aktuelle ansøgning angår frikøb af overlæge Ha...,Region Hovedstaden
4387,Atherosclerotic cardiovascular disease in HIV-...,764683,Anne-Mette Lebech,"Hvidovre Hospital, Infektionsmedicinsk Afdeling",Delestillinger,Sundhed og Sygdom,2013,Behandling af HIV positive patienter med anti-...,Region Hovedstaden
4388,Epigenetic modulation of mechanisms involved i...,829294,Ole Schmeltz Søgaard,Aarhus Universitetshospital,Delestillinger,Sundhed og Sygdom,2013,HIV infektion behandles i dag med en kombinati...,Region Midtjylland
4389,Novel mechanisms of insulin resistance and mit...,665923,Kurt Højlund,"Odense Universitetshospital, Endokrinologisk A...",Delestillinger,Sundhed og Sygdom,2013,Insulinresistens (IR) i muskelvæv spiller en v...,Region Syddanmark


In [4]:

def generateSankey(df, year, category_columns):
    """Draw a Sankey diagram of granted amounts for one year.

    Each entry of ``category_columns`` becomes one vertical layer of nodes
    (left to right), and links connect adjacent layers. The width of a link
    is the sum of ``'Bevilliget beløb'`` over all rows sharing that pair of
    category values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'År'``, ``'Bevilliget beløb'`` and every
        column named in ``category_columns``.
    year : scalar
        Only rows where ``df['År'] == year`` are plotted.
    category_columns : sequence of str
        Column names, ordered from the left-most to the right-most layer.
        At least two are required.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``; nothing is returned so
        the figure is not rendered a second time as the cell's output.
    """
    category_columns = list(category_columns)  # also accept tuples

    # This guard used to live inside the link loop, where it could never run
    # (the loop body is skipped entirely when there is only one column).
    if len(category_columns) < 2:
        print("Number of input categories must be at least 2")
        return

    value_column = 'Bevilliget beløb'
    colorpalette = px.colors.qualitative.Plotly

    # data for sankey: rows of the requested year, relevant columns only
    df_sankey = df.loc[df['År'] == year, category_columns + [value_column]]

    # Build the node list layer by layer. Every layer gets its own
    # value -> node-index lookup, so a value that occurs in two different
    # columns yields two distinct nodes instead of being collapsed onto the
    # first occurrence (which is what ``labels.index(value)`` would do).
    labels = []
    colorList = []
    node_ids = []
    for position, col in enumerate(category_columns):
        # First-appearance order keeps the node order reproducible between
        # runs; missing values are skipped because groupby drops them below.
        unique_values = df_sankey[col].dropna().unique().tolist()
        offset = len(labels)
        node_ids.append({value: offset + k for k, value in enumerate(unique_values)})
        labels.extend(unique_values)
        # One color per layer; wrap around if there are more layers than colors.
        layer_color = colorpalette[position % len(colorpalette)]
        colorList.extend([layer_color] * len(unique_values))

    # Aggregate the amount for every pair of adjacent layers, e.g.
    # ('År', 'Virkemidler') and then ('Virkemidler', 'Område').
    link_frames = []
    for i in range(len(category_columns) - 1):
        source_col = category_columns[i]
        target_col = category_columns[i + 1]
        pair_totals = (
            df_sankey
            .groupby([source_col, target_col])
            .agg({value_column: 'sum'})
            .reset_index()
        )
        pair_totals.columns = ['source', 'target', 'count']
        pair_totals['sourceID'] = pair_totals['source'].map(node_ids[i])
        pair_totals['targetID'] = pair_totals['target'].map(node_ids[i + 1])
        link_frames.append(pair_totals)

    # Single concat instead of the removed DataFrame.append inside the loop.
    df_link_input = pd.concat(link_frames, ignore_index=True)

    # creating the sankey diagram
    fig = go.Figure(data=[go.Sankey(
        valueformat = ",",
        valuesuffix = " kr.",
        # define nodes
        node = dict(
            pad = 15,
            thickness = 20,
            line = dict(color = "black", width = 0.5),
            label = labels,
            color = colorList
            ),
        # define links; source/target are indices into `labels`
        link = dict(
            source = df_link_input['sourceID'].tolist(),
            target = df_link_input['targetID'].tolist(),
            value = df_link_input['count'].tolist()
        ))])

    fig.update_layout(title_text="Funding of Research Grants in " + str(year) + "<br>Source: <a href='https://dff.dk/'>Danmarks Frie Forskningsfond</a>",
                        font_size=10)
    fig.show()

In [5]:

# TODO: the nodes on the far right should be sorted in descending order

# Plot the Sankey diagram for 2022, with layers year -> 'Virkemidler' -> 'Område'
generateSankey(df, year = 2022, category_columns = ['År','Virkemidler', 'Område'])