# Installations

In [None]:
# For use in Colab only
%pip install spacy
%pip install --no-deps sentence-transformers
%pip install xlsxwriter

In [None]:
import spacy

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import minmax_scale

import plotly.express as px
from plotly.graph_objs._figure import Figure
import plotly.graph_objects as go
from plotly.express.colors import sample_colorscale
import pandas as pd
import numpy as np

pd.options.mode.copy_on_write = True


Collecting xlsxwriter
  Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.0


# Data

In [None]:
# Load the decision-analysis items; the CSV uses ';' as field separator.
df_tokens = pd.read_csv('Decision making analysis.csv', sep = ';')
# Normalise the item text (surrounding whitespace, letter case) *before*
# removing duplicates, so that rows differing only by case or by a stray
# trailing space are collapsed as well.
df_tokens['Item'] = df_tokens['Item'].str.strip().str.lower()
df_tokens = df_tokens.drop_duplicates(ignore_index = True)

print(df_tokens.shape)
display(df_tokens.head())

(170, 4)


Unnamed: 0,Decision analysis,Category,Domain,Item
0,Characteristics,Action Type,Air Traffic,preventive or corrective
1,Characteristics,Action Type,Electricity,preventive or corrective
2,Characteristics,Action Type,Railway,preventive (operational adjustments)
3,Characteristics,Implementation,Air Traffic,planed or real-time
4,Characteristics,Implementation,Electricity,planed or real-time


# Huggingface model

Pick a model from the [following list](https://huggingface.co/models?pipeline_tag=sentence-similarity&language=en&sort=trending)

Models are picked from the [sbert.net](https://www.sbert.net/index.html) sentence transformer library, and more specifically from [pretrained semantic search models](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html#semantic-search-models)

SPECTER is a model trained on scientific citations and can be used to estimate the similarity of two publications. We can use it to find similar papers.

In [None]:
# Other checkpoints that can be swapped in:
#   'sentence-transformers/all-MiniLM-L6-v2'
#   'sentence-transformers/all-distilroberta-v1'
model_name = 'sentence-transformers/allenai-specter'
# load the sentence-embedding model identified by `model_name`
model = SentenceTransformer(model_name_or_path=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/462k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# calculate embeddings with selected model
# (encode() is documented for a str or a list of str, so hand it a plain list
# instead of a pandas Series whose positional indexing depends on its index)
tokens_embeddings = model.encode(df_tokens['Item'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

# Embedding plotting

In [None]:
# function computing the 2-dimension coordinates used to plot an embedding
def get_plot_data_embeddings(embeddings):
    """Project embeddings onto two dimensions with PCA.

    A 2-component PCA is fitted on ``embeddings`` and applied to them.

    Returns a DataFrame with one row per embedding and two columns, ``x`` and
    ``y``, holding the first and second PCA component respectively.
    """
    projected = PCA(n_components = 2).fit_transform(embeddings)

    return pd.DataFrame({
        'x': list(projected[:, 0]),
        'y': list(projected[:, 1]),
    })

In [None]:
# function to make the plot
def get_plot_embedding(df_tokens, tokens_embeddings, plot_margin = 4):
  """Build a 2D scatter plot of item embeddings, coloured by domain.

  Parameters:
    df_tokens: DataFrame with at least the columns 'Item' (text label of each
      point) and 'Domain' (used for the colour). Row i must describe the item
      whose embedding is tokens_embeddings[i].
    tokens_embeddings: embeddings of the items, one row per row of df_tokens.
    plot_margin: padding added around the largest absolute coordinate when
      setting both axis ranges.

  Returns:
    The plotly figure (not shown; call ``fig.show()``).
  """
  # calculate 2D plotting data from embeddings
  plot_data_embeddings = get_plot_data_embeddings(tokens_embeddings)
  # largest absolute coordinate: both axes share the same symmetric range
  axis_max = max(plot_data_embeddings['x'].abs().max(),
                 plot_data_embeddings['y'].abs().max())

  # concat(axis=1) aligns rows on the index, while the coordinates are indexed
  # 0..n-1: re-index the items the same way so rows are paired by position
  plot_data_items = pd.concat(
      [df_tokens.reset_index(drop=True), plot_data_embeddings], axis=1)
  # constant column so that every marker gets the same size
  plot_data_items['dummy_column_for_size'] = 1

  fig = px.scatter(plot_data_items,
                  x = 'x',
                  y = 'y',
                  text = 'Item',
                  size = 'dummy_column_for_size',
                  color='Domain',
                  color_discrete_map={
                      # the data labels this domain "Air Traffic"; the former
                      # "ATM" key is kept for data that uses the abbreviation
                      "Air Traffic": "#a6cee3",
                      "ATM": "#a6cee3",
                      "Railway": "#b2df8a",
                      "Electricity": "#fdbf6f"})
  fig.update_layout({
      'plot_bgcolor': 'rgba(0, 0, 0, 0)',
      'paper_bgcolor': 'rgba(0, 0, 0, 0)',
      'width' : 500,
      'height' : 500
  })
  fig.update_layout(
      xaxis_range=[-axis_max - plot_margin, axis_max + plot_margin],
      yaxis_range=[-axis_max - plot_margin, axis_max + plot_margin],
      margin=dict(
          l=20,
          r=20,
          b=20,
          t=20,
          pad=4
      ),
      legend=dict(
          title = None,
          orientation="h",
          yanchor="bottom",
          y=1.02,
          xanchor="right",
          x=1
  ))
  # framed axes without titles or tick labels
  fig.update_xaxes(title='', showticklabels=False, showline=True, linewidth=1,
          linecolor='black', mirror=True)
  fig.update_yaxes(title='', showticklabels=False, showline=True, linewidth=1,
          linecolor='black', mirror=True)
  # labels sit above their marker and reuse the marker colour of their trace
  fig.for_each_trace(lambda t: t.update(textfont_color=t.marker.color, textposition='top center'))

  return fig

In [None]:
# items of the 'Characteristics' decision analysis, re-indexed from 0 so that
# each row lines up with its embedding
df_tokens_plot = df_tokens[df_tokens['Decision analysis'] == 'Characteristics'].reset_index(drop=True)
# encode() is documented for a str or a list of str: pass a list, not a Series
tokens_embeddings_plot = model.encode(df_tokens_plot['Item'].tolist())
fig = get_plot_embedding(df_tokens_plot, tokens_embeddings_plot)
fig.show()

# Similarity analysis

## Similarity pairs

In [None]:
# cosine similarity between every pair of item embeddings
# (one row and one column per item; np.squeeze drops any length-1 axis)
similarity_scores = np.squeeze(cosine_similarity(tokens_embeddings))


In [None]:
# build every ordered pair of items with a cross join of the table with itself;
# the columns of the left/right copies are suffixed with '_1' / '_2'
df_tokens_1 = df_tokens.add_suffix('_1')
df_tokens_2 = df_tokens.add_suffix('_2')
df_tokens_crossed = pd.merge(df_tokens_1, df_tokens_2, how='cross')
print(df_tokens_crossed.shape)
display(df_tokens_crossed.head())

(28900, 8)


Unnamed: 0,Decision analysis_1,Category_1,Domain_1,Item_1,Decision analysis_2,Category_2,Domain_2,Item_2
0,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Air Traffic,preventive or corrective
1,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Electricity,preventive or corrective
2,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Railway,preventive (operational adjustments)
3,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Implementation,Air Traffic,planed or real-time
4,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Implementation,Electricity,planed or real-time


In [None]:
# attach the score of each pair: the similarity matrix is flattened row by row,
# which is assumed to follow the row order produced by the cross join
df_tokens_crossed['similarity_score'] = similarity_scores.ravel()
print(df_tokens_crossed.shape)
display(df_tokens_crossed.head())

(28900, 9)


Unnamed: 0,Decision analysis_1,Category_1,Domain_1,Item_1,Decision analysis_2,Category_2,Domain_2,Item_2,similarity_score
0,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Air Traffic,preventive or corrective,1.0
1,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Electricity,preventive or corrective,1.0
2,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Action Type,Railway,preventive (operational adjustments),0.845557
3,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Implementation,Air Traffic,planed or real-time,0.741233
4,Characteristics,Action Type,Air Traffic,preventive or corrective,Characteristics,Implementation,Electricity,planed or real-time,0.741233


In [None]:
# restrict to pairs sharing both the decision analysis and the category
same_category = df_tokens_crossed['Category_1'] == df_tokens_crossed['Category_2']
same_analysis = df_tokens_crossed['Decision analysis_1'] == df_tokens_crossed['Decision analysis_2']
df_tokens_crossed = df_tokens_crossed[same_category & same_analysis]

# discard self-pairs, i.e. rows where both the item and the domain are identical
is_self_pair = (
    (df_tokens_crossed['Item_1'] == df_tokens_crossed['Item_2'])
    & (df_tokens_crossed['Domain_1'] == df_tokens_crossed['Domain_2'])
)
df_tokens_crossed = df_tokens_crossed[~is_self_pair]

# analysis and category are now the same on both sides: keep one copy of each,
# without its '_1' suffix
df_tokens_crossed = (
    df_tokens_crossed
    .drop(columns = ['Category_2', 'Decision analysis_2'])
    .rename(columns = {'Decision analysis_1': 'Decision analysis', 'Category_1': 'Category'})
)

print(df_tokens_crossed.shape)
display(df_tokens_crossed.head())

(1496, 7)


Unnamed: 0,Decision analysis,Category,Domain_1,Item_1,Domain_2,Item_2,similarity_score
1,Characteristics,Action Type,Air Traffic,preventive or corrective,Electricity,preventive or corrective,1.0
2,Characteristics,Action Type,Air Traffic,preventive or corrective,Railway,preventive (operational adjustments),0.845557
170,Characteristics,Action Type,Electricity,preventive or corrective,Air Traffic,preventive or corrective,1.0
172,Characteristics,Action Type,Electricity,preventive or corrective,Railway,preventive (operational adjustments),0.845557
340,Characteristics,Action Type,Railway,preventive (operational adjustments),Air Traffic,preventive or corrective,0.845557


In [None]:
# curve of all scores sorted from highest to lowest, against the percentage of
# pairs, to help choosing a similarity threshold
all_scores = df_tokens_crossed['similarity_score'].array
n_scores = len(all_scores)
percent_of_cases = (np.arange(n_scores)/n_scores)*100
scores_descending = np.sort(all_scores)[::-1]

fig = px.line(x = percent_of_cases, y = scores_descending)
fig.update_layout(
    plot_bgcolor = 'rgba(255, 255, 0, 0)',
    width = 600,
    height = 500)
fig.update_xaxes(title='% of cases', range = [0, 100])
fig.update_yaxes(title='similarity score', range = [0, 1])
fig.show()

In [None]:
# minimum similarity score for two items to count as "similar enough"
threshold_similarity = 0.80
# flag each pair: 1 when its score reaches the threshold, 0 otherwise
is_similar_enough = df_tokens_crossed['similarity_score'] >= threshold_similarity
df_tokens_crossed['threshold'] = is_similar_enough.astype('int64')

print(df_tokens_crossed.shape)
display(df_tokens_crossed.head())

(1496, 8)


Unnamed: 0,Decision analysis,Category,Domain_1,Item_1,Domain_2,Item_2,similarity_score,threshold
1,Characteristics,Action Type,Air Traffic,preventive or corrective,Electricity,preventive or corrective,1.0,1
2,Characteristics,Action Type,Air Traffic,preventive or corrective,Railway,preventive (operational adjustments),0.845557,1
170,Characteristics,Action Type,Electricity,preventive or corrective,Air Traffic,preventive or corrective,1.0,1
172,Characteristics,Action Type,Electricity,preventive or corrective,Railway,preventive (operational adjustments),0.845557,1
340,Characteristics,Action Type,Railway,preventive (operational adjustments),Air Traffic,preventive or corrective,0.845557,1


In [None]:
# write the table of pairs (scores and threshold flag included) to an Excel file
df_tokens_crossed.to_excel(excel_writer='df_tokens_crossed.xlsx')

## Plot list of items


In [None]:
# first item of every above-threshold pair of the 'Context' decision analysis
is_similar_context = (df_tokens_crossed['threshold'] == 1) & (df_tokens_crossed['Decision analysis'] == 'Context')
df_tokens_plot = df_tokens_crossed[is_similar_context]
df_tokens_plot['Item'] = df_tokens_plot['Item_1']
df_tokens_plot['Domain'] = df_tokens_plot['Domain_1']
# an item appears once per pair it belongs to: keep a single row per
# (domain, item) so it is encoded and drawn once and does not weigh several
# times in the 2D projection
df_tokens_plot = df_tokens_plot.drop_duplicates(subset=['Domain', 'Item']).reset_index(drop=True)
# encode() is documented for a str or a list of str: pass a list, not a Series
tokens_embeddings_plot = model.encode(df_tokens_plot['Item'].tolist())
fig = get_plot_embedding(df_tokens_plot, tokens_embeddings_plot)
fig.show()

## Crossed list of item pairs

List of all item pairs with similarity score

In [None]:
# pairs above the similarity threshold ...
category_summary_item_pairs = df_tokens_crossed[df_tokens_crossed['threshold'] == 1]
# ... restricted to one direction of each couple of distinct domains
category_summary_item_pairs = category_summary_item_pairs[
    ((category_summary_item_pairs['Domain_1'] == 'Air Traffic') & (category_summary_item_pairs['Domain_2'] == 'Electricity')) |
    ((category_summary_item_pairs['Domain_1'] == 'Electricity') & (category_summary_item_pairs['Domain_2'] == 'Railway')) |
    ((category_summary_item_pairs['Domain_1'] == 'Railway') & (category_summary_item_pairs['Domain_2'] == 'Air Traffic'))
]
# single label for the couple of domains, replacing the two domain columns
category_summary_item_pairs['crossed_domain'] = category_summary_item_pairs['Domain_1'] + '-' + category_summary_item_pairs['Domain_2']
category_summary_item_pairs = category_summary_item_pairs.drop(columns = ['Domain_1', 'Domain_2'])

# readable label of the pair: the item itself when both sides are identical,
# '(item_1, item_2)' otherwise (built row by row, also valid for an empty table)
category_summary_item_pairs['item_couple'] = [
    item_1 if item_1 == item_2 else '(' + item_1 + ', ' + item_2 + ')'
    for item_1, item_2 in zip(category_summary_item_pairs['Item_1'], category_summary_item_pairs['Item_2'])
]

# the table built in this cell is category_summary_item_pairs
# (the name previously printed here, category_summary_items, is never defined)
print(category_summary_item_pairs.shape)
display(category_summary_item_pairs.head())

(227, 8)


Unnamed: 0,Decision analysis,Category,Item_1,Item_2,similarity_score,threshold,crossed_domain,item_couple
1,Characteristics,Action Type,preventive or corrective,preventive or corrective,1.0,1,Air Traffic-Electricity,preventive or corrective
169,Characteristics,Action Type,preventive or corrective,preventive (operational adjustments),0.845557,1,Electricity-Railway,"(preventive or corrective , preventive (operat..."
334,Characteristics,Action Type,preventive (operational adjustments),preventive or corrective,0.845557,1,Railway-Air Traffic,"(preventive (operational adjustments) , preven..."
505,Characteristics,Implementation,planed or real-time,planed or real-time,1.0,1,Air Traffic-Electricity,planed or real-time
673,Characteristics,Implementation,planed or real-time,real-time,0.947044,1,Electricity-Railway,"(planed or real-time , real-time)"


In [None]:
# write the list of similar item pairs to an Excel file
category_summary_item_pairs.to_excel(excel_writer='category_summary_item_pairs.xlsx')


## Summary per category

### Crossed list of items

For each pair of domains, list the items whose similarity is higher than the threshold

List items that are shared by all domain pairs

In [None]:
# the three couples of domains reported, one column each after the pivot
domain_pair_columns = ['Air Traffic-Electricity', 'Electricity-Railway', 'Railway-Air Traffic']

# pairs above the similarity threshold ...
category_summary_items_simple = df_tokens_crossed[df_tokens_crossed['threshold'] == 1]
# ... restricted to one direction of each couple of distinct domains
category_summary_items_simple = category_summary_items_simple[
    ((category_summary_items_simple['Domain_1'] == 'Air Traffic') & (category_summary_items_simple['Domain_2'] == 'Electricity')) |
    ((category_summary_items_simple['Domain_1'] == 'Electricity') & (category_summary_items_simple['Domain_2'] == 'Railway')) |
    ((category_summary_items_simple['Domain_1'] == 'Railway') & (category_summary_items_simple['Domain_2'] == 'Air Traffic'))
]
category_summary_items_simple['crossed_domain'] = category_summary_items_simple['Domain_1'] + '-' + category_summary_items_simple['Domain_2']
category_summary_items_simple = category_summary_items_simple.drop(columns = ['Domain_1', 'Domain_2'])

# per (analysis, category, couple of domains): set of all items found on either side of a pair
category_summary_items_simple = category_summary_items_simple.groupby(
    ['Decision analysis', 'Category', 'crossed_domain']
).agg({'Item_1' : lambda x: set(x), 'Item_2' : lambda x: set(x)}).reset_index()
category_summary_items_simple['items'] = [
    items_1 | items_2
    for items_1, items_2 in zip(category_summary_items_simple['Item_1'], category_summary_items_simple['Item_2'])
]
category_summary_items_simple = category_summary_items_simple.drop(columns = ['Item_1', 'Item_2'])

# pivot per domain pairs
category_summary_items_simple = category_summary_items_simple.pivot(
    index=['Decision analysis', 'Category'], columns='crossed_domain', values='items').reset_index().rename_axis(None, axis=1)
# a couple of domains without any pair above the threshold has no column after
# the pivot: make sure the three columns exist (missing ones are created empty)
category_summary_items_simple = category_summary_items_simple.reindex(
    columns=['Decision analysis', 'Category'] + domain_pair_columns)

# fill empty cells
for column in domain_pair_columns:
  category_summary_items_simple[column] = [item if isinstance(item, set) else set() for item in category_summary_items_simple[column]]

# add 'all' column with items common to all domains
category_summary_items_simple['All'] = [
    pair_1 & pair_2 & pair_3
    for pair_1, pair_2, pair_3 in zip(*(category_summary_items_simple[column] for column in domain_pair_columns))
]

# formatting: one item per line, sorted so that the text is identical from one
# run to the next (the iteration order of a set of strings is not)
for column in domain_pair_columns + ['All']:
  category_summary_items_simple[column] = category_summary_items_simple[column].apply(lambda x: ',\n'.join(sorted(x)))

print(category_summary_items_simple.shape)
display(category_summary_items_simple.head())

(20, 6)


Unnamed: 0,Decision analysis,Category,Air Traffic-Electricity,Electricity-Railway,Railway-Air Traffic,All
0,Characteristics,Action Type,preventive or corrective,"preventive or corrective ,\npreventive (operat...","preventive or corrective ,\npreventive (operat...",preventive or corrective
1,Characteristics,Implementation,planed or real-time,"real-time,\nplaned or real-time","real-time,\nplaned or real-time",planed or real-time
2,Characteristics,Size of action space,large and mixed action space,large and mixed action space,large and mixed action space,large and mixed action space
3,Characteristics,Time Constraints,,,"operational adjustments,\nstrategic planning,\...",
4,Characteristics,Time step,"real-time to medium-term,\nreal-time to long-term",real-time to long-term,"real-time to medium-term,\nreal-time to long-term",real-time to long-term


In [None]:
# Export to Excel with text wrapping enabled on columns A to Z.
# (A plain to_excel() call on the same path used to precede this block; the
# file it wrote was overwritten right away by the writer below.)
with pd.ExcelWriter('category_summary_items_simple.xlsx', engine='xlsxwriter') as writer:
    category_summary_items_simple.to_excel(writer, sheet_name='Sheet1')
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']
    cell_format = workbook.add_format({'text_wrap': True})
    worksheet.set_column('A:Z', cell_format=cell_format)

### Cross-domain analysis

In [None]:
# one direction of each couple of distinct domains
is_selected_pair = (
    ((df_tokens_crossed['Domain_1'] == 'Air Traffic') & (df_tokens_crossed['Domain_2'] == 'Electricity')) |
    ((df_tokens_crossed['Domain_1'] == 'Electricity') & (df_tokens_crossed['Domain_2'] == 'Railway')) |
    ((df_tokens_crossed['Domain_1'] == 'Railway') & (df_tokens_crossed['Domain_2'] == 'Air Traffic'))
)

# percentage of pairs above the threshold per (analysis, category), with one
# column per couple of domains; combinations without any pair are set to 0
category_cross_analysis = (
    df_tokens_crossed[is_selected_pair]
    .assign(crossed_domain=lambda pairs: pairs['Domain_1'] + '-' + pairs['Domain_2'])
    [['Decision analysis', 'Category', 'crossed_domain', 'threshold']]
    .groupby(['Decision analysis', 'Category', 'crossed_domain'])
    .agg({'threshold': lambda x: int(round(100*np.average(x), 2))})
    .reset_index()
    .pivot(index=['Decision analysis', 'Category'], columns='crossed_domain', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
    .fillna(0)
)

print(category_cross_analysis.shape)
display(category_cross_analysis.head())

(22, 5)


Unnamed: 0,Decision analysis,Category,Air Traffic-Electricity,Electricity-Railway,Railway-Air Traffic
0,Characteristics,Action Type,100.0,100.0,100.0
1,Characteristics,Implementation,100.0,100.0,100.0
2,Characteristics,Size of action space,100.0,100.0,100.0
3,Characteristics,Time Constraints,0.0,0.0,30.0
4,Characteristics,Time step,100.0,100.0,100.0


In [None]:
# table indexed by (analysis, category), cells shaded with the 'Reds' colormap
(
    category_cross_analysis
    .set_index(['Decision analysis', 'Category'])
    .style
    .background_gradient(cmap = 'Reds')
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Air Traffic-Electricity,Electricity-Railway,Railway-Air Traffic
Decision analysis,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Characteristics,Action Type,100.0,100.0,100.0
Characteristics,Implementation,100.0,100.0,100.0
Characteristics,Size of action space,100.0,100.0,100.0
Characteristics,Time Constraints,0.0,0.0,30.0
Characteristics,Time step,100.0,100.0,100.0
Characteristics,Trade-offs,0.0,50.0,100.0
Context,Constraints,16.0,0.0,0.0
Context,Forecasting,0.0,15.0,5.0
Context,Network capacity,0.0,0.0,0.0
Context,Observations,50.0,16.0,8.0


In [None]:
# export the styled table (with its background gradient) to Excel
category_cross_analysis_styled = (
    category_cross_analysis
    .set_index(['Decision analysis', 'Category'])
    .style
    .background_gradient(cmap = 'Reds')
)
category_cross_analysis_styled.to_excel('category_cross_analysis.xlsx')

In [None]:
# one direction of each couple of distinct domains
is_selected_pair = (
    ((df_tokens_crossed['Domain_1'] == 'Air Traffic') & (df_tokens_crossed['Domain_2'] == 'Electricity')) |
    ((df_tokens_crossed['Domain_1'] == 'Electricity') & (df_tokens_crossed['Domain_2'] == 'Railway')) |
    ((df_tokens_crossed['Domain_1'] == 'Railway') & (df_tokens_crossed['Domain_2'] == 'Air Traffic'))
)

# percentage of pairs above the threshold per decision analysis, with one
# column per couple of domains; combinations without any pair are set to 0
analysis_cross_analysis = (
    df_tokens_crossed[is_selected_pair]
    .assign(crossed_domain=lambda pairs: pairs['Domain_1'] + '-' + pairs['Domain_2'])
    [['Decision analysis', 'crossed_domain', 'threshold']]
    .groupby(['Decision analysis', 'crossed_domain'])
    .agg({'threshold': lambda x: int(round(100*np.average(x), 2))})
    .reset_index()
    .pivot(index=['Decision analysis'], columns='crossed_domain', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
    .fillna(0)
)

print(analysis_cross_analysis.shape)
display(analysis_cross_analysis.head())

(4, 4)


Unnamed: 0,Decision analysis,Air Traffic-Electricity,Electricity-Railway,Railway-Air Traffic
0,Characteristics,40,50,50
1,Context,13,13,12
2,Evaluation (KPIs),23,38,46
3,Impacts,0,4,0


In [None]:
# same table, cells shaded with the 'Reds' colormap
(
    analysis_cross_analysis
    .style
    .background_gradient(cmap = 'Reds')
)


Unnamed: 0,Decision analysis,Air Traffic-Electricity,Electricity-Railway,Railway-Air Traffic
0,Characteristics,40,50,50
1,Context,13,13,12
2,Evaluation (KPIs),23,38,46
3,Impacts,0,4,0


In [None]:
# export the styled table (with its background gradient) to Excel
analysis_cross_analysis_styled = analysis_cross_analysis.style.background_gradient(cmap = 'Reds')
analysis_cross_analysis_styled.to_excel('analysis_cross_analysis.xlsx')

### Domain level

In [None]:
# share of pairs above the threshold for every (Domain_1, Domain_2) couple,
# laid out as a matrix: one row per Domain_1, one column per Domain_2
domain_cross_analysis = (
    df_tokens_crossed[['Domain_1', 'Domain_2', 'threshold']]
    .groupby(['Domain_1', 'Domain_2'])
    .mean()
    .reset_index()
    .pivot(index='Domain_1', columns='Domain_2', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
)

# numeric part of the matrix (the first column holds the Domain_1 labels)
domain_cross_analysis_plot_data = domain_cross_analysis.iloc[:, 1:].to_numpy()
# domain_cross_analysis_plot_data = np.tril(np.array(domain_cross_analysis.iloc[:, 1:], dtype=np.float32))
# domain_cross_analysis_plot_data[np.triu_indices(domain_cross_analysis_plot_data.shape[0], 1)] = np.nan

# heatmap of the matrix, both axes labelled with the Domain_1 values
domain_labels = domain_cross_analysis['Domain_1']
fig = px.imshow(
    domain_cross_analysis_plot_data,
    x = domain_labels,
    y = domain_labels,
    color_continuous_scale = 'reds')
fig.update_layout(
    plot_bgcolor = 'rgba(0, 0, 0, 0)',
    paper_bgcolor = 'rgba(0, 0, 0, 0)')
fig.show()

## Donuts

/!\ experimental only

In [None]:
# function that creates a donut where color corresponds to cross-domain distance
def create_crossdomain_donut(values, min_scale, max_scale, palette = 'reds', labels = ('Air Traffic - Electricity', 'Electricity - Railway', 'Railway - Air Traffic')):
  """Create a donut chart with one slice per cross-domain value.

  Parameters:
    values: sequence of numbers (list, tuple or 1-D array), one per slice.
    min_scale, max_scale: values added at both ends of ``values`` before the
      min-max scaling that positions each slice on the colour scale.
    palette: plotly colour scale the slice colours are sampled from.
    labels: one label per value; each slice is annotated with its label and
      its value formatted as a percentage.

  Returns:
    A tuple ``(go_obj, fig)``: the ``go.Pie`` trace and the figure holding it.
  """
  # copy into a list: '+' below must concatenate (it would broadcast on a numpy
  # array and fail on a tuple)
  values = list(values)
  # min-max scale the values together with both bounds, then sample the palette
  colors_scaled = minmax_scale([min_scale] + values + [max_scale])
  discrete_colors = sample_colorscale(palette, np.clip(colors_scaled, 0, 1))
  # slice size is the inverse of the value; 20 stands in for a value of 0
  plot_values = [1/value if value != 0 else 20 for value in values]

  go_obj = go.Pie(labels=[label + '<br>' + format(value, ".0%") for label, value in zip(labels, values)], values=plot_values, textinfo='label', hole=.5)

  fig = go.Figure(data=[go_obj])
  # first and last colours belong to min_scale / max_scale, not to a slice
  fig.update_traces(marker=dict(colors=discrete_colors[1:-1]))
  fig.update_layout({
      'plot_bgcolor': 'rgba(0, 0, 0, 0)',
      'paper_bgcolor': 'rgba(0, 0, 0, 0)',
      'width' : 500,
      'height' : 500
  })
  fig.update_layout(
      margin=dict(
          l=20,
          r=20,
          b=20,
          t=30,
          pad=4
      )
  )
  return go_obj, fig

### Domain level

In [None]:
# the last three columns hold the domain-by-domain values
domain_similarity = domain_cross_analysis.iloc[:,-3:]
# cells (row 0, col 1), (row 1, col 2) and (row 2, col 0) of that matrix
cross_domain_values = domain_similarity.to_numpy()[[0, 1, 2], [1, 2, 0]].tolist()

# colour scale bounded by the smallest and largest value of the matrix
_, fig = create_crossdomain_donut(
    cross_domain_values,
    domain_similarity.min().min(),
    domain_similarity.max().max())
fig.update_layout(title = '', showlegend = False)
fig.show()

### Analysis level

In [None]:
# share of pairs above the threshold per (Domain_1, Domain_2), restricted to
# the 'Context' decision analysis
analysis_pairs = df_tokens_crossed[df_tokens_crossed['Decision analysis'] == 'Context']
analysis_domain_cross_analysis = (
    analysis_pairs[['Domain_1', 'Domain_2', 'threshold']]
    .groupby(['Domain_1', 'Domain_2'])
    .mean()
    .reset_index()
    .pivot(index='Domain_1', columns='Domain_2', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
)

# cells (row 0, col 1), (row 1, col 2) and (row 2, col 0) of the last three columns
cross_domain_values = analysis_domain_cross_analysis.iloc[:,-3:].to_numpy()[[0, 1, 2], [1, 2, 0]].tolist()
_, fig = create_crossdomain_donut(cross_domain_values, 0, 1)
fig.update_layout(title = '', showlegend = False)
fig.show()

In [None]:
# share of pairs above the threshold per (Domain_1, Domain_2), restricted to
# the 'Characteristics' decision analysis
analysis_pairs = df_tokens_crossed[df_tokens_crossed['Decision analysis'] == 'Characteristics']
analysis_domain_cross_analysis = (
    analysis_pairs[['Domain_1', 'Domain_2', 'threshold']]
    .groupby(['Domain_1', 'Domain_2'])
    .mean()
    .reset_index()
    .pivot(index='Domain_1', columns='Domain_2', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
)

# cells (row 0, col 1), (row 1, col 2) and (row 2, col 0) of the last three columns
cross_domain_values = analysis_domain_cross_analysis.iloc[:,-3:].to_numpy()[[0, 1, 2], [1, 2, 0]].tolist()
_, fig = create_crossdomain_donut(cross_domain_values, 0, 1)
fig.update_layout(title = '', showlegend = False)
fig.show()

In [None]:
# share of pairs above the threshold per (Domain_1, Domain_2), restricted to
# the 'Impacts' decision analysis
analysis_pairs = df_tokens_crossed[df_tokens_crossed['Decision analysis'] == 'Impacts']
analysis_domain_cross_analysis = (
    analysis_pairs[['Domain_1', 'Domain_2', 'threshold']]
    .groupby(['Domain_1', 'Domain_2'])
    .mean()
    .reset_index()
    .pivot(index='Domain_1', columns='Domain_2', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
)

# cells (row 0, col 1), (row 1, col 2) and (row 2, col 0) of the last three columns
cross_domain_values = analysis_domain_cross_analysis.iloc[:,-3:].to_numpy()[[0, 1, 2], [1, 2, 0]].tolist()
_, fig = create_crossdomain_donut(cross_domain_values, 0, 1)
fig.update_layout(title = '', showlegend = False)
fig.show()

In [None]:
# share of pairs above the threshold per (Domain_1, Domain_2), restricted to
# the 'Evaluation (KPIs)' decision analysis
analysis_pairs = df_tokens_crossed[df_tokens_crossed['Decision analysis'] == 'Evaluation (KPIs)']
analysis_domain_cross_analysis = (
    analysis_pairs[['Domain_1', 'Domain_2', 'threshold']]
    .groupby(['Domain_1', 'Domain_2'])
    .mean()
    .reset_index()
    .pivot(index='Domain_1', columns='Domain_2', values='threshold')
    .reset_index()
    .rename_axis(None, axis=1)
)

# cells (row 0, col 1), (row 1, col 2) and (row 2, col 0) of the last three columns
cross_domain_values = analysis_domain_cross_analysis.iloc[:,-3:].to_numpy()[[0, 1, 2], [1, 2, 0]].tolist()
_, fig = create_crossdomain_donut(cross_domain_values, 0, 1)
fig.update_layout(title = '', showlegend = False)
fig.show()