# Load and modify data

In [1]:
# Step 1: find occurrences of instances in bag of words of papers
import pandas as pd
import os
import json
import numpy as np

def csv_to_dict_of_sets(csv_file):
    dict_of_sets = {}
    # try:
    #     df = pd.read_csv(csv_file)
    # except pd.errors.ParserError:
    #     print("Error parsing CSV file. Trying again with 'error_bad_lines=False'")
    try:
        df = pd.read_csv(csv_file, on_bad_lines='warn', delimiter=";",  encoding="utf8")
    except:
        print("Error parsing CSV file. Trying again with 'encoding=ISO-8859-1'")
        df = pd.read_csv(csv_file, on_bad_lines='warn', delimiter=";", encoding='ISO-8859-1')
    for column in df.columns:
        dict_of_sets[column] = set(df[column].str.lower())
    # saved_column = df['process'] #you can also use df['column_name']
    # delete all that exists in two or more columns
    for key in dict_of_sets:
        for other_key in dict_of_sets:
            if key != other_key:
                dict_of_sets[key] = dict_of_sets[key].difference(dict_of_sets[other_key])
    return dict_of_sets

def count_occurrences(papers, instances):
    occurrences = np.zeros((len(papers), len(instances)), dtype=int)

    for p, paperpath in enumerate(papers.values()):
        with open(paperpath, 'r', encoding="utf8") as f:
            paper = json.load(f)
            for i, instance in enumerate(instances):
                present = True
                pieces = instance.split(' ')
                for piece in pieces:
                    if piece.lower() not in paper['bag_of_words']:
                        present = False
                        break
                    
                # if instance == "system integration":
                #     if "Liu und Hu - 2013 - A reuse oriented representation model for capturin" in paperpath:
                #         print(present)
                if present:
                    occurrences[p][i] = 1
    return occurrences

# ---------------------- Variables ----------------------

## instances: A list of all instances, regardless of their type
# first all type 1, then all type 2, etc.
# if possible, instance sare ordered by their occurrence
instances = []

## instances_dicts: A dictionary of all different types (columns) of instances
#
# types:
#  - process
#  - software
#  - data item
#  - data model
#  - data format specification
#  - interchange format
#  - source
#
# instances_dicts['process']: A set of all instances of the type 'process'
#
instance_types_dicts = {}

## paper_nlp_dict: A dictionary of all papers and their NLP data (as dict)
paper_nlp_paths = {}

## occurrences: A matrix of binary occurrences of instances in papers
#
# rows: papers
# columns: instances
# cells: 1 if instance is present in paper, 0 otherwise
#
paper_instance_occurrence_matrix = np.zeros((), dtype=int)

csv_file = 'C:/workspace/borgnetzwerk/tools/scripts/SLR/data.csv'
paperspath = 'G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/02_nlp'

# ---------------------- Main ----------------------

# Usage example

instance_types_dicts = csv_to_dict_of_sets(csv_file)

# delete sources from instances_dicts
if 'source' in instance_types_dicts:
    instance_types_dicts.pop('source')

# merge "interchange format" into "data format specification"
if 'interchange format' in instance_types_dicts:
    instance_types_dicts['data format specification'].update(instance_types_dicts['interchange format'])
    instance_types_dicts.pop('interchange format')

# merge all sets into one set
for instance_type in instance_types_dicts:
    instances += (instance_types_dicts[instance_type])

# drop all non-text instances
if np.nan in instances:
    instances.remove(np.nan)
# print(result)

paper_nlp_paths = {}
for file in os.listdir(paperspath):
    if file.endswith(".json"):
        paper_nlp_paths[file[:-5]] = os.path.join(paperspath, file)

papers = list(paper_nlp_paths.keys())

paper_instance_occurrence_matrix = count_occurrences(paper_nlp_paths, instances)


# free unneeded memory
del csv_file, file, instance_type, paperspath, paper_nlp_paths

Error parsing CSV file. Trying again with 'encoding=ISO-8859-1'


In [2]:
# Extract Paper Metadata
from bnw_tools.extract import util_zotero

def get_paper_metadata(papers, path):
    papers_metadata = {}

    bib_resources = util_zotero.BibResources(path)

    for paper in papers:
        for entry in bib_resources.entries:
            if hasattr(bib_resources.entries[entry], 'file') and paper in bib_resources.entries[entry].file:
                papers_metadata[paper] = bib_resources.entries[entry].get_dict()
                del bib_resources.entries[entry]
                break


    print(f"{len(papers_metadata)} out of {len(papers)} papers have metadata.")

    return papers_metadata

papers_metadata = get_paper_metadata(papers, 'G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR')

Found no new Zotero export at G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR:
There should be a folder called 'files'
We now have 1035 PDFs stored at G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR\00_PDFs
1024 out of 1028 papers have metadata.


In [3]:
def sort_instances(matrix, instances, instance_type_dict=None, dump = False):
    # total occurrences of each instance
    instance_occurrences = {}
    for i, instance in enumerate(instances):
        instance_occurrences[instance] = matrix[:, i].sum()
    instance_occurrences = {k: float(v) for k, v in sorted(instance_occurrences.items(), key=lambda item: item[1], reverse=True) if v > 0}

    if dump:
        with open('instance_occurrences.json', 'w', encoding="utf-8") as f:
            json.dump(instance_occurrences, f, ensure_ascii=False, indent=4)

    if instance_type_dict is not None:
        # Instances should be sorted by their type
        type_lists = [[] for _ in range(len(instance_type_dict))]
        for instance in instance_occurrences:
            for type_ID, instance_type in enumerate(instance_type_dict):
                if instance in instance_type_dict[instance_type]:
                    type_lists[type_ID].append(instance)
        instances = [item for sublist in type_lists for item in sublist]
    return instances

def remove_zeros(matrix, columns=True, rows=True, row_lists=None, column_lists=None):
    # remove all columns that are all zeros
    if columns:
        deleted_columns = np.all(matrix == 0, axis=0)
        matrix = matrix[:, ~np.all(matrix == 0, axis=0)]

    # remove all rows that are all zeros
    if rows:
        deleted_rows = np.all(matrix == 0, axis=1)
        matrix = matrix[~np.all(matrix == 0, axis=1)]

    
    return matrix, [deleted_columns, deleted_rows]

def update_instances(matrix, instances, instance_type_dict=None, dump = False):
    instances = sort_instances(matrix, instances, instance_type_dict, dump)
    matrix, deletions = remove_zeros(matrix)
    return matrix, instances, deletions

paper_instance_occurrence_matrix, instances, deletions = update_instances(paper_instance_occurrence_matrix, instances, instance_types_dicts)

def handle_deletions(input, deletions, rows = True):
    """
    input: list, dict or np.ndarray
    deletions: list of bools
    rows: if True, deletions[1] is used, else deletions[0]
    """
    delID = 1 if rows else 0

    if deletions[delID].any():
        # rows were deleted, in this case: papers
        if isinstance(input, list):
            input = [item for i, item in enumerate(input) if not deletions[delID][i]]
        elif isinstance(input, dict):
            input = {key: item for i, (key, item) in enumerate(input.items()) if not deletions[delID][i]}
        elif isinstance(input, np.ndarray):
            input = input[~deletions[delID]]
    return input

papers = handle_deletions(papers, deletions)
# free unneeded memory
del deletions

In [4]:
# Step 2: find occurrences of instances in full text of papers
GAP_TOO_LARGE_THRESHOLD = 1000

# get all text files
def get_paper_full_text(directory):
    paper_full_text = {}
    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if os.path.isdir(folder_path):
            for file in os.listdir(folder_path):
                if file.endswith(".txt"):
                    file_path = os.path.join(folder_path, file)
                    paper_full_text[file[:-4]] = file_path
                    break

    return paper_full_text

paper_full_text = get_paper_full_text('G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/00_PDFs')

def find_pos_in_paper(papers, paper_full_text, instances, paper_instance_occurrence_matrix):
    # find all occurrences of instances in text files
    pos_in_paper = {}

    for paperID, paper in enumerate(papers):
        if paperID % 100 == 0:
            # print(f"Processing paper {paperID} of {len(papers)}")
            continue
        if paper in paper_full_text:
            # Full text of paper is available
            pos_in_paper[paper] = {}
            with open(paper_full_text[paper], 'r', encoding="utf8") as f:
                text = f.read().lower()
                for i, instance in enumerate(instances):
                    # if this instance is not in this document, move on.
                    if not paper_instance_occurrence_matrix[paperID][i]:
                        continue

                    pieces = instance.split(' ')
                    for piece in pieces:
                        piece = piece.lower()
                        if piece not in pos_in_paper[paper]:
                            pos_in_paper[paper][piece] = []
                            pos = 1
                            while pos > 0:
                                pos = text.find(piece, pos)
                                if pos != -1:
                                    pos_in_paper[paper][piece].append(pos)
                                    # make sure this instance cannot be found again
                                    pos += 1
                                    # Idea: store the sentence in which the instance was found
    return pos_in_paper

pos_in_paper = find_pos_in_paper(papers, paper_full_text, instances, paper_instance_occurrence_matrix)

# free unneeded memory
# del paper_full_text

In [5]:
# Step 3: find the gap between the pieces of an instance
import sys

def find_min_distance(lists):
    #TODO: currently, this does not consider stemmed words

    # Initialize pointers for each of the lists
    pointers = [0] * len(lists)
    min_distance = sys.maxsize
    for list in lists:
        if not list:
            # There are cases where e.g. "system integration" is not found in full text
            # This happens when NLP converts e.g. "integrated" to "integration"
            # example:
            # "Liu und Hu - 2013 - A reuse oriented representation model for capturin"
            # "system integration" -> "integration" is not found in the full text
            return -1
    while True:
        # Get the current elements from the lists
        current_elements = [lists[i][pointers[i]] for i in range(len(lists))]
        
        # Calculate the current distance
        current_min = min(current_elements)
        current_max = max(current_elements)
        current_distance = current_max - current_min
        
        # Update the minimum distance
        if current_distance < min_distance:
            min_distance = current_distance
            
        # Check if we can move forward in the list containing the minimum element
        min_index = current_elements.index(current_min)
        
        # If the pointer exceeds its list length, exit the loop
        for i in range(len(lists)):
            if pointers[i] < len(lists[i]) - 1:
                break
        if pointers[min_index] + 1 >= len(lists[min_index]):
            break
        
        # Otherwise, increment the pointer
        pointers[min_index] += 1
    
    return min_distance

# # Test the function with the given lists
# lists = [[1, 2, 3, 2, 1000], [50, 1001], [100, 1002, 10000]]
# print(find_min_distance(lists))

def split_string(string, delimiters = [" ", "-"]):
    for delimiter in delimiters:
        string = " ".join(string.split(delimiter))
    return string.split()

def find_instance_piece_gap(papers, paper_full_text, instances, paper_instance_occurrence_matrix, pos_in_paper):
    error_matrix = np.zeros(paper_instance_occurrence_matrix.shape, dtype=int)
    instance_piece_gap = {}
    for paperID, paper in enumerate(papers):
        if paperID % 100 == 0:
            # print(f"Processing paper {paperID} of {len(papers)}")
            continue
        if paper in paper_full_text:
            for i, instance in enumerate(instances):
                # if this instance is not in this document, move on.
                #TODO This does not work
                if not paper_instance_occurrence_matrix[paperID][i]:
                    continue

                pieces = split_string(instance)

                if len(pieces) > 1:
                    # print(f"Processing {instance} in {paper}")
                    candidate_postions = []
                    for piece in pieces:
                        candidate_postions.append(pos_in_paper[paper][piece])
                    min_distance = find_min_distance(candidate_postions)

                    # min_distance_nested = find_min_distance_nested(candidate_postions)
                    # print(f"{instance}: {min_distance} vs {min_distance_nested}")
                    # if min_distance != min_distance_nested:
                    #     print(f"Error: {min_distance} != {min_distance_nested}")

                    # Pieces to far apart are not counted
                    if min_distance > GAP_TOO_LARGE_THRESHOLD:
                        # print(f"Gap for {instance} in {paper} ({min_distance} > {GAP_TOO_LARGE_THRESHOLD})")
                        paper_instance_occurrence_matrix[paperID][i] = 0
                        error_matrix[paperID][i] = min_distance
                    
                    # Some pieces may not be found in the full text
                    if min_distance == -1:
                        # print(f"{instance} not found in {paper} at all")
                        paper_instance_occurrence_matrix[paperID][i] = 0
                        error_matrix[paperID][i] = min_distance
                        # for these, we do not store the gap                    
                        continue

                    if instance not in instance_piece_gap:
                        instance_piece_gap[instance] = {}
                    instance_piece_gap[instance][paper] = min_distance
    return instance_piece_gap, error_matrix

instance_piece_gap, error_matrix = find_instance_piece_gap(papers, paper_full_text, instances, paper_instance_occurrence_matrix, pos_in_paper)

error_matrix, has_error = remove_zeros(error_matrix)
error_papers = handle_deletions(papers, has_error)
error_instances = handle_deletions(instances, has_error, rows = False)

paper_instance_occurrence_matrix, instances, deletions = update_instances(paper_instance_occurrence_matrix, instances, instance_types_dicts)

papers = handle_deletions(papers, deletions)
pos_in_paper = handle_deletions(pos_in_paper, deletions)

instance_instance_co_occurrence_matrix = np.dot(paper_instance_occurrence_matrix.T, paper_instance_occurrence_matrix)

# free unneeded memory
del deletions, has_error

# Setup Complete

We now have:

| Variable                          | Type    | Size         | Comments |
|-----------------------------------|---------|--------------|----------|
| error_instances                   | list    | 165          | Comments |
| error_matrix                      | ndarray | (999, 165)   | Comments |
| error_papers                      | list    | 999          | Comments |
| gap_too_large_threshold           | int     | n.a.         | Comments |
| instance_piece_gap                | dict    | 151          | Comments |
| instance_types_dicts              | dict    | 5            | Comments |
| instances                         | list    | 315          | Comments |
| paper_full_text                   | dict    | 1029         | Comments |
| paper_instance_occurrence_matrix  | ndarray | (1003, 315)  | Comments |
| papers                            | list    | 1003         | Comments |
| pos_in_paper                      | dict    | 1003         | Comments |

Consisting of:
* The paper_instance_occurrence_matrix, binary listing if a term (instance) is present in a paper
  * papers x instances
* The error_matrix, of all instances that were dropped from the paper_instance_occurrence_matrix
  * error_papers x error_instances

And some leftover variables:
* instance_types_dicts, listing all instance types ("process", "software", ...) and their respective instance sets ("Curation", "Knowledge Work", ...)
* paper_full_text, containing each papers full text
  * pos_in_paper, listing for each paper: for each instance: each position of that instance in that papers full text.
* instance_piece_gap, a dict listing all instances made up from compound words (e.g. "Knowledge Work", and their minimum distance in each papers full text)
  * gap_too_large_threshold, defining how far appart a finding of "Knowledge" and "Work" would qualify as "Knowledge Work"

In [6]:
# compare proximity of all instances with one antoher
def calculate_proximity_matrix(pos_in_paper, instances):

    # create a np zeros matrix of size instances x instances
    instance_instance_proximity_matrix = np.zeros((len(instances), len(instances)), dtype=float)

    mode = "sqrt"
    # alternatives are:
    # "sqrt" - 1 / (square root of the distance)
    # "linear" - 1 / distance
    # "binary" - 1 if distance < MAX_GAP_THRESHOLD, 0 otherwise
    # "log" - 1 / log(distance) 

    for paper in pos_in_paper:
        for id1, instance1 in enumerate(instances):
            for id2, instance2 in enumerate(instances):
                if instance1 != instance2:
                    if instance1 not in pos_in_paper[paper]:
                        continue
                    if instance2 not in pos_in_paper[paper]:
                        continue
                    positions1 = pos_in_paper[paper][instance1]
                    positions2 = pos_in_paper[paper][instance2]
                    if positions1 and positions2:
                        distance = find_min_distance([positions1, positions2])
                        if distance < 0:
                            # print(f"Error: {instance1} and {instance2} not found in {paper}")
                            continue
                        
                        result = 0.0
                        if distance == 0:
                            result = 1
                        elif mode == "sqrt":
                            result = 1 / np.sqrt(distance)
                        elif mode == "linear":
                            result = 1 / distance
                        elif mode == "binary":
                            result = 1 if distance < GAP_TOO_LARGE_THRESHOLD else 0
                        elif mode == "log":
                            result = 1 / np.log(distance)
                        else:
                            print("Error: unknown mode")
                            break
                        if result > 0.0:
                            instance_instance_proximity_matrix[id1][id2] += result

    #TODO rest doesnt seem to work, short fix implemented:
    # create a copy of labels that only contains instances that are in the proximity matrix

    instance_instance_proximity_matrix, deletions = remove_zeros(instance_instance_proximity_matrix)
    proximity_instances = handle_deletions(instances, deletions, rows=False)
    
    return instance_instance_proximity_matrix, proximity_instances

instance_instance_proximity_matrix, proximity_instances = calculate_proximity_matrix(pos_in_paper, instances)

# ToDo for quantitative analysis
* tf-idf only on terms
* arrange the papers on a timeline and identify the flow of:
  * Processes
  * File formats
  * software
  * ...
  * Compare this to goolge trends

In [7]:
from mlxtend.frequent_patterns import apriori

# AttributeError: 'numpy.ndarray' object has no attribute 'dtypes'
dataframe = pd.DataFrame(paper_instance_occurrence_matrix, columns=instances).astype(bool)

# for each process:
# create one res

res = apriori(dataframe, min_support=0.4, use_colnames=True, max_len=2)

# visualize res
res = res.sort_values(by='support', ascending=False)
res = res.reset_index(drop=True)
# res

In [8]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(res)
# sort rules by confidence
rules = rules.sort_values(by='confidence', ascending=False)
# rules = rules.sort_values(by='lift', ascending=False) # (propably most important)
# rules = rules.sort_values(by='leverage', ascending=False)
# export rules to csv

In [9]:
def identify_cross_type_rules(rules):
    cross_type = [False] * len(rules)

    for i, antecentent in enumerate(rules.antecedents):
        antecentent, = antecentent
        consequent, = rules.iloc[i].consequents
        type1, type2 = None, None
        for type in instance_types_dicts:
            if antecentent in instance_types_dicts[type]:
                type1 = type
            if consequent in instance_types_dicts[type]:
                type2 = type
            if type1 and type2:
                break
        if type1 != type2:
            cross_type[i] = True
            # print(rules.iloc[i])

    # create a copy for all rules that are cross type
    cross_type_rules = rules[cross_type].copy()
    return cross_type_rules

cross_type_rules = identify_cross_type_rules(rules)

# Output

## Setup

In [10]:
for_git = True
visualize = True

if for_git:
    visualize = False

## Functions

In [11]:
# represent a dict
import csv
import os
from itables import init_notebook_mode, show

# better represent dataframes
if not for_git:
    init_notebook_mode(all_interactive=True)

def get_output_path(path = None):
    if path is not None:
        return path
    else:
        # return 'C:/workspace/borgnetzwerk/tools/scripts/SLR/'
        return ''
    
def process_dict(input_dict, filename="some_dict", path=None):
    # convert all sets to lists
    for key in input_dict:
        if isinstance(input_dict[key], set):
            input_dict[key] = list(input_dict[key])

    if path is None:
        path = get_output_path()

    with open(filename + '.json', 'w', encoding="utf-8") as f:
        json.dump(input_dict, f, ensure_ascii=False, indent=4)

        
    container = [
        ["Instance", "Min", "Max", "Mean", "Median", "Std"]
    ]

    for instance, papers in instance_piece_gap.items():

        # print(f"Instance: {instance}")
        gaps = papers.values()
        # generate all kinds of statistical values
        min_gap = min(gaps)
        max_gap = max(gaps)
        mean_gap = sum(gaps) / len(gaps)
        median_gap = np.median(list(gaps))
        std_gap = np.std(list(gaps))
        container.append([instance, min_gap, max_gap, mean_gap, median_gap, std_gap])

    filepath = os.path.join(path, filename)

    # write to csv
    with open(filepath + ".csv", 'w', newline='') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerows(container)   

def process_dataframe(input_df, name = "some_df", path=None):
    if path is None:
        path = get_output_path()
    filepath = os.path.join(path, name)
    input_df.to_csv(filepath + '.csv', sep=';', decimal=',')
    show(input_df)

In [12]:
# visualize co-occurrences
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import math

def visualize_matrix(matrix: np.ndarray, rows: list[str], columns: list[str] = None, name: str = 'some_matrix', format = '.png') -> None:
    """
    Visualizes a matrix as a heatmap.
    matrix: The matrix to visualize
    rows: The labels for the rows
    columns: The labels for the columns
    name: The name of the file to save
    format: The format of the file to save (default: '.png', also accepts '.svg' and '.pdf', also accepts a list of formats)
    """
    if columns is None:
        columns = rows

    ## Calculate the maximum size of the plot
    dpi = 300
    max_pixel = 2**16  # Maximum size in any direction
    max_size = max_pixel / dpi  # Maximum size in any direction
    max_size_total = max_size * max_size # Maximum size in total
    max_size_total *= 0.05 # produce smaller files

    # Experience value of space required per cell
    factor = 0.18
    size_x: float = 2 + len(columns) * factor
    size_y: float = 2 + len(rows) * factor

    while size_x * size_y < max_size_total and dpi < 600:
        dpi /= 0.95 
        max_size_total *= 0.95

    if dpi > 600:
        dpi = 600

    while size_x * size_y > max_size_total:
        dpi *= 0.95 
        max_size_total /= 0.95

    fig, ax = plt.subplots(figsize=(size_x, size_y), dpi=dpi)

    cax = ax.matshow(matrix, cmap='viridis')

    # use labels from instance_occurrences
    ax.set_xticks(range(len(columns)))
    ax.set_xticklabels(list(columns), fontsize=10, rotation=90)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(list(rows), fontsize=10)

    # # adjust the spacing between the labels
    # plt.gca().tick_params(axis='x', which='major', pad=15)
    # plt.gca().tick_params(axis='y', which='major', pad=15)

    # show the number of co-occurrences in each cell, if greater than 0
    for i in range(len(matrix)):
        for j in range(len(matrix[i])):
            if matrix[i, j] == 0:
                continue
            # if co_occurrences[i, j] > 100:
            #     continue
            plt.text(j, i, round(matrix[i, j], 2), ha='center', va='center', color='white', fontsize=4)

    # plt.show()
    fig.tight_layout()
    if isinstance(format, list):
        for f in format:
            if f[0] != '.':
                f = '.' + f
            fig.savefig(name + f)
    else:
        if format[0] != '.':
            format = '.' + format
        fig.savefig(name + format)

def visualize_matrix_graph(matrix, instances, instance_types_dicts, name='some_matrix_graph', path=None):
    path = get_output_path(path)

    SEED = 17
    K_SPRRING = 18

    scale = len(instances) * .12
    # Create a new figure
    x = scale / 10 * 16
    y = scale / 10 * 9
    fig = plt.figure(figsize=(x, y))

    # normalize the proximity matrix
    matrix = matrix / matrix.max()

    mode = "sqrt"

    # alternatives are:
    # "linear" - take proximity as is
    # "sqrt" - sqrt(proximity)
    # "log" - log(proximity)
    if mode == "log":
        nodesize_map = [np.log(matrix[:, i].sum() + 1) for i in range(len(instances))]
    elif mode == "sqrt":
        nodesize_map = [np.sqrt(matrix[:, i].sum()) for i in range(len(instances))]
    elif mode == "linear":
        nodesize_map = [matrix[:, i].sum()for i in range(len(instances))]
    else:
        nodesize_map = [matrix[:, i].sum() for i in range(len(instances))]
        
    # print(max(nodesize_map))
    # print(min(nodesize_map))

    nodesize_map = np.array(nodesize_map) / max(nodesize_map) * 1000

    # print(max(nodesize_map))
    # print(min(nodesize_map))

    # take the root of the proximity matrix
    while np.min(matrix[np.nonzero(matrix)]) < 1/10:
        matrix = np.sqrt(matrix)

    # Create a graph from the proximity matrix
    G = nx.from_numpy_array(matrix)

    # Specify the layout
    pos = nx.spring_layout(G, seed=SEED, k=K_SPRRING/math.sqrt(G.order()))  # Seed for reproducibility

    color_map = []

    color = {
        "process": "#1f77b4",  # muted blue
        "software": "#ff7f0e",  # safety orange
        "data item": "#2ca02c",  # cooked asparagus green
        "data model": "#d62728",  # brick red
        "data format specification": "#9467bd",  # muted purple
        "interchange format": "#8c564b",  # chestnut brown
        # "source": "#e377c2",  # raspberry yogurt pink
    }

    for instance in instances:
        added = False
        for instance_type in instance_types_dicts:
            if instance in instance_types_dicts[instance_type]:
                color_map.append(color[instance_type])
                added = True
                break
        if not added:
            color_map.append("grey")

    # Draw the graph
    options = {
        "edge_color": "grey",
        "linewidths": 0.5,
        "width": 0.5,
        "with_labels": True,  # This will add labels to the nodes
        "labels": {i: label for i, label in enumerate(instances)},
        "node_color": color_map,
        "node_size": nodesize_map,
        # "edge_color": "white",
        # "alpha": 0.9,
    }

    # print(nx.is_weighted(G))


    # nx.set_edge_attributes(G, values = 1, name = 'weight')

    nx.draw(G, pos, **options, ax=fig.add_subplot(111))

    # Make the graph more spacious
    fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.9)

    # Create a patch for each color
    patches = [mpatches.Patch(color=color[key], label=key) for key in color]

    # Add the legend to the graph
    plt.legend(handles=patches, loc='upper right', fontsize='x-large')


    plt.show()

    # save plot to file
    fig.savefig(path + name + '.png')
    fig.savefig(path + name + '.svg')

    # nx.get_edge_attributes(G, 'weight')

def sankey(matrix, instances, instance_types_dicts, name='some_sankey', path = None):
    #TODO: Implement a method to create one graph per Process
    path = get_output_path(path)
    # Convert the proximity matrix into a list of source nodes, target nodes, and values
    sources = []
    targets = []
    values = []

    x_pos=[0] * len(instances)
    y_pos=[0] * len(instances)
    color_map=[0] * len(instances)

    max_types = len(instance_types_dicts)
    type_positions = [0.1 + (i / max_types) * 0.8 for i in range(max_types)]

    color = {
        "process": "#1f77b4",  # muted blue
        "software": "#ff7f0e",  # safety orange
        "data item": "#2ca02c",  # cooked asparagus green
        "data model": "#d62728",  # brick red
        "data format specification": "#9467bd",  # muted purple
        "interchange format": "#8c564b",  # chestnut brown
        # "source": "#e377c2",  # raspberry yogurt pink
    }
    color = list(color.values())

    space = {}

    for i in range(matrix.shape[0]):
        source_type = None

        for j in range(matrix.shape[1]):
            target_type = None
            
            for type_depth, type in enumerate(instance_types_dicts):
                if instances[i] in instance_types_dicts[type]:
                    source_type = type_depth
                if proximity_instances[j] in instance_types_dicts[type]:
                    target_type = type_depth

            # only keep directly forward moving connections
            if target_type - source_type != 1:
                continue

            # only keep forward moving connections
            if target_type - source_type <= 0:
                continue

            if source_type not in space:
                space[source_type] = {}
            if i not in space[source_type]:
                space[source_type][i] = 0
            space[source_type][i] += matrix[i][j]
            
            if target_type not in space:
                space[target_type] = {}
            if j not in space[target_type]:
                space[target_type][j] = 0
            space[target_type][j] += matrix[i][j]

            x_pos[i] = type_positions[source_type]
            x_pos[j] = type_positions[target_type]
            color_map[i] = color[source_type]
            color_map[j] = color[target_type]
            if matrix[i][j] > 0.0:  # Ignore zero values
                sources.append(i)
                targets.append(j)
                values.append(matrix[i][j])

    for type in space:
        sum_values = sum(space[type].values())
        space[type] = {k: v/sum_values for k, v in sorted(space[type].items(), key=lambda item: item[1], reverse=True)}

    # assign each instance a proper y position
    for type in space:
        bottom = 0.1
        for i, instance in enumerate(space[type]):
            y_pos[instance] = bottom
            bottom += space[type][instance] * 0.8

    nodes = dict(
        # pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=proximity_instances,  # Use your labels here
        color=color_map,
        x=x_pos,
        y=y_pos,
        align="right",
    )

    # Create a Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=nodes,
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])


    fig.update_layout(width=1920, height=1080)


    fig.update_layout(title_text="Sankey Diagram", font_size=10)
    # fig.show()
    fig.write_image(path + name + '.png')
    fig.write_image(path + name + '.svg')
    fig.write_html(path + name + 'sankey.html')

# Represent a matrix
def matrix_processing(matrix, rows, columns=None, name = 'some_matrix', visualize = True, path = None, instance_types_dicts = None):
    if columns is None:
        columns = rows
    if path is None:
        path = get_output_path()
    df = pd.DataFrame(matrix, columns=columns, index=rows)
    df.to_csv(name + '.csv', sep=';')
    if visualize:
        if instance_types_dicts:
            sankey(matrix, rows, instance_types_dicts, name)
            visualize_matrix_graph(matrix, rows, instance_types_dicts, name)
        visualize_matrix(matrix, rows, columns, name)

## Files

In [13]:
# All Dicts: instance_types_dicts, papers_metadata, instance_piece_gap
process_dict(instance_types_dicts, 'instance_types_dicts')
process_dict(papers_metadata, 'papers_metadata')
process_dict(instance_piece_gap, 'instance_piece_gaps')

### Rules

In [14]:
process_dataframe(rules, 'rules')
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
Loading ITables v2.1.1 from the internet... (need help?),,,,,,,,,,


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
75,(knowledge based engineering),(external web page),0.634731,0.816367,0.634731,1.000000,1.224939,0.116557,inf,0.502732
22,(object definitions),(external web page),0.749501,0.816367,0.749501,1.000000,1.224939,0.137633,inf,0.733068
47,(analysis),(pattern analyis),0.691617,0.692615,0.691617,1.000000,1.443804,0.212593,inf,0.996764
120,(descriptive information generation),(system identification),0.575848,0.925150,0.575848,1.000000,1.080906,0.043102,inf,0.176471
48,(pattern analyis),(analysis),0.692615,0.691617,0.691617,0.998559,1.443804,0.212593,214.017964,1.000000
...,...,...,...,...,...,...,...,...,...,...
114,(calculating),(training),0.725549,0.729541,0.583832,0.804677,1.102991,0.054515,1.384674,0.340221
166,(sas),(yawl),0.629741,0.715569,0.505988,0.803487,1.122864,0.055365,1.447387,0.295523
112,(training),(object definitions),0.729541,0.749501,0.584830,0.801642,1.069567,0.038039,1.262860,0.240488
165,(jpg),(training),0.632735,0.729541,0.506986,0.801262,1.098310,0.045380,1.360881,0.243720


In [15]:
process_dataframe(cross_type_rules, 'cross_type_rules')
cross_type_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
Loading ITables v2.1.1 from the internet... (need help?),,,,,,,,,,


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
75,(knowledge based engineering),(external web page),0.634731,0.816367,0.634731,1.000000,1.224939,0.116557,inf,0.502732
83,(swf software package rce),(physical sensing),0.631737,0.918164,0.624750,0.988942,1.077086,0.044713,7.400342,0.194343
259,(pykechain),(physical sensing),0.416168,0.918164,0.408184,0.980815,1.068236,0.026074,4.265719,0.109410
136,(sumo),(system identification),0.556886,0.925150,0.545908,0.980287,1.059598,0.030705,3.796952,0.126933
183,(difficulty),(system identification),0.495010,0.925150,0.481038,0.971774,1.050397,0.023080,2.651839,0.095009
...,...,...,...,...,...,...,...,...,...,...
233,(outlook),(test report),0.539920,0.734531,0.435130,0.805915,1.097183,0.038542,1.367798,0.192521
261,(knowledge interaction),(object definitions),0.505988,0.749501,0.407186,0.804734,1.073693,0.027947,1.282859,0.138933
166,(sas),(yawl),0.629741,0.715569,0.505988,0.803487,1.122864,0.055365,1.447387,0.295523
112,(training),(object definitions),0.729541,0.749501,0.584830,0.801642,1.069567,0.038039,1.262860,0.240488


### Paper x Instance

In [16]:
matrix_processing(paper_instance_occurrence_matrix, rows=papers, columns=instances, name='paper_instance_occurrence_matrix', visualize=visualize)

In [17]:
matrix_processing(error_matrix, rows=error_papers, columns=error_instances, name='error_matrix', visualize=visualize)

### Instance x Instance

In [18]:
matrix_processing(instance_instance_co_occurrence_matrix, rows=instances, columns=instances, name='instance_instance_co_occurrence_matrix', visualize=visualize)

In [19]:
matrix_processing(instance_instance_proximity_matrix, rows=proximity_instances, columns=proximity_instances, name='proximity', visualize=visualize)

# Later
* Word Embeedding
  * Find out, that jpeg and png are similar

# Much Later
Maybe, just maybe, make 
* Paper classes
* Subclasses of paper classes
* model which process is a subprocess of another process