# Load and modify data

In [1]:
# Setup

for_git = True
visualize = True
CSV_SEPARATOR = ';'
CSV_DECIMAL = ','

if for_git:
    visualize = False
    CSV_SEPARATOR = ','
    CSV_DECIMAL = '.'
    
def get_output_path(path = None):
    if path is not None:
        return path
    else:
        # return 'C:/workspace/borgnetzwerk/tools/scripts/SLR/'
        return 'output/'
    


In [2]:
# Step 1: find occurrences of instances in bag of words of papers
import pandas as pd
import os
import json
import numpy as np


def csv_to_dict_of_sets(csv_file):
    dict_of_sets = {}
    # try:
    #     df = pd.read_csv(csv_file)
    # except pd.errors.ParserError:
    #     print("Error parsing CSV file. Trying again with 'error_bad_lines=False'")
    # TODO: Specify modular separator and decimal here as well 
    sep = ';'
    try:
        df = pd.read_csv(csv_file, on_bad_lines='warn', delimiter=sep,  encoding="utf8")
    except:
        print("Error parsing CSV file. Trying again with 'encoding=ISO-8859-1'")
        df = pd.read_csv(csv_file, on_bad_lines='warn', delimiter=sep, encoding='ISO-8859-1')
    for column in df.columns:
        dict_of_sets[column] = set(df[column].str.lower())
    # saved_column = df['process'] #you can also use df['column_name']
    # delete all that exists in two or more columns
    for key in dict_of_sets:
        for other_key in dict_of_sets:
            if key != other_key:
                dict_of_sets[key] = dict_of_sets[key].difference(dict_of_sets[other_key])
    return dict_of_sets

def count_occurrences(papers, instances):
    occurrences = np.zeros((len(papers), len(instances)), dtype=int)

    for p, paperpath in enumerate(papers.values()):
        with open(paperpath, 'r', encoding="utf8") as f:
            paper = json.load(f)
            for i, instance in enumerate(instances):
                present = True
                pieces = instance.split(' ')
                for piece in pieces:
                    if piece.lower() not in paper['bag_of_words']:
                        present = False
                        break
                    
                # if instance == "system integration":
                #     if "Liu und Hu - 2013 - A reuse oriented representation model for capturin" in paperpath:
                #         print(present)
                if present:
                    occurrences[p][i] = 1
    return occurrences

# ---------------------- Variables ----------------------

## instances: A list of all instances, regardless of their type
# first all type 1, then all type 2, etc.
# if possible, instance sare ordered by their occurrence
instances = []

## instances_dicts: A dictionary of all different types (columns) of instances
#
# types:
#  - process
#  - software
#  - data item
#  - data model
#  - data format specification
#  - interchange format
#  - source
#
# instances_dicts['process']: A set of all instances of the type 'process'
#
instance_types_dicts = {}

## paper_nlp_dict: A dictionary of all papers and their NLP data (as dict)
paper_nlp_paths = {}

## occurrences: A matrix of binary occurrences of instances in papers
#
# rows: papers
# columns: instances
# cells: 1 if instance is present in paper, 0 otherwise
#
paper_instance_occurrence_matrix = np.zeros((), dtype=int)

csv_file = 'C:/workspace/borgnetzwerk/tools/scripts/SLR/data.csv'
paperspath = 'G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/02_nlp'

# ---------------------- Main ----------------------

# Usage example

instance_types_dicts = csv_to_dict_of_sets(csv_file)

# delete sources from instances_dicts
if 'source' in instance_types_dicts:
    instance_types_dicts.pop('source')

# merge "interchange format" into "data format specification"
if 'interchange format' in instance_types_dicts:
    instance_types_dicts['data format specification'].update(instance_types_dicts['interchange format'])
    instance_types_dicts.pop('interchange format')

# merge all sets into one set
for instance_type in instance_types_dicts:
    instances += (instance_types_dicts[instance_type])

# drop all non-text instances
if np.nan in instances:
    instances.remove(np.nan)
# print(result)

paper_nlp_paths = {}
for file in os.listdir(paperspath):
    if file.endswith(".json"):
        paper_nlp_paths[file[:-5]] = os.path.join(paperspath, file)

papers = list(paper_nlp_paths.keys())

paper_instance_occurrence_matrix = count_occurrences(paper_nlp_paths, instances)

# free unneeded memory
del csv_file, file, instance_type, paperspath, paper_nlp_paths

Error parsing CSV file. Trying again with 'encoding=ISO-8859-1'


In [3]:
# debug
# 
ike_index = instances.index("ikewiki")
sum_ikewiki = np.sum(paper_instance_occurrence_matrix[:,ike_index])
sum_ikewiki

1

In [4]:
def sort_instances(matrix, instances, instance_type_dict=None):
    # total occurrences of each instance
    instance_occurrences = {}
    
    for i, instance in enumerate(instances):
        instance_occurrences[instance] = matrix[:, i].sum()
    sorted_instances = {k: float(v) for k, v in sorted(instance_occurrences.items(), key=lambda item: item[1], reverse=True) if v > 0}
    filepath = get_output_path() + 'instance_occurrences'
    with open(filepath + '.json', 'w', encoding="utf-8") as f:
        json.dump(sorted_instances, f, ensure_ascii=False, indent=4)

    sorted_instance_list = list(sorted_instances.keys())


    if instance_type_dict is not None:
        # Instances should be sorted by their type
        type_lists = [[] for _ in range(len(instance_type_dict))]
        for instance in sorted_instance_list:
            for type_ID, instance_type in enumerate(instance_type_dict):
                if instance in instance_type_dict[instance_type]:
                    type_lists[type_ID].append(instance)
        type_sorted_instances = [item for sublist in type_lists for item in sublist]

    new_order = [0] * len(sorted_instance_list)
    for i, instance in enumerate(type_sorted_instances):
        new_order[i] = instances.index(instance)

    return type_sorted_instances, new_order

def remove_zeros(matrix, columns=True, rows=True, row_lists=None, column_lists=None):
    # remove all columns that are all zeros
    if columns:
        deleted_columns = np.all(matrix == 0, axis=0)
        matrix = matrix[:, ~np.all(matrix == 0, axis=0)]

    # remove all rows that are all zeros
    if rows:
        deleted_rows = np.all(matrix == 0, axis=1)
        matrix = matrix[~np.all(matrix == 0, axis=1)]

    
    return matrix, [deleted_columns, deleted_rows]

def update_instances(matrix, instances, instance_type_dict=None):
    instances, new_order = sort_instances(matrix, instances, instance_type_dict)

    new_order = np.array(new_order)
    matrix = matrix[:, new_order]
    
    matrix, deletions = remove_zeros(matrix)
    return matrix, instances, deletions

paper_instance_occurrence_matrix, instances, deletions = update_instances(paper_instance_occurrence_matrix, instances, instance_types_dicts)

def handle_deletions(input, deletions, rows = True):
    """
    input: list, dict or np.ndarray
    deletions: list of bools
    rows: if True, deletions[1] is used, else deletions[0]
    """
    delID = 1 if rows else 0

    if deletions[delID].any():
        # rows were deleted, in this case: papers
        if isinstance(input, list):
            input = [item for i, item in enumerate(input) if not deletions[delID][i]]
        elif isinstance(input, dict):
            input = {key: item for i, (key, item) in enumerate(input.items()) if not deletions[delID][i]}
        elif isinstance(input, np.ndarray):
            input = input[~deletions[delID]]
    return input

papers = handle_deletions(papers, deletions)
# free unneeded memory
del deletions

In [5]:
# debug
# get the sum of "ikewiki" in the occurrences
ike_index = instances.index("ikewiki")
sum_ikewiki = np.sum(paper_instance_occurrence_matrix[:,ike_index])
sum_ikewiki

1

In [6]:
# Step 2: find occurrences of instances in full text of papers
GAP_TOO_LARGE_THRESHOLD = 1000

# get all text files
def get_paper_full_text(directory):
    paper_full_text = {}
    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if os.path.isdir(folder_path):
            for file in os.listdir(folder_path):
                if file.endswith(".txt"):
                    file_path = os.path.join(folder_path, file)
                    paper_full_text[file[:-4]] = file_path
                    break

    return paper_full_text

paper_full_text = get_paper_full_text('G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/00_PDFs')

def find_pos_in_paper(papers, paper_full_text, instances, paper_instance_occurrence_matrix):
    # find all occurrences of instances in text files
    pos_in_paper = {}

    for paperID, paper in enumerate(papers):
        if paperID % 100 == 0:
            # print(f"Processing paper {paperID} of {len(papers)}")
            continue
        if paper in paper_full_text:
            # Full text of paper is available
            pos_in_paper[paper] = {}
            with open(paper_full_text[paper], 'r', encoding="utf8") as f:
                text = f.read().lower()
                for i, instance in enumerate(instances):
                    # if this instance is not in this document, move on.
                    if not paper_instance_occurrence_matrix[paperID][i]:
                        continue

                    pieces = instance.split(' ')
                    for piece in pieces:
                        piece = piece.lower()
                        if piece not in pos_in_paper[paper]:
                            pos_in_paper[paper][piece] = []
                            pos = 1
                            while pos > 0:
                                pos = text.find(piece, pos)
                                if pos != -1:
                                    pos_in_paper[paper][piece].append(pos)
                                    # make sure this instance cannot be found again
                                    pos += 1
                                    # Idea: store the sentence in which the instance was found
    return pos_in_paper

pos_in_paper = find_pos_in_paper(papers, paper_full_text, instances, paper_instance_occurrence_matrix)

# free unneeded memory
# del paper_full_text

In [7]:
# Step 3: find the gap between the pieces of an instance
import sys

def find_min_distance(lists):
    #TODO: currently, this does not consider stemmed words

    # Initialize pointers for each of the lists
    pointers = [0] * len(lists)
    min_distance = sys.maxsize
    for list in lists:
        if not list:
            # There are cases where e.g. "system integration" is not found in full text
            # This happens when NLP converts e.g. "integrated" to "integration"
            # example:
            # "Liu und Hu - 2013 - A reuse oriented representation model for capturin"
            # "system integration" -> "integration" is not found in the full text
            return -1
    while True:
        # Get the current elements from the lists
        current_elements = [lists[i][pointers[i]] for i in range(len(lists))]
        
        # Calculate the current distance
        current_min = min(current_elements)
        current_max = max(current_elements)
        current_distance = current_max - current_min
        
        # Update the minimum distance
        if current_distance < min_distance:
            min_distance = current_distance
            
        # Check if we can move forward in the list containing the minimum element
        min_index = current_elements.index(current_min)
        
        # If the pointer exceeds its list length, exit the loop
        for i in range(len(lists)):
            if pointers[i] < len(lists[i]) - 1:
                break
        if pointers[min_index] + 1 >= len(lists[min_index]):
            break
        
        # Otherwise, increment the pointer
        pointers[min_index] += 1
    
    return min_distance

# # Test the function with the given lists
# lists = [[1, 2, 3, 2, 1000], [50, 1001], [100, 1002, 10000]]
# print(find_min_distance(lists))

def split_string(string, delimiters = [" ", "-"]):
    for delimiter in delimiters:
        string = " ".join(string.split(delimiter))
    return string.split()

def find_instance_piece_gap(papers, paper_full_text, instances, paper_instance_occurrence_matrix, pos_in_paper):
    error_matrix = np.zeros(paper_instance_occurrence_matrix.shape, dtype=float)
    instance_piece_gap = {}
    for paperID, paper in enumerate(papers):
        if paperID % 100 == 0:
            # print(f"Processing paper {paperID} of {len(papers)}")
            continue
        if paper in paper_full_text:
            for i, instance in enumerate(instances):
                # if this instance is not in this document, move on.
                #TODO This does not work
                if not paper_instance_occurrence_matrix[paperID][i]:
                    continue

                pieces = split_string(instance)

                if len(pieces) > 1:
                    # print(f"Processing {instance} in {paper}")
                    candidate_postions = []
                    for piece in pieces:
                        candidate_postions.append(pos_in_paper[paper][piece])
                    min_distance = find_min_distance(candidate_postions)

                    # min_distance_nested = find_min_distance_nested(candidate_postions)
                    # print(f"{instance}: {min_distance} vs {min_distance_nested}")
                    # if min_distance != min_distance_nested:
                    #     print(f"Error: {min_distance} != {min_distance_nested}")

                    # Pieces to far apart are not counted
                    if min_distance > GAP_TOO_LARGE_THRESHOLD:
                        # print(f"Gap for {instance} in {paper} ({min_distance} > {GAP_TOO_LARGE_THRESHOLD})")
                        paper_instance_occurrence_matrix[paperID][i] = 0
                        # get log base 10 of min distance
                        error_matrix[paperID][i] = round(np.log10(min_distance), 1)
                    
                    # Some pieces may not be found in the full text
                    if min_distance == -1:
                        # print(f"{instance} not found in {paper} at all")
                        paper_instance_occurrence_matrix[paperID][i] = 0
                        error_matrix[paperID][i] = min_distance
                        # for these, we do not store the gap                    
                        continue

                    if instance not in instance_piece_gap:
                        instance_piece_gap[instance] = {}
                    instance_piece_gap[instance][paper] = min_distance
    return instance_piece_gap, error_matrix

instance_piece_gap, error_matrix = find_instance_piece_gap(papers, paper_full_text, instances, paper_instance_occurrence_matrix, pos_in_paper)

error_matrix, has_error = remove_zeros(error_matrix)
error_papers = handle_deletions(papers, has_error)
error_instances = handle_deletions(instances, has_error, rows = False)

paper_instance_occurrence_matrix, instances, deletions = update_instances(paper_instance_occurrence_matrix, instances, instance_types_dicts)

papers = handle_deletions(papers, deletions)
pos_in_paper = handle_deletions(pos_in_paper, deletions)

instance_instance_co_occurrence_matrix = np.dot(paper_instance_occurrence_matrix.T, paper_instance_occurrence_matrix)

# free unneeded memory
del deletions, has_error

In [8]:
# Extract Paper Metadata
from bnw_tools.extract import util_zotero

def get_paper_metadata(papers, path):
    papers_metadata = {}

    bib_resources = util_zotero.BibResources(path)

    for paper in papers:
        for entry in bib_resources.entries:
            if hasattr(bib_resources.entries[entry], 'file') and paper in bib_resources.entries[entry].file:
                papers_metadata[paper] = bib_resources.entries[entry].get_dict()
                del bib_resources.entries[entry]
                break


    print(f"{len(papers_metadata)} out of {len(papers)} papers have metadata.")

    return papers_metadata

papers_metadata = get_paper_metadata(papers, 'G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR')

Found no new Zotero export at G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR:
There should be a folder called 'files'
We now have 1035 PDFs stored at G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR\00_PDFs
1001 out of 1005 papers have metadata.


In [9]:
# Create year_paper_occurrence_matrix
def create_year_paper_occurrence_matrix(papers_metadata, paper_instance_occurrence_matrix, papers, is_error_matrix=False):
    year_papers = {}

    for paper in papers_metadata:
        if 'year' in papers_metadata[paper]:
            year = int(papers_metadata[paper]['year'])
            if year not in year_papers:
                year_papers[year] = []
            year_papers[year].append(paper)


    earliest = min(year_papers)
    latest = max(year_papers)
    span = latest-earliest+1

    for year in range(earliest, latest):
        if year not in year_papers:
            year_papers[year] = []

    year_papers = {k: v for k, v in sorted(year_papers.items(), key=lambda item: item[0])}

    if is_error_matrix:
        # convert any value != 0 to 1
        paper_instance_occurrence_matrix = np.where(paper_instance_occurrence_matrix != 0, 1, 0)

    # create a year_instance_occurence matrix from the paper_instance_occurrence_matrix
    year_instance_occurrence_matrix = np.zeros((span, paper_instance_occurrence_matrix.shape[1]), dtype=int)
    for yearID, year in enumerate(year_papers):
        for paper in year_papers[year]:
            if paper in papers:
                paperID = papers.index(paper)
                year_instance_occurrence_matrix[yearID] += paper_instance_occurrence_matrix[paperID]
    
    return year_instance_occurrence_matrix, year_papers

year_instance_occurrence_matrix, year_papers = create_year_paper_occurrence_matrix(papers_metadata, paper_instance_occurrence_matrix, papers)

# Setup Complete

We now have:

| Variable                          | Type    | Size         | Comments |
|-----------------------------------|---------|--------------|----------|
| error_instances                   | list    | 165          | Comments |
| error_matrix                      | ndarray | (999, 165)   | Comments |
| error_papers                      | list    | 999          | Comments |
| gap_too_large_threshold           | int     | n.a.         | Comments |
| instance_piece_gap                | dict    | 151          | Comments |
| instance_types_dicts              | dict    | 5            | Comments |
| instances                         | list    | 315          | Comments |
| paper_full_text                   | dict    | 1029         | Comments |
| paper_instance_occurrence_matrix  | ndarray | (1003, 315)  | Comments |
| papers                            | list    | 1003         | Comments |
| pos_in_paper                      | dict    | 1003         | Comments |

Consisting of:
* The paper_instance_occurrence_matrix, binary listing if a term (instance) is present in a paper
  * papers x instances
* The error_matrix, of all instances that were dropped from the paper_instance_occurrence_matrix
  * error_papers x error_instances

And some leftover variables:
* instance_types_dicts, listing all instance types ("process", "software", ...) and their respective instance sets ("Curation", "Knowledge Work", ...)
* paper_full_text, containing each papers full text
  * pos_in_paper, listing for each paper: for each instance: each position of that instance in that papers full text.
* instance_piece_gap, a dict listing all instances made up from compound words (e.g. "Knowledge Work", and their minimum distance in each papers full text)
  * gap_too_large_threshold, defining how far appart a finding of "Knowledge" and "Work" would qualify as "Knowledge Work"

In [10]:
# compare proximity of all instances with one antoher
def calculate_proximity_matrix(pos_in_paper, instances):

    # create a np zeros matrix of size instances x instances
    instance_instance_proximity_matrix = np.zeros((len(instances), len(instances)), dtype=float)

    mode = "sqrt"
    # alternatives are:
    # "sqrt" - 1 / (square root of the distance)
    # "linear" - 1 / distance
    # "binary" - 1 if distance < MAX_GAP_THRESHOLD, 0 otherwise
    # "log" - 1 / log(distance) 

    for paper in pos_in_paper:
        for id1, instance1 in enumerate(instances):
            for id2, instance2 in enumerate(instances):
                if instance1 != instance2:
                    if instance1 not in pos_in_paper[paper]:
                        continue
                    if instance2 not in pos_in_paper[paper]:
                        continue
                    positions1 = pos_in_paper[paper][instance1]
                    positions2 = pos_in_paper[paper][instance2]
                    if positions1 and positions2:
                        distance = find_min_distance([positions1, positions2])
                        if distance < 0:
                            # print(f"Error: {instance1} and {instance2} not found in {paper}")
                            continue
                        
                        result = 0.0
                        if distance == 0:
                            result = 1
                        elif mode == "sqrt":
                            result = 1 / np.sqrt(distance)
                        elif mode == "linear":
                            result = 1 / distance
                        elif mode == "binary":
                            result = 1 if distance < GAP_TOO_LARGE_THRESHOLD else 0
                        elif mode == "log":
                            result = 1 / np.log(distance)
                        else:
                            print("Error: unknown mode")
                            break
                        if result > 0.0:
                            instance_instance_proximity_matrix[id1][id2] += result

    #TODO rest doesnt seem to work, short fix implemented:
    # create a copy of labels that only contains instances that are in the proximity matrix

    instance_instance_proximity_matrix, deletions = remove_zeros(instance_instance_proximity_matrix)
    proximity_instances = handle_deletions(instances, deletions, rows=False)
    
    return instance_instance_proximity_matrix, proximity_instances

instance_instance_proximity_matrix, proximity_instances = calculate_proximity_matrix(pos_in_paper, instances)

# ToDo for quantitative analysis
* tf-idf only on terms
* arrange the papers on a timeline and identify the flow of:
  * Processes
  * File formats
  * software
  * ...
  * Compare this to goolge trends

In [11]:
from mlxtend.frequent_patterns import apriori

# AttributeError: 'numpy.ndarray' object has no attribute 'dtypes'
dataframe = pd.DataFrame(paper_instance_occurrence_matrix, columns=instances).astype(bool)

# for each process:
# create one res

res = apriori(dataframe, min_support=0.4, use_colnames=True, max_len=2)

# visualize res
res = res.sort_values(by='support', ascending=False)
res = res.reset_index(drop=True)
# res

In [12]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(res)
# sort rules by confidence
rules = rules.sort_values(by='confidence', ascending=False)
# rules = rules.sort_values(by='lift', ascending=False) # (propably most important)
# rules = rules.sort_values(by='leverage', ascending=False)
# export rules to csv

In [13]:
def identify_cross_type_rules(rules):
    cross_type = [False] * len(rules)

    for i, antecentent in enumerate(rules.antecedents):
        antecentent, = antecentent
        consequent, = rules.iloc[i].consequents
        type1, type2 = None, None
        for type in instance_types_dicts:
            if antecentent in instance_types_dicts[type]:
                type1 = type
            if consequent in instance_types_dicts[type]:
                type2 = type
            if type1 and type2:
                break
        if type1 != type2:
            cross_type[i] = True
            # print(rules.iloc[i])

    # create a copy for all rules that are cross type
    cross_type_rules = rules[cross_type].copy()
    return cross_type_rules

cross_type_rules = identify_cross_type_rules(rules)

# Output

## Functions

In [14]:
# represent a dict
import csv
import os
from itables import init_notebook_mode, show

# better represent dataframes
if not for_git:
    init_notebook_mode(all_interactive=True)
    
def process_dict(input_dict, filename="some_dict", path=None):
    # convert all sets to lists
    for key in input_dict:
        if isinstance(input_dict[key], set):
            input_dict[key] = list(input_dict[key])

    if path is None:
        path = get_output_path()
    filepath = os.path.join(path, filename)
    with open(filepath + '.json', 'w', encoding="utf-8") as f:
        json.dump(input_dict, f, ensure_ascii=False, indent=4)

        
    container = [
        ["Instance", "Min", "Max", "Mean", "Median", "Std"]
    ]

    for instance, papers in instance_piece_gap.items():

        # print(f"Instance: {instance}")
        gaps = papers.values()
        # generate all kinds of statistical values
        min_gap = min(gaps)
        max_gap = max(gaps)
        mean_gap = sum(gaps) / len(gaps)
        median_gap = np.median(list(gaps))
        std_gap = np.std(list(gaps))
        container.append([instance, min_gap, max_gap, mean_gap, median_gap, std_gap])

    filepath = os.path.join(path, filename)

    # TODO: Handle CSV separator
    # if not for_git:
    # Function to convert a single value
    # def convert_decimal_delimiter(value, decimal=CSV_DECIMAL):
    #     if isinstance(value, float):
    #         return f"{value}".replace('.', decimal)
    #     return value

    # # Convert all floats in your container to strings with the desired decimal delimiter
    # container = [[convert_decimal_delimiter(value) for value in row] for row in container]

    # write to csv
    with open(filepath + ".csv", 'w', newline='') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerows(container)   

def process_dataframe(input_df, name = "some_df", path=None):
    if path is None:
        path = get_output_path()
    filepath = os.path.join(path, name)
    
    input_df.to_csv(filepath + '.csv', sep=CSV_SEPARATOR, decimal=CSV_DECIMAL)
    show(input_df)

In [15]:
# visualize co-occurrences
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import math

def visualize_matrix(matrix: np.ndarray, rows: list[str], columns: list[str] = None, name: str = 'some_matrix', format = '.png', path = None) -> None:
    """
    Visualizes a matrix as a heatmap.
    matrix: The matrix to visualize
    rows: The labels for the rows
    columns: The labels for the columns
    name: The name of the file to save
    format: The format of the file to save (default: '.png', also accepts '.svg' and '.pdf', also accepts a list of formats)
    """
    if columns is None:
        columns = rows

    if path is None:
        path = get_output_path()

    ## Calculate the maximum size of the plot
    dpi = 300
    max_pixel = 2**16  # Maximum size in any direction
    max_size = max_pixel / dpi  # Maximum size in any direction
    max_size_total = max_size * max_size # Maximum size in total
    max_size_total *= 0.05 # produce smaller files

    # Experience value of space required per cell
    factor = 0.18
    size_x: float = 2 + len(columns) * factor
    size_y: float = 3 + len(rows) * 0.8 * factor

    while size_x * size_y < max_size_total and dpi < 600:
        dpi /= 0.95 
        max_size_total *= 0.95

    if dpi > 600:
        dpi = 600

    while size_x * size_y > max_size_total:
        dpi *= 0.95 
        max_size_total /= 0.95

    fig, ax = plt.subplots(figsize=(size_x, size_y), dpi=dpi)

    cax = ax.matshow(matrix, cmap='viridis')

    # use labels from instance_occurrences
    ax.set_xticks(range(len(columns)))
    ax.set_xticklabels(list(columns), fontsize=10, rotation=90)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(list(rows), fontsize=10)

    # # adjust the spacing between the labels
    # plt.gca().tick_params(axis='x', which='major', pad=15)
    # plt.gca().tick_params(axis='y', which='major', pad=15)

    # show the number of co-occurrences in each cell, if greater than 0
    for i in range(len(matrix)):
        for j in range(len(matrix[i])):
            if matrix[i, j] == 0:
                continue
            # if co_occurrences[i, j] > 100:
            #     continue
            plt.text(j, i, round(matrix[i, j], 2), ha='center', va='center', color='white', fontsize=4)

    # plt.show()
    fig.tight_layout()

    # title
    plt.title(name)

    if isinstance(format, list):
        for f in format:
            if f[0] != '.':
                f = '.' + f
            filepath = os.path.join(path, name + f)
            fig.savefig(filepath)
    else:
        if format[0] != '.':
            format = '.' + format
        filepath = os.path.join(path, name + format)
        fig.savefig(filepath)

def visualize_matrix_graph(matrix, instances, instance_types_dicts, name='some_matrix_graph', path=None):
    path = get_output_path(path)

    SEED = 17
    K_SPRRING = 18

    scale = len(instances) * .12
    # Create a new figure
    x = scale / 10 * 16
    y = scale / 10 * 9
    fig = plt.figure(figsize=(x, y))

    # normalize the proximity matrix
    matrix = matrix / matrix.max()

    mode = "sqrt"

    # alternatives are:
    # "linear" - take proximity as is
    # "sqrt" - sqrt(proximity)
    # "log" - log(proximity)
    if mode == "log":
        nodesize_map = [np.log(matrix[:, i].sum() + 1) for i in range(len(instances))]
    elif mode == "sqrt":
        nodesize_map = [np.sqrt(matrix[:, i].sum()) for i in range(len(instances))]
    elif mode == "linear":
        nodesize_map = [matrix[:, i].sum()for i in range(len(instances))]
    else:
        nodesize_map = [matrix[:, i].sum() for i in range(len(instances))]
        
    # print(max(nodesize_map))
    # print(min(nodesize_map))

    nodesize_map = np.array(nodesize_map) / max(nodesize_map) * 1000

    # print(max(nodesize_map))
    # print(min(nodesize_map))

    # take the root of the proximity matrix
    while np.min(matrix[np.nonzero(matrix)]) < 1/10:
        matrix = np.sqrt(matrix)

    # Create a graph from the proximity matrix
    G = nx.from_numpy_array(matrix)

    # Specify the layout
    pos = nx.spring_layout(G, seed=SEED, k=K_SPRRING/math.sqrt(G.order()))  # Seed for reproducibility

    color_map = []

    color = {
        "process": "#1f77b4",  # muted blue
        "software": "#ff7f0e",  # safety orange
        "data item": "#2ca02c",  # cooked asparagus green
        "data model": "#d62728",  # brick red
        "data format specification": "#9467bd",  # muted purple
        "interchange format": "#8c564b",  # chestnut brown
        # "source": "#e377c2",  # raspberry yogurt pink
    }

    for instance in instances:
        added = False
        for instance_type in instance_types_dicts:
            if instance in instance_types_dicts[instance_type]:
                color_map.append(color[instance_type])
                added = True
                break
        if not added:
            color_map.append("grey")

    # Draw the graph
    options = {
        "edge_color": "grey",
        "linewidths": 0.5,
        "width": 0.5,
        "with_labels": True,  # This will add labels to the nodes
        "labels": {i: label for i, label in enumerate(instances)},
        "node_color": color_map,
        "node_size": nodesize_map,
        # "edge_color": "white",
        # "alpha": 0.9,
    }

    # print(nx.is_weighted(G))


    # nx.set_edge_attributes(G, values = 1, name = 'weight')

    nx.draw(G, pos, **options, ax=fig.add_subplot(111))

    # Make the graph more spacious
    fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.9)

    # Create a patch for each color
    patches = [mpatches.Patch(color=color[key], label=key) for key in color]

    # Add the legend to the graph
    plt.legend(handles=patches, loc='upper right', fontsize='x-large')


    plt.show()

    # save plot to file
    fig.savefig(path + name + '.png')
    fig.savefig(path + name + '.svg')

    # nx.get_edge_attributes(G, 'weight')

def sankey(matrix, instances, instance_types_dicts, name='some_sankey', path = None):
    #TODO: Implement a method to create one graph per Process
    path = get_output_path(path)
    # Convert the proximity matrix into a list of source nodes, target nodes, and values
    sources = []
    targets = []
    values = []

    x_pos=[0] * len(instances)
    y_pos=[0] * len(instances)
    color_map=[0] * len(instances)

    max_types = len(instance_types_dicts)
    type_positions = [0.1 + (i / max_types) * 0.8 for i in range(max_types)]

    color = {
        "process": "#1f77b4",  # muted blue
        "software": "#ff7f0e",  # safety orange
        "data item": "#2ca02c",  # cooked asparagus green
        "data model": "#d62728",  # brick red
        "data format specification": "#9467bd",  # muted purple
        "interchange format": "#8c564b",  # chestnut brown
        # "source": "#e377c2",  # raspberry yogurt pink
    }
    color = list(color.values())

    space = {}

    for i in range(matrix.shape[0]):
        source_type = None

        for j in range(matrix.shape[1]):
            target_type = None
            
            for type_depth, type in enumerate(instance_types_dicts):
                if instances[i] in instance_types_dicts[type]:
                    source_type = type_depth
                if proximity_instances[j] in instance_types_dicts[type]:
                    target_type = type_depth

            # only keep directly forward moving connections
            if target_type - source_type != 1:
                continue

            # only keep forward moving connections
            if target_type - source_type <= 0:
                continue

            if source_type not in space:
                space[source_type] = {}
            if i not in space[source_type]:
                space[source_type][i] = 0
            space[source_type][i] += matrix[i][j]
            
            if target_type not in space:
                space[target_type] = {}
            if j not in space[target_type]:
                space[target_type][j] = 0
            space[target_type][j] += matrix[i][j]

            x_pos[i] = type_positions[source_type]
            x_pos[j] = type_positions[target_type]
            color_map[i] = color[source_type]
            color_map[j] = color[target_type]
            if matrix[i][j] > 0.0:  # Ignore zero values
                sources.append(i)
                targets.append(j)
                values.append(matrix[i][j])

    for type in space:
        sum_values = sum(space[type].values())
        space[type] = {k: v/sum_values for k, v in sorted(space[type].items(), key=lambda item: item[1], reverse=True)}

    # assign each instance a proper y position
    for type in space:
        bottom = 0.1
        for i, instance in enumerate(space[type]):
            y_pos[instance] = bottom
            bottom += space[type][instance] * 0.8

    nodes = dict(
        # pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=proximity_instances,  # Use your labels here
        color=color_map,
        x=x_pos,
        y=y_pos,
        align="right",
    )

    # Create a Sankey diagram
    fig = go.Figure(data=[go.Sankey(
        node=nodes,
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    )])


    fig.update_layout(width=1920, height=1080)


    fig.update_layout(title_text="Sankey Diagram", font_size=10)
    # fig.show()
    fig.write_image(path + name + '.png')
    fig.write_image(path + name + '.svg')
    fig.write_html(path + name + '.html')

# Represent a matrix
def process_matrix(matrix, rows, columns=None, name = 'some_matrix', visualize = True, path = None, instance_types_dicts = None):
    if columns is None:
        columns = rows
    if path is None:
        path = get_output_path()
    df = pd.DataFrame(matrix, columns=columns, index=rows)
    filepath = os.path.join(path, name)
    df.to_csv(filepath + '.csv', sep=CSV_SEPARATOR, decimal=CSV_DECIMAL)
    if visualize:
        if instance_types_dicts:
            sankey(matrix, rows, instance_types_dicts, name + '_sankey', path=path)
            visualize_matrix_graph(matrix, rows, instance_types_dicts, name + '_graph', path=path)
        visualize_matrix(matrix, rows, columns, name, path=path)

In [16]:
# visualize timeline
import numpy as np
import matplotlib.pyplot as plt
import math

def visualize_timeline(year_instance_occurrence_matrix, year_papers, instances, instance_types_dicts, name='some_timeline', path=None, recursion_depth=0, start_index=0, error_matrix=None, error_instances=None):
    path = get_output_path(path)
    years = list(year_papers.keys())
    max_papers = max([len(year_papers[year]) for year in years])
    yearly_papers = [len(year_papers[year]) for year in years]


    ALPHA_ERROR_LINE = 0.3
    ALPHA_ERROR_ZONE = 0.2
    ALPHA_PAPER_BAR = 0.3


    for type in instance_types_dicts:
        use = [instance in instance_types_dicts[type] for instance in instances]
        type_instances = [instance for instance, use_flag in zip(instances, use) if use_flag]
        total_occurrences = [np.sum(year_instance_occurrence_matrix[:, instances.index(instance)]) for instance in type_instances]
        type_instances_sorted = [x for _, x in sorted(zip(total_occurrences, type_instances), key=lambda pair: pair[0], reverse=True)]
        
        PARTITION_SIZE = 10
        # if error_instances is not None:
        #     PARTITION_SIZE = int(0.5 * PARTITION_SIZE)
        
        type_matrix = year_instance_occurrence_matrix[:, [instances.index(instance) for instance in type_instances_sorted]]
        factor = 1
        size_x = (2 + len(years) / 6) * factor
        size_y = (2 + max_papers / 15) * factor
        fig, ax = plt.subplots(figsize=(size_x, size_y), dpi=300)

        ax.set_xticks(range(len(years)))
        years_labels = [year if len(year_papers[year]) > 0 else '' for year in years]
        ax.set_xticklabels(years_labels, fontsize=10, rotation=90)
        
        step_size = max(1, math.ceil(max_papers / 10))
        ax.set_yticks(np.arange(0, max_papers + 1, step=step_size))
        ax.set_yticklabels([str(int(x)) for x in np.arange(0, max_papers + 1, step=step_size)], fontsize=10)

        plt.bar(range(len(years)), yearly_papers, color='black', alpha=ALPHA_PAPER_BAR, label=f"Total papers ({sum(yearly_papers)})", zorder=0)

        line_count = 0
        i = start_index
        while line_count < PARTITION_SIZE and i < len(type_instances_sorted):
            instance = type_instances_sorted[i]
            yearly_occurrences = type_matrix[:, i]
            i_total_occurrences = yearly_occurrences.sum()
            label = f"{instance} ({i_total_occurrences})"
            values = yearly_occurrences
            line = plt.plot(range(len(years)), values, label=label, zorder=3)[0]
            line_count += 1
            if error_matrix is not None and instance in error_instances:
                color = line.get_color()
                errors = error_matrix[:, error_instances.index(instance)]
                errors_plus = yearly_occurrences + errors
                line.set_label(f"{instance} ({i_total_occurrences}-{sum(errors_plus)})")
                # Plot the error as a half transparent line on top of the normal line
                plt.plot(range(len(years)), errors_plus, color=color, alpha=ALPHA_ERROR_LINE, label=f"{instance} (error)", zorder=2)
                line_count += 1
                # color in the area between the normal line and the error line
                plt.fill_between(range(len(years)), yearly_occurrences, errors_plus, color=color, alpha=ALPHA_ERROR_ZONE, zorder=1)
            i += 1
                
                # plt.scatter(range(len(years)), errors, color='red', label=f"{instance} (error)", zorder=1)
        stop_index = i

        plt.legend()

        plt.title(f"Timeline of {type} instances (#{start_index+1} to #{stop_index} of {len(type_instances_sorted)})")

        # Inset for relative values
        fig.canvas.draw()
        x_lim = ax.get_xlim()  # Get the current x-axis limits from the main plot

        bbox = ax.get_position()
        bb_left, bb_bottom = bbox.x0, bbox.y0
        bb_width, bb_height = bbox.width, bbox.height

        ax_inset = plt.axes([bb_left, 0.05, bb_width, 0.15], alpha=ALPHA_PAPER_BAR, facecolor='lightgrey')
        for i, instance in enumerate(type_instances_sorted[start_index:stop_index], start=start_index):
            yearly_occurrences = type_matrix[:, i]
            values_relative = [occurrences / papers if papers > 0 else 0 for occurrences, papers in zip(yearly_occurrences, yearly_papers)]
            line_relative = ax_inset.plot(range(len(years)), values_relative, label=f"{instance} (relative)", zorder=3)[0]

            # add the error part
            if error_matrix is not None and instance in error_instances:
                color = line_relative.get_color()
                errors = error_matrix[:, error_instances.index(instance)]
                errors_plus = yearly_occurrences + errors
                errors_relative = [error / papers if papers > 0 else 0 for error, papers in zip(errors_plus, yearly_papers)]
                if max(errors_relative) > 1:
                    print(f"Error: {instance} has a relative error > 1")
                    # throw an exception because this should never be the case:
                    # raise Exception(f"Error: relative {instance} occurence + error > 1")


                ax_inset.plot(range(len(years)), errors_relative, alpha=ALPHA_ERROR_LINE, color=color, label=f"{instance} (error, relative)", zorder=2)
                # color in the area between the normal line and the error line
                ax_inset.fill_between(range(len(years)), values_relative, errors_relative, alpha=ALPHA_ERROR_ZONE, color=color, zorder=1)
        
        ax_inset.set_xlim(x_lim)

        ax_inset.set_xticks([])
        ax_inset.set_yticks(np.arange(0, 1.1, step=0.5))
        ax_inset.set_yticklabels([f"{int(x*100)}%" for x in np.arange(0, 1.1, step=0.5)], fontsize=8)

        plt.subplots_adjust(bottom=0.3)

        start_string = f"{start_index+1}"
        stop_string = f"{stop_index}"

        # fill up with 0 to have a constant length
        start_string = "0" * (3 - len(start_string)) + start_string
        stop_string = "0" * (3 - len(stop_string)) + stop_string

        part_appendix = f"{start_string}_to_{stop_string}"
        plt.savefig(f"{path}{name}_{type.replace(' ', '_')}_{part_appendix}.png")
        plt.close()
        
        start_index = stop_index
        if start_index < len(type_instances_sorted):
            # if recursion_depth > 0:
            #     break
            visualize_timeline(year_instance_occurrence_matrix, year_papers, instances, {type: instance_types_dicts[type]}, name, path=path, recursion_depth=recursion_depth + 1, start_index=start_index, error_matrix=error_matrix, error_instances=error_instances)
        start_index = 0
            
if visualize:
    yearly_error_matrix, year_error_papers = create_year_paper_occurrence_matrix(papers_metadata, error_matrix, error_papers, is_error_matrix=True)
    visualize_timeline(year_instance_occurrence_matrix, year_papers, instances, instance_types_dicts, name="year_instance_occurrence_matrix", error_matrix=yearly_error_matrix, error_instances=error_instances) 

## Files

In [17]:
# All Dicts: instance_types_dicts, papers_metadata, instance_piece_gap
process_dict(instance_types_dicts, 'instance_types_dicts')
process_dict(papers_metadata, 'papers_metadata')
process_dict(instance_piece_gap, 'instance_piece_gaps')

### Rules

In [18]:
process_dataframe(rules, 'rules')
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
Loading ITables v2.1.1 from the internet... (need help?),,,,,,,,,,


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
299,(detailed design),(detailed),0.501493,0.690547,0.501493,1.000000,1.448127,0.155188,inf,0.620758
1,(system design),(design),0.926368,0.973134,0.926368,1.000000,1.027607,0.024888,inf,0.364865
350,(manufacturing),(design),0.472637,0.973134,0.472637,1.000000,1.027607,0.012698,inf,0.050943
386,(section description),(design),0.451741,0.973134,0.451741,1.000000,1.027607,0.012136,inf,0.049002
387,(section description),(description),0.451741,0.813930,0.451741,1.000000,1.228606,0.084055,inf,0.339383
...,...,...,...,...,...,...,...,...,...,...
366,(check),(processing),0.577114,0.732338,0.463682,0.803448,1.097100,0.041039,1.361788,0.209291
196,(database),(system integration),0.713433,0.722388,0.573134,0.803347,1.112072,0.057759,1.411686,0.351671
59,(system design),(evaluation),0.926368,0.781095,0.743284,0.802363,1.027229,0.019702,1.107614,0.359999
267,(product design),(knowledge engineering),0.653731,0.669652,0.524378,0.802131,1.197833,0.086606,1.669529,0.476968


In [19]:
process_dataframe(cross_type_rules, 'cross_type_rules')
cross_type_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
Loading ITables v2.1.1 from the internet... (need help?),,,,,,,,,,


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
386,(section description),(design),0.451741,0.973134,0.451741,1.000000,1.027607,0.012136,inf,0.049002
448,(expert systems),(design),0.427861,0.973134,0.426866,0.997674,1.025218,0.010500,11.552239,0.042992
474,(api),(design),0.414925,0.973134,0.413930,0.997602,1.025143,0.010152,11.202985,0.041920
442,(difficulty),(design),0.432836,0.973134,0.430846,0.995402,1.022883,0.009638,5.843284,0.039443
249,(word),(design),0.538308,0.973134,0.535323,0.994455,1.021909,0.011477,4.844776,0.046436
...,...,...,...,...,...,...,...,...,...,...
126,(knowledge base),(review),0.805970,0.761194,0.648756,0.804938,1.057468,0.035257,1.224258,0.280085
266,(system definition),(processing),0.652736,0.732338,0.525373,0.804878,1.099052,0.047349,1.371766,0.259529
195,(database),(review),0.713433,0.761194,0.574129,0.804742,1.057210,0.031069,1.223028,0.188836
196,(database),(system integration),0.713433,0.722388,0.573134,0.803347,1.112072,0.057759,1.411686,0.351671


### Paper x Instance

In [20]:
process_matrix(paper_instance_occurrence_matrix, rows=papers, columns=instances, name='paper_instance_occurrence_matrix', visualize=visualize)

In [21]:
process_matrix(error_matrix, rows=error_papers, columns=error_instances, name='error_matrix', visualize=visualize)

### Instance x Instance

In [22]:
process_matrix(instance_instance_co_occurrence_matrix, rows=instances, columns=instances, name='instance_instance_co_occurrence_matrix', visualize=visualize)

In [23]:
process_matrix(instance_instance_proximity_matrix, rows=proximity_instances, columns=proximity_instances, name='instance_instance_proximity_matrix', visualize=visualize, instance_types_dicts=instance_types_dicts)

# Later
* Word Embeedding
  * Find out, that jpeg and png are similar

# Much Later
Maybe, just maybe, make 
* Paper classes
* Subclasses of paper classes
* model which process is a subprocess of another process