# Evaluate Instance occurrence in Papers

## Setup

In [None]:
from bnw_tools.SLR.config import Config

# Settings for this notebook run. The plain dict is wrapped in a Config object
# at the end of the cell; later cells read values as attributes
# (e.g. config.review_path, config.only_included_papers).
config = {
    "for_git": True,
    "visualize": False,
    "csv_separator": ",",
    "csv_decimal": ".",
    ## Should only accepted papers be used for the analysis?
    "only_included_papers": True,
    ## Which instance columns actually indicate properties?
    "properties": ["source"],
    "proximity_mode": "sqrt",
    ## Paths
    "base_path": "data/",
    "subset_path": "data_subset/",
    "visualization_path": "visualization/",
    "ontology_path": "ontology/",
    "orkg_path": "ORKG/",
    # NOTE(review): the following paths are absolute and machine-specific.
    "folder_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR",
    "papers_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/02_nlp",
    "review_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/03_notes",
    "csv_file": "C:/workspace/borgnetzwerk/tools/scripts/SLR/data.csv",
    "obsidian_path": "ontology/obsidian/",
    ## Position in Paper settings
    "gap_too_large_threshold": 100,
    "savetime_on_fulltext": False,
    "try_to_save_time": False,
    "recalculate_pos_in_paper": False,
    "debug": True,
    ## Wikidata settings
    "wikidata_query_limit": 20,
    ## Graph settings
    "proximity_seed": 17,
    "proximity_k_spring": 18,
    "proximity_min_value": 0.1,
}


# config = Config(**config)
# The dict is passed as one positional argument (not unpacked into keywords).
config = Config(config)

## Functions

In [None]:
from bnw_tools.SLR.builder import *
# Load the Obsidian vault described by the config.
obsidian_folder = ObsidianFolder(config=config)

# Shallow copies of the folder's label -> object mappings.
classes = dict(obsidian_folder.classes.items())
instances = dict(obsidian_folder.instances.items())
papers = dict(obsidian_folder.papers.items())

# Class labels in priority order: an instance that lists several of them is
# filed under the one that appears first here.
order = ["process", "software", "data item", "data model", "data format specification"]
pos_in_order = {class_label: rank for rank, class_label in enumerate(order)}
instances_by_class = {class_label: {} for class_label in order}

for label, instance in instances.items():
    instance_of = instance.get("instance_of", [])
    if not instance_of:
        # No class information at all: silently left out.
        continue
    positions = [pos_in_order[c] for c in instance_of if c in pos_in_order]
    if positions:
        class_ = order[min(positions)]
        instances_by_class[class_][label] = instance
    else:
        # Has classes, but none of the ones listed in `order`.
        print(f"Instance {label} has no class other than {instance_of}")

In [None]:
# from bnw_tools.SLR.builder import *
# def get_papers(config):
#     papers = {}
#     for file in os.listdir(config.papers_path):
#         if file.endswith(".json"):
#             papers[file[:-5]] = {
#                 "nlp_path": path_cleaning(
#                     os.path.join(config.papers_path, file)
#                 )
#             }
#     return papers

# papers = get_papers(config)

#### Reduce to Reviewed papers

In [None]:
def sort_papers(papers, reverse=False):
    """Return a new dict with the same entries, ordered by each paper's ``year``.

    Papers without a ``year`` attribute are sorted as if their year were the
    string ``"9999"``.

    Args:
        papers: Mapping of paper label to paper object.
        reverse: Sort descending instead of ascending.

    Returns:
        A new dict; the input mapping is not modified.
    """

    def year_of(paper_label):
        return getattr(papers[paper_label], "year", "9999")

    ordered_labels = sorted(papers, key=year_of, reverse=reverse)
    return {paper_label: papers[paper_label] for paper_label in ordered_labels}

def reduce_to_reviewed_papers(papers, config):
    """Split papers into included/excluded based on their review notes.

    Every ``.md`` file in ``config.review_path`` whose stem is a key of
    ``papers`` is read. A file containing ``review_score:: 3``, ``4`` or ``5``
    marks the paper as included (and stores that score on the paper as
    ``review_score``); a file containing ``review_score:: 2``, ``1`` or ``0``
    marks it as excluded.

    Args:
        papers: Mapping of paper label to paper object.
        config: Object providing ``review_path`` and ``only_included_papers``.

    Returns:
        Tuple ``(papers, included_papers, excluded_papers)``, each sorted with
        ``sort_papers``. ``papers`` is reduced to the included papers when
        ``config.only_included_papers`` is truthy.
    """
    review_path = config.review_path
    # todo: sort by review score + average rank
    ## TODO: Make this a function that imports more data from the review files
    included_identifier = {
        3: "review_score:: 3",
        4: "review_score:: 4",
        5: "review_score:: 5",
    }
    excluded_identifier = {
        2: "review_score:: 2",
        1: "review_score:: 1",
        0: "review_score:: 0",
    }
    included_papers = {}
    excluded_papers = {}

    for file in os.listdir(review_path):
        if not file.endswith(".md"):
            continue
        paper_name = file[:-3]
        if paper_name not in papers:
            continue
        if paper_name in included_papers or paper_name in excluded_papers:
            continue
        paper = papers[paper_name]

        with open(os.path.join(review_path, file), "r", encoding="utf8") as f:
            content = f.read()

        # First matching identifier wins, in the order the dict lists them.
        matched_score = next(
            (score for score, text in included_identifier.items() if text in content),
            None,
        )
        if matched_score is not None:
            paper.review_score = matched_score
            included_papers[paper_name] = paper

        # Checked independently of the included markers above.
        if any(text in content for text in excluded_identifier.values()):
            excluded_papers[paper_name] = paper

    if config.only_included_papers:
        papers = dict(included_papers)
    papers = sort_papers(papers)
    if included_papers:
        included_papers = sort_papers(included_papers)
    if excluded_papers:
        excluded_papers = sort_papers(excluded_papers)
    return papers, included_papers, excluded_papers

# Classify the loaded papers by review score; with only_included_papers set,
# `papers` is narrowed to the included ones.
papers, included_papers, excluded_papers = reduce_to_reviewed_papers(papers, config)

## Instance Setup done.
Proceeding to:

## Matrix calculations

### Instance Occurrence Matrix

In [None]:
# def count_occurrences( papers, instances):
#     occurrences = np.zeros((len(papers), len(instances)), dtype=int)

#     for p, paperpath in enumerate(papers.values()):
#         if isinstance(paperpath, dict) or isinstance(paperpath, Instance):
#             paperpath = paperpath.get("nlp_path", None)
#         with open(paperpath, "r", encoding="utf8") as f:
#             paper = json.load(f)
#             for i, instance in enumerate(instances):
#                 present = True
#                 pieces = split_string(instance)
#                 for piece in pieces:
#                     if piece.lower() not in paper["bag_of_words"]:
#                         present = False
#                         break

#                 if present:
#                     occurrences[p][i] = 1
#     return occurrences


def count_occurrences(papers, instances:dict[str,Instance]):
    """Build a 0/1 matrix telling which instance is mentioned in which paper.

    For every paper the NLP JSON file is loaded. An instance counts as present
    when, for at least one of its names (label, dict key or alias), every piece
    returned by ``split_string`` is contained (lower-cased) in the paper's
    ``"bag_of_words"``.

    Args:
        papers: Mapping of paper label to a path, or to a dict/Instance whose
            ``"nlp_path"`` entry is the path of the JSON file.
        instances: Mapping of instance label to Instance.

    Returns:
        Integer DataFrame with papers as rows and instances as columns.

    Raises:
        ValueError: If an instance yields ``None`` as one of its names.
    """
    df = pd.DataFrame(0, index=papers.keys(), columns=instances.keys(), dtype=int)

    for paper_id, paperpath in papers.items():
        if isinstance(paperpath, (dict, Instance)):
            paperpath = paperpath.get("nlp_path", None)
        with open(paperpath, "r", encoding="utf8") as f:
            paper = json.load(f)
            for instance_label, instance in instances.items():
                # All names under which the instance may appear in a text.
                candidates = {
                    instance.get("label", None),
                    instance_label,
                    *instance.get("aliases", []),
                }
                if None in candidates:
                    print("Instance without label found:")
                    print(json.dumps(instance.__dict__, indent=4))
                    raise ValueError("Instance without label found")

                # One fully matching name is enough.
                if any(
                    all(
                        piece.lower() in paper["bag_of_words"]
                        for piece in split_string(name)
                    )
                    for name in candidates
                ):
                    df.at[paper_id, instance_label] = 1
    return df

# Paper x instance presence matrix (1 = instance found in the paper's bag of words).
occurrence_matrix = count_occurrences(papers, instances)

In [None]:
def remove_zeros_np(matrix, columns=True, rows=True):
    """Drop all-zero columns and/or rows from a 2-D NumPy array.

    Columns are removed first; rows are then evaluated on the column-reduced
    array.

    Args:
        matrix: 2-D array.
        columns: Remove columns that contain only zeros.
        rows: Remove rows that contain only zeros.

    Returns:
        Tuple ``(matrix, [deleted_columns, deleted_rows])``. Each deletion
        entry is a boolean mask over the respective axis, or an empty array
        when that axis was not processed.
    """
    deleted_columns, deleted_rows = np.array([]), np.array([])

    if columns:
        deleted_columns = np.all(matrix == 0, axis=0)
        matrix = matrix[:, ~deleted_columns]

    if rows:
        deleted_rows = np.all(matrix == 0, axis=1)
        matrix = matrix[~deleted_rows]

    return matrix, [deleted_columns, deleted_rows]


def reorder_matrix_np(matrix: np.ndarray, new_order, cols=True):
    """Reorder the columns (or rows) of an array, then drop all-zero lines.

    Args:
        matrix: 2-D array.
        new_order: Integer positions giving the new order of the axis.
        cols: Reorder columns when true, rows otherwise.

    Returns:
        Tuple ``(matrix, deletions)`` as returned by ``remove_zeros_np``.
        (Previously the result was computed and then discarded, so the
        function always returned ``None``.)
    """
    if cols:
        matrix = matrix[:, new_order]
    else:
        matrix = matrix[new_order, :]
    return remove_zeros_np(matrix)

In [None]:
def remove_zeros(matrix:pd.DataFrame, columns=True, rows=True):
    """Drop all-zero columns and/or rows from a DataFrame.

    Columns are handled first; rows are then evaluated on the column-reduced
    frame.

    Args:
        matrix: Frame to reduce.
        columns: Remove columns whose values are all zero.
        rows: Remove rows whose values are all zero.

    Returns:
        Tuple ``(matrix, [deleted_columns, deleted_rows])`` where the deletion
        entries hold the removed labels (or an empty array when the axis was
        not processed).
    """
    deleted_columns = np.array([])
    deleted_rows = np.array([])

    if columns:
        zero_columns = (matrix == 0).all(axis=0)
        deleted_columns = matrix.columns[zero_columns]
        matrix = matrix.loc[:, ~zero_columns]

    if rows:
        zero_rows = (matrix == 0).all(axis=1)
        deleted_rows = matrix.index[zero_rows]
        matrix = matrix.loc[~zero_rows, :]

    return matrix, [deleted_columns, deleted_rows]

def reorder_matrix(matrix:pd.DataFrame, new_order, cols=True):
    """Reorder a DataFrame positionally, then strip all-zero rows and columns.

    Args:
        matrix: Frame to reorder.
        new_order: Integer positions giving the new order of the axis.
        cols: Reorder columns when true, rows otherwise.

    Returns:
        Tuple ``(matrix, deletions)`` as produced by ``remove_zeros``.
    """
    reordered = matrix.iloc[:, new_order] if cols else matrix.iloc[new_order, :]
    reduced, deletions = remove_zeros(reordered)
    return reduced, deletions

def sort_instances(instances, instances_by_class, matrix:pd.DataFrame):
    """Order the matrix columns by class, then by descending occurrence count.

    Side effect: writes the per-instance occurrence totals (instances with at
    least one occurrence, most frequent first) to ``instance_occurrences.json``
    in ``config.get_output_path()``.

    Only instances that occur at least once AND belong to one of the classes
    in ``instances_by_class`` are kept as columns.

    Args:
        instances: Mapping of instance label to instance; its key order must
            match the column order of ``matrix``.
        instances_by_class: Mapping of class label to the instances filed
            under it; the key order defines the class order of the columns.
        matrix: Paper x instance occurrence matrix.

    Returns:
        Tuple ``(instances, matrix)``. ``matrix`` is reordered and stripped of
        all-zero rows/columns.
        NOTE(review): ``instances`` is returned unchanged (neither sorted nor
        reduced) -- confirm whether callers expect the sorted subset instead.
    """
    # Column position of every instance in the incoming matrix.
    indexed_instances = {label: i for i, label in enumerate(instances)}

    instance_occurrences = {label: sum(matrix[label]) for label in instances}

    # Most frequent first; float() makes NumPy integers JSON-serializable.
    sorted_instances = {
        k: float(v)
        for k, v in sorted(
            instance_occurrences.items(), key=lambda item: item[1], reverse=True
        )
        if v > 0
    }

    filepath = os.path.join(config.get_output_path(), "instance_occurrences")
    with open(filepath + ".json", "w", encoding="utf-8") as f:
        json.dump(sorted_instances, f, ensure_ascii=False, indent=4)

    # Group by class (in class order), keeping the frequency order inside
    # each class.
    type_sorted_instances = [
        label
        for class_members in instances_by_class.values()
        for label in sorted_instances
        if label in class_members
    ]

    # Build the order from exactly the instances that were placed. The former
    # zero-padded list left a stray 0 for every counted instance without a
    # class, which duplicated column 0 in the reordered matrix. dtype=int
    # keeps an empty order usable as a positional indexer.
    new_order = np.array(
        [indexed_instances[label] for label in type_sorted_instances], dtype=int
    )

    matrix, deletions = reorder_matrix(matrix, new_order)

    return instances, matrix

# Reorder the occurrence matrix by class and frequency; also writes instance_occurrences.json.
instances, occurrence_matrix = sort_instances(instances, instances_by_class, occurrence_matrix)

In [None]:
# Show the reordered occurrence matrix as the cell output.
occurrence_matrix

In [None]:
# get all text files
def get_paper_full_text(directory, papers:dict[str,Instance]=None):
    """Find the full-text ``.txt`` file of each paper below ``directory``.

    Every sub-folder of ``directory`` contributes at most one file: the first
    ``.txt`` entry returned by ``os.listdir``. The file name without the
    extension is used as the paper label.

    Args:
        directory: Folder containing one sub-folder per paper.
        papers: Optional mapping of paper label to paper object. When given
            (and non-empty), every paper gets a ``full_text_path`` attribute
            (``None`` if no text file was found for its label).

    Returns:
        ``papers`` when it was supplied and non-empty, otherwise a dict
        mapping paper label to text file path.
    """
    paper_full_text = {}
    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue
        first_txt = next(
            (name for name in os.listdir(folder_path) if name.endswith(".txt")),
            None,
        )
        if first_txt is not None:
            paper_full_text[first_txt[:-4]] = os.path.join(folder_path, first_txt)

    if not papers:
        return paper_full_text

    for paper_label, paper in papers.items():
        setattr(paper, "full_text_path", paper_full_text.get(paper_label, None))
    return papers

# Attach a `full_text_path` attribute to every paper.
# NOTE(review): absolute, machine-specific path; not taken from `config`.
papers = get_paper_full_text(
    "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/00_PDFs", papers
)

In [None]:
# def replace_nan(df:pd.DataFrame, col, what):
#     # if isinstance(col, list):
#     #     for c in col:
#     #         replace_nan(df, c, what)
#     #     return df
#     # nans = df[col].isnull()
#     # df.loc[nans, col] = [what for isnan in nans.values if isnan]
#     idx = df.isna()
#     if idx.empty:
#         return df
#     df.where(idx, what, inplace=True)
#     # df.loc[idx, col] = [what] * idx.sum()
#     # df.iloc[idx, col] = what
#     return df

from ast import literal_eval


def replace_nan_with_empty_list(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which every missing cell is a new empty list.

    Cells that already hold a list (or any other non-scalar) are passed through
    untouched. Calling ``pd.isna`` on such a cell yields an array, whose truth
    value raises for more than one element, so only scalars are tested.

    Args:
        df: Frame whose cells are either missing or hold arbitrary objects.

    Returns:
        New DataFrame of the same shape; each former NaN/None cell gets its own
        list object.
    """

    def fill(cell):
        if pd.api.types.is_scalar(cell) and pd.isna(cell):
            return []
        return cell

    # DataFrame.map only exists in newer pandas; applymap is its older name.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    return elementwise(fill)


class PosInPaper:
    """Locate instance names in paper full texts and measure word proximity.

    Every instance name is split into words. For each paper the character
    positions of those words are collected, and for each word combination the
    smallest gap between its words is computed. Distances use ``-1`` for
    "combination not present in the paper".

    State is written to / read from the relative ``data/`` folder:
    ``data/pos_in_paper.json`` for plain attributes and one file per non-empty
    DataFrame.
    """

    def __init__(
        self,
        config,
        papers: "dict[str, Instance]" = None,
        instances: "dict[str, Instance]" = None,
        save=True,
    ):
        """Create the tracker, restore saved state and register the inputs.

        Args:
            config: Object optionally providing ``search_in_text_mode``
                (defaults to ``"lower"``).
            papers: Papers to register (label -> paper).
            instances: Instances to register (label -> instance).
            save: Write the state to disk at the end of construction.
        """
        self.mode = getattr(config, "search_in_text_mode", "lower")
        self.words = []
        self.papers: "dict[str, Instance]" = {}
        self.instances: "dict[str, Instance]" = {}

        # word -> set of text snippets in which the word was found
        self.matches = {}

        # paper x word; each cell is a list of [position, word] entries
        self.word_occurrences_in_papers = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.words, dtype=object
        )

        # frozenset of words -> running id (insertion order)
        self.word_combinations = {}
        self.word_combination_min_distance = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.word_combinations.keys(), dtype=int
        )

        # instance label -> ids of the word combinations of its names
        self.instance_word_combinations = {k: [] for k in self.instances.keys()}
        self.instance_min_distance_in_papers = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.instances.keys(), dtype=int
        )

        self.load()

        if papers:
            self.update_papers(papers)
        if instances:
            self.update_instances(instances)

        if save:
            self.save()

    def save(self):
        """Write the current state below ``data/``.

        Non-empty DataFrames go to ``data/<attribute>.json`` when their first
        cell is a list and to ``data/<attribute>.csv`` otherwise. All other
        attributes except ``papers`` and ``instances`` are collected into
        ``data/pos_in_paper.json``.
        """
        os.makedirs("data", exist_ok=True)
        data = {}
        for key, value in self.__dict__.items():
            if key in ["papers", "instances"]:
                continue
            if isinstance(value, pd.DataFrame):
                if value.empty:
                    continue
                # Best effort: a frame that cannot be written must not prevent
                # the remaining state from being saved.
                try:
                    if isinstance(value.iloc[0, 0], list):
                        parsed = json.loads(value.to_json(orient="split"))
                        with open(f"data/{key}.json", "w", encoding="utf-8") as f:
                            json.dump(parsed, f, ensure_ascii=False)
                    else:
                        with open(f"data/{key}.csv", "w", encoding="utf-8") as f:
                            value.to_csv(f, lineterminator='\n')
                except Exception as e:
                    print(f"Error saving {key}: {e}")
                continue
            if key == "word_combinations":
                # Must be tested before the generic dict branch (it used to be
                # unreachable behind it). The keys are frozensets, which JSON
                # cannot use as keys, so they are stored as a list of word
                # lists in id order -- the format load() rebuilds the ids from.
                data[key] = [list(x) for x in value]
            elif isinstance(value, dict):
                # Only set and Instance values are persisted; other values
                # (e.g. the id lists of instance_word_combinations) are
                # dropped and rebuilt by update_instances().
                data[key] = {}
                for k, v in value.items():
                    if isinstance(v, set):
                        data[key][k] = list(v)
                    elif isinstance(v, Instance):
                        data[key][k] = v.__dict__
            else:
                data[key] = value
        with open("data/pos_in_paper.json", "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    def load(self):
        """Restore state previously written by ``save``.

        Values already present in memory are not replaced, except ``matches``,
        into which the saved snippets are merged.
        """
        # TODO: Load Dataframes, sets, Instance, dicts of anything
        # Iterate over a snapshot because attributes are reassigned below.
        for key, value in list(self.__dict__.items()):
            if isinstance(value, pd.DataFrame):
                if value.empty:
                    # NOTE(review): empty frames are skipped, so a freshly
                    # constructed object never restores its frames. The
                    # non-DataFrame branch below uses the opposite rule (load
                    # when empty). Guard kept as-is pending the TODO above.
                    continue
                if os.path.exists(f"data/{key}.csv"):
                    loaded = pd.read_csv(f"data/{key}.csv", index_col=0)
                    for col in loaded.columns:
                        # Only text cells can hold serialized Python literals.
                        loaded[col] = loaded[col].apply(
                            lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
                        )
                    setattr(self, key, loaded)
                elif os.path.exists(f"data/{key}.json"):
                    with open(f"data/{key}.json", "r", encoding="utf-8") as f:
                        setattr(self, key, pd.read_json(f, orient="split"))
            else:
                if value:
                    # do not overwrite
                    continue
                elif os.path.exists(f"data/{key}.json"):
                    with open(f"data/{key}.json", "r", encoding="utf-8") as f:
                        setattr(self, key, json.load(f))

        if not os.path.exists("data/pos_in_paper.json"):
            return
        with open("data/pos_in_paper.json", "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error loading data: {e}")
                return

        for key, value in data.items():
            if not hasattr(self, key):
                # Unknown entry, e.g. written by a different version.
                continue
            if key == "word_combinations":
                # Saved as a list of word lists; the list position is the id.
                value = {frozenset(x): i for i, x in enumerate(value)}
            current = getattr(self, key)
            if isinstance(current, pd.DataFrame):
                continue
            if isinstance(current, dict):
                if key in ["papers", "instances"]:
                    continue
                if key == "matches":
                    for k, v in value.items():
                        self.matches.setdefault(k, set()).update(v)
                    continue
            if not current:
                setattr(self, key, value)

    def reindex(self):
        """Align the three DataFrames with the current papers, words,
        word combinations and instances (new cells become NaN)."""
        self.word_occurrences_in_papers = self.word_occurrences_in_papers.reindex(
            self.papers.keys(),
            columns=self.words
        )
        self.word_combination_min_distance = self.word_combination_min_distance.reindex(
            self.papers.keys(),
            columns=self.word_combinations.keys()
        )
        self.instance_min_distance_in_papers = self.instance_min_distance_in_papers.reindex(
            self.papers.keys(),
            columns=self.instances.keys()
        )

    def update_papers(self, papers: "dict[str, Instance]"):
        """Register papers, keep them ordered via ``sort_papers`` and reindex."""
        self.papers.update(papers)
        self.papers = sort_papers(self.papers)
        self.reindex()

    def update_instances(self, instances: "dict[str, Instance]"):
        """Register instances together with the word combinations of their names.

        Every name of an instance (``get_all_names()`` plus its dict key) is
        split into words; each distinct word set gets an id in
        ``word_combinations`` and is linked to the instance.
        """
        self.instances.update(instances)
        for instance_label, instance in instances.items():
            candidates = instance.get_all_names().union([instance_label])
            candidate_words = self.update_words(candidates)
            for words in candidate_words.values():
                frozenset_words = frozenset(words)
                # setdefault evaluates the fallback id before inserting, so a
                # new combination receives the next free id.
                pos = self.word_combinations.setdefault(
                    frozenset_words, len(self.word_combinations)
                )
                linked = self.instance_word_combinations.setdefault(instance_label, [])
                if pos not in linked:
                    linked.append(pos)
        self.reindex()

    def update_words(self, words):
        """Split the given names into words and register unknown ones.

        Args:
            words: Iterable of names (may consist of several words each).

        Returns:
            Dict mapping each name to the list of its words (lower-cased when
            ``mode`` is ``"lower"``).
        """
        res = {candidate: [] for candidate in words}
        known = set(self.words)
        for candidate in words:
            for word in split_string(candidate):
                if self.mode == "lower":
                    word = word.lower()
                if word not in known:
                    known.add(word)
                    self.words.append(word)
                res[candidate].append(word)
        self.words.sort()
        # Only add missing entries: replacing every set would throw away the
        # snippets restored by load() or collected earlier.
        for word in self.words:
            self.matches.setdefault(word, set())
        return res

    def find_occurrences_in_texts(self, save=True):
        """Collect the character positions of every word in every full text.

        Papers without a ``full_text_path`` are skipped, as are paper/word
        cells that already contain occurrences. Each hit is stored as
        ``[position, word]``; the surrounding text up to the neighbouring
        spaces is added to ``matches[word]``.

        Args:
            save: Write the state to disk afterwards.
        """
        self.word_occurrences_in_papers = replace_nan_with_empty_list(
            self.word_occurrences_in_papers
        )
        for paper_label, paper in self.papers.items():
            if not (hasattr(paper, "full_text_path") and paper.full_text_path):
                continue
            with open(paper.full_text_path, "r", encoding="utf8") as f:
                full_text = f.read()
            if self.mode == "lower":
                full_text = full_text.lower()

            # TODO: Work with the idea of word based lookup, not character based lookup
            # If we look up words, we lose "Wolfram&Heart" occurrences of "&" and "engineer" in "engineers"
            # If we look up characters, we need to keep track of the length of the words to have a meaningful distance

            for word in self.words:
                if self.word_occurrences_in_papers.at[paper_label, word]:
                    continue
                if self.mode == "lower":
                    word = word.lower()
                pos = full_text.find(word)
                while pos != -1:
                    self.word_occurrences_in_papers.at[
                        paper_label, word
                    ].append([pos, word])

                    next_space = full_text.find(" ", pos + 1)
                    previous_space = full_text.rfind(" ", 0, pos)
                    # find/rfind return -1 when there is no space; using -1 as
                    # a slice bound would cut from the wrong end of the text.
                    start = previous_space if previous_space != -1 else 0
                    end = next_space if next_space != -1 else len(full_text)
                    self.matches[word].add(full_text[start:end])

                    pos = full_text.find(word, pos + 1)
        if save:
            self.save()

    def find_min_distance_by_id(self, paper_label, word_combination):
        """Return the smallest gap between the words of a combination in a paper.

        The result is cached in ``word_combination_min_distance``; a cached
        value other than NaN or ``-2`` is returned directly.

        Args:
            paper_label: Row label of the paper.
            word_combination: Frozenset of words (a column of
                ``word_combination_min_distance``).

        Returns:
            ``-1`` if any word is missing from the paper, ``0`` for a
            single-word combination that occurs, otherwise the smallest span
            (latest start minus earliest start, minus the length of the
            earliest word), floored at 0.
        """
        distance = self.word_combination_min_distance.at[paper_label, word_combination]

        if distance == -1:
            # word combination not found in paper
            return -1
        if not (distance == -2 or pd.isna(distance)):
            return distance

        words = list(word_combination)
        if len(words) == 1:
            found = bool(self.word_occurrences_in_papers.at[paper_label, words[0]])
            result = 0 if found else -1
            self.word_combination_min_distance.at[paper_label, word_combination] = result
            return result

        inputs = [
            self.word_occurrences_in_papers.at[paper_label, word] for word in words
        ]
        if not all(inputs):
            # at least one word never occurs in this paper
            self.word_combination_min_distance.at[paper_label, word_combination] = -1
            return -1

        # Occurrences carry the word itself; map it to its slot in `indices`.
        word_slot = {word: i for i, word in enumerate(words)}
        lit_len = [len(word) for word in words]

        # Current position per word, starting at each word's first occurrence.
        indices = [lst[0][0] for lst in inputs]
        best = float("inf")

        # Walk all occurrences in text order and move the respective word to
        # that position. Every occurrence has to be applied: skipping those
        # that are not already in `indices` (as before) only ever compared the
        # first occurrences and missed closer pairs later in the text.
        for position, word in sorted(occ for lst in inputs for occ in lst):
            indices[word_slot[word]] = position
            arr_min = min(indices)
            best = min(max(indices) - arr_min - lit_len[indices.index(arr_min)], best)
            if best <= 0:
                best = 0
                break
        self.word_combination_min_distance.at[paper_label, word_combination] = best

        return best

    def find_all_combinations(self, save=True):
        """Compute the minimum distance of every word combination in every paper.

        Args:
            save: Write the state to disk afterwards.
        """
        for paper_label in self.papers:
            for combination in self.word_combinations:
                self.find_min_distance_by_id(paper_label, combination)
        if save:
            self.save()

    def update_instance_min_distances(self, save=True):
        """Store, per paper and instance, the best distance over all its names.

        The value is the smallest distance among the instance's word
        combinations that occur in the paper, or ``-1`` if none occurs.

        Args:
            save: Write the state to disk afterwards.
        """
        combination_list = list(self.word_combinations.keys())
        for paper_label in self.papers:
            for instance_label, combinations in self.instance_word_combinations.items():
                dist = -1
                for combination_id in combinations:
                    distance = self.find_min_distance_by_id(
                        paper_label, combination_list[combination_id]
                    )
                    if distance == -1:
                        # -1 means "not found"; it must not beat a real distance.
                        continue
                    if dist == -1 or distance < dist:
                        dist = distance
                self.instance_min_distance_in_papers.at[paper_label, instance_label] = dist
        if save:
            self.save()
# Build the position tracker for the reviewed papers and the instances.
# With the default save=True the constructor also writes its state to data/.
pos_in_paper = PosInPaper(config, papers, instances)

In [None]:
# Scan every available full text for all registered words (no write to disk yet).
pos_in_paper.find_occurrences_in_texts(save=False)

In [None]:
# Compute the minimum word distance for every paper / word combination pair.
pos_in_paper.find_all_combinations(save=False)

In [None]:
# Reduce the combination distances to one value per paper and instance.
pos_in_paper.update_instance_min_distances(save=False)

In [None]:
# Show the paper x instance distance matrix as the cell output.
pos_in_paper.instance_min_distance_in_papers

In [None]:
# Persist the results once, after all three computation steps have run.
pos_in_paper.save()

### Error Matrix

In [None]:
# NOTE(review): neither `director` nor `paper_full_text` is assigned in the
# visible cells of this notebook (get_paper_full_text() above returns `papers`
# when papers are passed) -- this cell presumably stems from an earlier,
# director-based version; confirm before running.
director.paper_full_text = paper_full_text

# Build and save the error matrix from the position data, then re-sort the
# instances and save the occurrence matrix held by the director.
director.builder["error_matrix_builder"] = ErrorMatrixBuilder(director, pos_in_paper)
director.builder["error_matrix_builder"].build()
director.builder["error_matrix_builder"].save()
director.sort_instances()
director.builder["occurrence_matrix"].save()


In [None]:
def reduce_to_existing(input_dict, matrix, axis=0, name=""):
    """Keep only the dict entries whose key is a label of the matrix.

    Args:
        input_dict: Mapping to filter.
        matrix: DataFrame providing the valid labels.
        axis: ``0`` compares against the columns (likely instances), any other
            value against the index (likely papers).
        name: Word used in the printed message; defaults to ``"columns"`` or
            ``"rows"`` depending on ``axis``.

    Returns:
        A new, filtered dict. Prints how many entries were removed, if any.
    """
    size_before = len(input_dict)
    if not name:
        name = ["columns", "rows"][axis]

    valid_labels = matrix.columns if axis == 0 else matrix.index
    kept = {key: value for key, value in input_dict.items() if key in valid_labels}

    removed = size_before - len(kept)
    if removed:
        print(f"Removed {removed} {name}")
    return kept

# Paper count before the reduction (not used again in the visible cells).
before_papers = len(papers)

# Drop instances/papers that no longer have a column/row in the occurrence matrix.
instances = reduce_to_existing(instances, occurrence_matrix, axis=0, name="instances")
papers = reduce_to_existing(papers, occurrence_matrix, axis=1, name="papers")

### Instance_instance Co-occurrence Matrix

In [None]:
# Product of the transposed occurrence matrix with itself. With a 0/1
# paper x instance matrix, entry [i, j] is the number of papers containing both
# instance i and instance j (the diagonal holds each instance's paper count).
# NOTE(review): `director` is not defined in the visible cells -- confirm its origin.
instance_instance_co_occurrence_matrix = np.dot(
    director.builder["occurrence_matrix"].matrix.T, director.builder["occurrence_matrix"].matrix
)

In [None]:
# Divide by the diagonal. The 1-D diagonal broadcasts along the last axis, so
# entry [i, j] is divided by the diagonal value of column j.
# NOTE(review): a zero on the diagonal leads to a division by zero here.
instance_instance_relative_co_occurrence_matrix = (
    instance_instance_co_occurrence_matrix
    / np.diag(instance_instance_co_occurrence_matrix)
)

## Additional Visualizations

In [None]:
# visualize timeline
import numpy as np
import matplotlib.pyplot as plt
import math


def visualize_timeline(
    config: Config,
    year_instance_occurrence_matrix,
    year_papers,
    instances,
    instance_types_dicts,
    name="some_timeline",
    path=None,
    recursion_depth=0,
    start_index=0,
    error_matrix=None,
    error_instances=None,
):
    if not path:
        path = config.get_output_path(path, visualization=True)
    years = list(year_papers.keys())
    max_papers = max([len(year_papers[year]) for year in years])
    yearly_papers = [len(year_papers[year]) for year in years]

    ALPHA_ERROR_LINE = 0.3
    ALPHA_ERROR_ZONE = 0.2
    ALPHA_PAPER_BAR = 0.3

    for type in instance_types_dicts:
        use = [instance in instance_types_dicts[type] for instance in instances]
        type_instances = [
            instance for instance, use_flag in zip(instances, use) if use_flag
        ]
        total_occurrences = [
            np.sum(year_instance_occurrence_matrix[:, instances.index(instance)])
            for instance in type_instances
        ]
        type_instances_sorted = [
            x
            for _, x in sorted(
                zip(total_occurrences, type_instances),
                key=lambda pair: pair[0],
                reverse=True,
            )
        ]

        PARTITION_SIZE = 10
        # if error_instances is not None:
        #     PARTITION_SIZE = int(0.5 * PARTITION_SIZE)

        type_matrix = year_instance_occurrence_matrix[
            :, [instances.index(instance) for instance in type_instances_sorted]
        ]
        factor = 1
        size_x = (2 + len(years) / 6) * factor
        size_y = (2 + max_papers / 15) * factor
        size_y_2 = (2 + PARTITION_SIZE / 2) * factor
        size_y = max(size_y, size_y_2)
        fig, ax = plt.subplots(figsize=(size_x, size_y), dpi=300)

        ax.set_xticks(range(len(years)))
        years_labels = [year if len(year_papers[year]) > 0 else "" for year in years]
        ax.set_xticklabels(years_labels, fontsize=10, rotation=90)

        step_size = max(1, math.ceil(max_papers / 10))
        ax.set_yticks(np.arange(0, max_papers + 1, step=step_size))
        ax.set_yticklabels(
            [str(int(x)) for x in np.arange(0, max_papers + 1, step=step_size)],
            fontsize=10,
        )

        # set y axis label
        ax.set_ylabel("absolute", fontsize=10)

        plt.bar(
            range(len(years)),
            yearly_papers,
            color="black",
            alpha=ALPHA_PAPER_BAR,
            label=f"Total papers ({sum(yearly_papers)})",
            zorder=0,
        )

        line_count = 0
        i = start_index
        while line_count < PARTITION_SIZE and i < len(type_instances_sorted):
            instance = type_instances_sorted[i]
            yearly_occurrences = type_matrix[:, i]
            i_total_occurrences = yearly_occurrences.sum()
            label = f"{instance} ({i_total_occurrences})"
            values = yearly_occurrences
            line = plt.plot(range(len(years)), values, label=label, zorder=3)[0]
            line_count += 1
            if error_matrix is not None and instance in error_instances:
                color = line.get_color()
                errors = error_matrix[:, error_instances.index(instance)]
                errors_plus = yearly_occurrences + errors
                line.set_label(f"{instance} ({i_total_occurrences}-{sum(errors_plus)})")
                # Plot the error as a half transparent line on top of the normal line
                plt.plot(
                    range(len(years)),
                    errors_plus,
                    color=color,
                    alpha=ALPHA_ERROR_LINE,
                    label=f"{instance} (w/o proximity)",
                    zorder=2,
                )
                line_count += 1
                # color in the area between the normal line and the error line
                plt.fill_between(
                    range(len(years)),
                    yearly_occurrences,
                    errors_plus,
                    color=color,
                    alpha=ALPHA_ERROR_ZONE,
                    zorder=1,
                )
            i += 1

            # plt.scatter(range(len(years)), errors, color='red', label=f"{instance} (error)", zorder=1)
        stop_index = i

        plt.legend()

        plt.title(
            f"Number of papers covering {type} instances (#{start_index+1} to #{stop_index} of {len(type_instances_sorted)})"
        )

        # Inset for relative values
        fig.canvas.draw()
        x_lim = ax.get_xlim()  # Get the current x-axis limits from the main plot

        bbox = ax.get_position()
        bb_left, bb_bottom = bbox.x0, bbox.y0
        bb_width, bb_height = bbox.width, bbox.height

        ax_inset = plt.axes(
            [bb_left, 0.05, bb_width, 0.15],
            alpha=ALPHA_PAPER_BAR,
            facecolor="lightgrey",
        )
        for i, instance in enumerate(
            type_instances_sorted[start_index:stop_index], start=start_index
        ):
            yearly_occurrences = type_matrix[:, i]
            values_relative = [
                occurrences / papers if papers > 0 else 0
                for occurrences, papers in zip(yearly_occurrences, yearly_papers)
            ]
            line_relative = ax_inset.plot(
                range(len(years)),
                values_relative,
                label=f"{instance} (relative)",
                zorder=3,
            )[0]

            # add the error part
            if error_matrix is not None and instance in error_instances:
                color = line_relative.get_color()
                errors = error_matrix[:, error_instances.index(instance)]
                errors_plus = yearly_occurrences + errors
                errors_relative = [
                    error / papers if papers > 0 else 0
                    for error, papers in zip(errors_plus, yearly_papers)
                ]
                if max(errors_relative) > 1:
                    print(f"Error: {instance} has a relative error > 1")
                    # throw an exception because this should never be the case:
                    # raise Exception(f"Error: relative {instance} occurrence + error > 1")

                ax_inset.plot(
                    range(len(years)),
                    errors_relative,
                    alpha=ALPHA_ERROR_LINE,
                    color=color,
                    label=f"{instance} (error, relative)",
                    zorder=2,
                )
                # color in the area between the normal line and the error line
                ax_inset.fill_between(
                    range(len(years)),
                    values_relative,
                    errors_relative,
                    alpha=ALPHA_ERROR_ZONE,
                    color=color,
                    zorder=1,
                )

        ax_inset.set_xlim(x_lim)

        ax_inset.set_xticks([])
        ax_inset.set_yticks(np.arange(0, 1.1, step=0.5))
        ax_inset.set_yticklabels(
            [f"{int(x*100)}%" for x in np.arange(0, 1.1, step=0.5)], fontsize=8
        )

        # set y axis label
        ax_inset.set_ylabel("relative", fontsize=10)

        plt.subplots_adjust(bottom=0.3)

        start_string = f"{start_index+1}"
        stop_string = f"{stop_index}"

        # fill up with 0 to have a constant length
        start_string = "0" * (3 - len(start_string)) + start_string
        stop_string = "0" * (3 - len(stop_string)) + stop_string

        part_appendix = f"{start_string}_to_{stop_string}"
        filepath = os.path.join(path, name)
        plt.savefig(f"{filepath}_{type.replace(' ', '_')}_{part_appendix}.png")
        plt.close()

        start_index = stop_index
        if start_index < len(type_instances_sorted):
            # if recursion_depth > 0:
            #     break
            visualize_timeline(
                config,
                year_instance_occurrence_matrix,
                year_papers,
                instances,
                {type: instance_types_dicts[type]},
                name,
                path=path,
                recursion_depth=recursion_depth + 1,
                start_index=start_index,
                error_matrix=error_matrix,
                error_instances=error_instances,
            )
        start_index = 0


# if config.visualize:
#     yearly_error_matrix, year_error_papers = create_year_paper_occurrence_matrix(
#         papers_metadata, error_matrix, error_papers, is_error_matrix=True
#     )
#     visualize_timeline(
#         config,
#         year_instance_occurrence_matrix,
#         year_papers,
#         instances,
#         instance_types_dicts,
#         name="year_instance_occurrence_matrix",
#         error_matrix=yearly_error_matrix,
#         error_instances=error_instances,
#     )

In [None]:
# Create year_paper_occurrence_matrix
class YearPaperOccurrenceMatrixBuilder(MatrixBuilder):
    """Aggregate a paper x instance occurrence matrix into a year x instance matrix.

    Papers are grouped by their ``year`` attribute and the matrix rows of all
    papers of one year are summed up. The result is stored in ``self.matrix``
    (one row per year from the earliest to the latest year, gap years
    included) and the grouping itself in ``self.year_papers``.
    """

    def __init__(
        self,
        director,
        papers=None,
        paper_instance_occurrence_matrix=None,
        is_error_matrix=False,
    ):
        """Remember the inputs, falling back to the data held by ``director``.

        Args:
            director: Passed on to ``MatrixBuilder``; also supplies ``papers``
                and ``builder["occurrence_matrix"].matrix`` as fallbacks.
            papers: Mapping of paper label to paper object. Falls back to
                ``director.papers`` when empty or ``None``.
            paper_instance_occurrence_matrix: Matrix with one row per paper.
                Falls back to the director's occurrence matrix when ``None``.
            is_error_matrix: If true, every non-zero cell is counted as 1
                before the rows are summed up.
        """
        super().__init__(director)

        self.papers: dict[str, Instance] = papers or director.papers
        # ``matrix or fallback`` would raise "truth value of an array is
        # ambiguous" for any real ndarray, so test for None explicitly.
        if paper_instance_occurrence_matrix is None:
            paper_instance_occurrence_matrix = director.builder[
                "occurrence_matrix"
            ].matrix
        self.paper_instance_occurrence_matrix = paper_instance_occurrence_matrix
        self.is_error_matrix = is_error_matrix
        self.year_papers: dict[int, dict[str, Instance]] = {}

    def build_matrix(
        self, paper_instance_occurrence_matrix=None, papers=None, is_error_matrix=False
    ):
        """Sum the per-paper rows into one row per publication year.

        Args:
            paper_instance_occurrence_matrix: Overrides the matrix given to the
                constructor. Row ``i`` is taken to belong to the ``i``-th paper
                in iteration order of ``papers``.
            papers: Overrides the papers given to the constructor.
            is_error_matrix: Binarise the matrix first; also enabled when the
                constructor was called with ``is_error_matrix=True``.

        Raises:
            ValueError: If no paper has a ``year`` attribute, because the
                covered time span cannot be determined then.
        """
        if paper_instance_occurrence_matrix is None:
            paper_instance_occurrence_matrix = self.paper_instance_occurrence_matrix
        paper_instance_occurrence_matrix = np.asarray(paper_instance_occurrence_matrix)
        papers = papers or self.papers
        is_error_matrix = is_error_matrix or self.is_error_matrix

        # Group the papers by year; papers without a year are left out.
        papers_by_year = {}
        for label, paper in papers.items():
            if hasattr(paper, "year"):
                year = int(getattr(paper, "year"))
                papers_by_year.setdefault(year, {})[label] = paper

        if not papers_by_year:
            raise ValueError("Cannot build a yearly matrix: no paper has a year.")

        # One entry per year of the covered span, in ascending order. Years
        # without papers get an empty dict so that every value has one type.
        earliest, latest = min(papers_by_year), max(papers_by_year)
        self.year_papers = {
            year: papers_by_year.get(year, {}) for year in range(earliest, latest + 1)
        }

        if is_error_matrix:
            # convert any value != 0 to 1
            paper_instance_occurrence_matrix = np.where(
                paper_instance_occurrence_matrix != 0, 1, 0
            )

        # create a year_instance_occurrence matrix from the paper_instance_occurrence_matrix
        row_of_paper = {label: row for row, label in enumerate(papers)}
        year_instance_occurrence_matrix = np.zeros(
            (len(self.year_papers), paper_instance_occurrence_matrix.shape[1]),
            dtype=int,
        )
        for yearID, labels in enumerate(self.year_papers.values()):
            for label in labels:
                year_instance_occurrence_matrix[
                    yearID
                ] += paper_instance_occurrence_matrix[row_of_paper[label]]

        # Publish the result; previously it was computed and then discarded.
        self.matrix = year_instance_occurrence_matrix

    def build(self):
        """Build the matrix from the inputs stored on the builder."""
        self.build_matrix()


# Register the yearly aggregation builder with the director and run it right away.
director.builder["year_instance_occurrence_matrix"] = YearPaperOccurrenceMatrixBuilder(
    director
)
director.builder["year_instance_occurrence_matrix"].build()

# year_instance_occurrence_matrix, year_papers = create_year_paper_occurrence_matrix(
#     papers_metadata, paper_instance_occurrence_matrix, papers
# )

# Setup Complete

We now have:

| Variable                          | Type    | Size         | Comments |
|-----------------------------------|---------|--------------|----------|
| error_instances                   | list    | 165          | Comments |
| error_matrix                      | ndarray | (999, 165)   | Comments |
| error_papers                      | list    | 999          | Comments |
| gap_too_large_threshold           | int     | n.a.         | Comments |
| instance_piece_gap                | dict    | 151          | Comments |
| instance_types_dicts              | dict    | 5            | Comments |
| instances                         | list    | 315          | Comments |
| paper_full_text                   | dict    | 1029         | Comments |
| paper_instance_occurrence_matrix  | ndarray | (1003, 315)  | Comments |
| papers                            | list    | 1003         | Comments |
| pos_in_paper                      | dict    | 1003         | Comments |

Consisting of:
* The paper_instance_occurrence_matrix, binary listing if a term (instance) is present in a paper
  * papers x instances
* The error_matrix, of all instances that were dropped from the paper_instance_occurrence_matrix
  * error_papers x error_instances

And some leftover variables:
* instance_types_dicts, listing all instance types ("process", "software", ...) and their respective instance sets ("Curation", "Knowledge Work", ...)
* paper_full_text, containing each paper's full text
  * pos_in_paper, listing for each paper: for each instance: each position of that instance in that paper's full text.
* instance_piece_gap, a dict listing all instances made up of compound words (e.g. "Knowledge Work") and their minimum distance in each paper's full text
  * gap_too_large_threshold, defining how far apart a finding of "Knowledge" and "Work" may be to still qualify as "Knowledge Work"

In [None]:
# ~3 min | {( len(papers) * len(instances) ) / (3 * 1000) } seconds: compare the proximity of all instances with one another
# ~8 min right now.
# 3 min 30 sec with 164 papers and 339 instances
class ProximityMatrixBuilder(MatrixBuilder):
    """Build a symmetric instance x instance matrix of textual proximity.

    For every pair of instances and every paper, the minimum distance reported
    by ``pos_in_paper`` is turned into a weight (see ``mode``) and the weights
    of all papers are summed up in ``self.matrix``.

    Supported modes:
        "sqrt"   - 1 / (square root of the distance)
        "linear" - 1 / distance
        "binary" - 1 if distance < config.gap_too_large_threshold, 0 otherwise
        "log"    - 1 / log(distance)
    A distance of 0 or 1 always counts as 1.
    """

    _MODES = ("sqrt", "linear", "binary", "log")

    def __init__(
        self,
        director: Director,
        instances=None,
        papers=None,
        pos_in_paper=None,
        mode="sqrt",
    ):
        """Store the inputs, falling back to the data held by ``director``.

        Args:
            director: Passed on to ``MatrixBuilder``; supplies ``instances``,
                ``papers`` and ``pos_in_paper`` when they are not given.
            instances: Mapping of instance label to instance object.
            papers: Mapping of paper label to paper object.
            pos_in_paper: Position index used to look up minimum distances.
            mode: Name of the distance weighting, one of ``_MODES``.
        """
        super().__init__(director)

        self.instances: dict[str, Instance] = instances or director.instances
        self.papers: dict[str, Instance] = papers or director.papers
        self.pos_in_paper: PosInPaper = pos_in_paper or director.pos_in_paper

        self.mode = mode

    def build(self):
        """Build the matrix, then drop empty rows/columns and their instances."""
        self.build_matrix()
        self.remove_zeros()
        self.instances = self.handle_deletions(self.instances)

    def _proximity_weight(self, distance):
        """Translate a non-negative distance into a weight according to ``self.mode``.

        Raises:
            ValueError: If ``self.mode`` is not one of ``_MODES``.
        """
        if distance == 0 or distance == 1:
            return 1
        if self.mode == "sqrt":
            return 1 / np.sqrt(distance)
        if self.mode == "linear":
            return 1 / distance
        if self.mode == "binary":
            # NOTE(review): reads the module-level ``config``, as the original
            # code did - confirm whether the builder has its own config.
            return 1 if distance < config.gap_too_large_threshold else 0
        if self.mode == "log":
            return 1 / np.log(distance)
        raise ValueError(f"Unknown proximity mode: {self.mode!r}")

    @time_function
    def build_matrix(
        self,
        try_to_save_time=False,
    ):
        """Fill ``self.matrix`` with the summed proximity weights of all pairs.

        Args:
            try_to_save_time: Currently unused; kept for compatibility.

        Raises:
            ValueError: If ``self.mode`` is not one of ``_MODES``. It is raised
                once up front instead of printing an error for every pair.
        """
        # TODO: Optimize this function.
        # each instance needs to have its occurrences as pieces clustered
        # together, so that only those below max distance are considered
        if self.mode not in self._MODES:
            raise ValueError(f"Unknown proximity mode: {self.mode!r}")

        # Use the index handed to the builder, not a same-named global.
        pos_in_paper = self.pos_in_paper

        indexed_instances = {instance: i for i, instance in enumerate(self.instances)}
        instance_count = len(indexed_instances)
        self.matrix = np.zeros((instance_count, instance_count), dtype=float)

        # There is a chance that pos_in_paper papers and instances are out of
        # sync with the current papers and instances, so map between both.
        paperIDs = [
            paperID
            for paperID, name in enumerate(pos_in_paper.papers)
            if name in self.papers
        ]
        # matrix index of an instance -> index of the same literal in pos_in_paper
        lID_map = {
            indexed_instances[name]: literalID
            for literalID, name in enumerate(pos_in_paper.literals)
            if name in self.instances
        }
        word_combinations = pos_in_paper.word_combination_index_literal_literal

        for id1 in range(instance_count):
            if id1 not in lID_map:
                # Instance unknown to pos_in_paper: its row stays zero.
                continue
            combinations_of_id1 = word_combinations[lID_map[id1]]
            for id2 in range(id1 + 1, instance_count):
                if id2 not in lID_map:
                    continue
                wcID = combinations_of_id1[lID_map[id2]]
                for paperID in paperIDs:
                    distance = pos_in_paper.find_min_distance_by_id(paperID, wcID)
                    if distance < 0:
                        # negative distance: treated as "pair not found in this paper"
                        continue
                    result = self._proximity_weight(distance)
                    if result > 0.0:
                        # Write both halves so the matrix stays symmetric
                        # (an earlier version of this calculation was not).
                        self.matrix[id1][id2] += result
                        self.matrix[id2][id1] += result

        # Removing all-zero rows/columns and the matching instances is left to
        # build(), which calls remove_zeros() and handle_deletions().



In [None]:
# Hand the position index to the director, then register and run the proximity builder.
director.pos_in_paper = pos_in_paper
director.builder["proximity_matrix"] = ProximityMatrixBuilder(director)
director.builder["proximity_matrix"].build()
# instance_instance_proximity_matrix, proximity_instances = calculate_proximity_matrix(
#     config, pos_in_paper, instances
# )

# Knowledge Graph creation

In [None]:
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori


def get_rules(matrix, columns, min_support=0.4, max_len=2, sort_by="lift"):
    """Mine association rules from an occurrence matrix.

    Args:
        matrix: Two-dimensional occurrence data with one column per entry of
            ``columns``; every non-zero cell counts as "present".
        columns: Column labels, used as item names in the rules.
        min_support: Minimum support passed to ``apriori``.
        max_len: Maximum itemset size passed to ``apriori``. With 2, every
            rule has exactly one antecedent and one consequent.
        sort_by: Rule metric to sort by, descending. "lift" is probably the
            most important one; "confidence" and "leverage" are alternatives.

    Returns:
        The rules DataFrame produced by ``association_rules``, sorted by
        ``sort_by`` in descending order.
    """
    # apriori expects a boolean DataFrame; handing over the raw ndarray failed
    # with "'numpy.ndarray' object has no attribute 'dtypes'".
    dataframe = pd.DataFrame(matrix, columns=columns).astype(bool)

    itemsets = apriori(
        dataframe, min_support=min_support, use_colnames=True, max_len=max_len
    )
    itemsets = itemsets.sort_values(by="support", ascending=False)
    itemsets = itemsets.reset_index(drop=True)

    rules = association_rules(itemsets)
    return rules.sort_values(by=sort_by, ascending=False)


# Mine the rules from the paper x instance occurrence matrix; the instance
# labels double as column (item) names.
rules = get_rules(
    director.builder["occurrence_matrix"].matrix,
    list(director.instances),
)

In [None]:
# Hand all mined rules to process_dataframe under the name "rules".
# NOTE(review): presumably this exports them as rules.csv - confirm against
# the definition of process_dataframe.
process_dataframe(config, rules, "rules")

In [None]:
def identify_cross_type_rules(rules, director: "Director"):
    """Return a copy of the rules whose two sides belong to different types.

    The type of an instance is the first entry of its ``instance_of`` list in
    ``director.instances``. Rules where either side has no known type (unknown
    instance, missing or empty ``instance_of``) are not cross-type.

    Args:
        rules: DataFrame with the columns ``antecedents`` and ``consequents``.
            Each cell is either a plain string or a collection holding
            exactly one item.
        director: Object whose ``instances`` maps instance labels to dict-like
            instance descriptions.

    Returns:
        A copy of the matching rows of ``rules``.

    Raises:
        ValueError: If a non-string cell does not hold exactly one item.
    """

    def _single_item(cell):
        """Unwrap a one-element collection; plain strings pass through."""
        if isinstance(cell, str):
            return cell
        (item,) = cell
        return item

    def _first_type(label):
        """Return the first type of the instance, or None if there is none."""
        # ``or [None]`` also covers an empty list, which used to raise IndexError.
        instance_of = director.instances.get(label, {}).get("instance_of") or [None]
        return instance_of[0]

    cross_type = []
    for antecedent, consequent in zip(rules["antecedents"], rules["consequents"]):
        type1 = _first_type(_single_item(antecedent))
        type2 = _first_type(_single_item(consequent))
        cross_type.append(bool(type1 and type2 and type1 != type2))

    # .loc keeps the mask row-based even when it is empty; plain [] would read
    # an empty list as a column selection.
    return rules.loc[cross_type].copy()


# Keep only the rules that connect instances of two different types.
rules_cross_type = identify_cross_type_rules(rules=rules, director=director)

In [None]:
# def process_dataframe(config:Config, input_df, name = "some_df", path=None):
#     if path is None:
#         path = config.get_output_path()
#     filepath = os.path.join(path, name)

#     # convert all froensets to strings
#     for col in input_df.columns:
#         if isinstance(col[0], frozenset):
#             # input_df[col] = input_df[col].apply(lambda x: "_".join(x))
#             # input_df[col] = input_df[col].apply(lambda x: "_".join(x))
#             input_df[col] = input_df[col].apply(lambda x: x + "_HI!")
#             pass

#     input_df.to_csv(filepath + '.csv', sep=config.csv_separator, decimal=config.csv_decimal)
#     show(input_df)

# rules_cross_type = identify_cross_type_rules(rules)

# Hand the cross-type rules to process_dataframe under the name
# "rules_cross_type"; save_as_owl later reads rules_cross_type.csv from the
# output path.
# NOTE(review): presumably process_dataframe is what writes that file - confirm.
process_dataframe(config, rules_cross_type, "rules_cross_type")
# cross_type_rules

In [None]:
# Flag recording whether one of the knowledge graph population variants
# (both currently disabled below) has already run.
kg_done = False


def print_kg_dict(config: "Config", kg_dict, header):
    """Write the instance pairs of a triangular type dict to instance_relations.csv.

    ``kg_dict`` maps every type to a dict of the types that follow it, which
    in turn maps to a list of ``(instance, other_instance)`` pairs. Each pair
    becomes one row with as many cells as ``kg_dict`` has types: the first
    instance sits in the column of its type, the second in the column of the
    other type, all remaining cells stay empty.

    Args:
        config: Provides ``get_output_path()`` and ``csv_separator``.
        kg_dict: Triangular dict as described above; the n-th inner key of the
            type at position p belongs to column p + 1 + n.
        header: Ready-made header line (without line break).
    """
    filepath = os.path.join(config.get_output_path(), "instance_relations.csv")
    # Use the same separator the header was joined with instead of a fixed ",".
    separator = config.csv_separator
    column_count = len(kg_dict)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(header + "\n")
        for pos1, type1 in enumerate(kg_dict):
            for pos2, type2 in enumerate(kg_dict[type1]):
                for i1, i2 in kg_dict[type1][type2]:
                    # Filling a fixed-width row yields exactly one cell per
                    # header column; the hand-counted separators produced one
                    # trailing separator too many.
                    row = [""] * column_count
                    row[pos1] = i1
                    row[pos1 + 1 + pos2] = i2
                    f.write(separator.join(row) + "\n")


def knowledge_graph_population_cross_type_rules(
    config: "Config", rules: "pd.DataFrame", instance_types_dicts
):
    """Sort the instance pairs of cross-type rules by type and write them to CSV.

    Args:
        config: Provides ``csv_separator``; passed on to ``print_kg_dict``.
        rules: Rules with the columns ``antecedents`` and ``consequents``,
            each cell holding exactly one item.
        instance_types_dicts: Mapping of type name to the instances of that
            type. Its key order defines the column order of the CSV.

    Returns:
        Always ``True``.
    """
    header = config.csv_separator.join(instance_types_dicts.keys())

    # Triangular dict: every type maps to the types that come after it.
    type_names = list(instance_types_dicts)
    pairs_by_type = {
        earlier: {later: [] for later in type_names[position + 1 :]}
        for position, earlier in enumerate(type_names)
    }

    for antecedent_set, consequent_set in zip(rules.antecedents, rules.consequents):
        (antecedent,) = antecedent_set
        (consequent,) = consequent_set
        first_type = None
        second_type = None
        first_instance = None
        second_instance = None
        # Walk the types in column order, so that the instance of the earlier
        # type ends up first, whichever side of the rule it was on.
        for type_name in instance_types_dicts:
            for candidate in (antecedent, consequent):
                if candidate in instance_types_dicts[type_name]:
                    if not first_type:
                        first_type = type_name
                        first_instance = candidate
                    else:
                        second_type = type_name
                        second_instance = candidate
            if first_type and second_type:
                break
        # Both sides need a known type; if only one was found, the lookup
        # below used to fail with a KeyError for the key None.
        if first_type and second_type and first_type != second_type:
            pairs_by_type[first_type][second_type].append(
                (first_instance, second_instance)
            )

    print_kg_dict(config, pairs_by_type, header)

    return True


## Disabled. likely not needed anymore
# try:
#     kg_done = knowledge_graph_population_cross_type_rules(
#         config, rules_cross_type, director
#     )
# except Exception as e:
#     if config.debug:
#         raise e
#     else:
#         print(e)

In [None]:
# prepare csv file again
# process,software,data item,data model,data format specification,interchange format,data visualization,data validation,inference,source


@time_function
def knowledge_graph_population(
    config: Config,
    instance_types_dicts,
    property_types_dicts,
    instance_instance_proximity_matrix,
    proximity_instances,
):
    """Write every sufficiently close pair of instances of different types to CSV.

    One row is written per ordered pair ``(instance, other_instance)`` whose
    types differ and whose proximity exceeds ``config.proximity_min_value``.
    The row has one cell per type; only the cells of the two types are filled.

    Args:
        config: Provides ``proximity_min_value``, ``csv_separator`` and
            ``get_output_path()``.
        instance_types_dicts: Mapping of type name to the instances of that
            type. Its key order defines the column order.
        property_types_dicts: Currently unused.
            TODO: handle properties specially.
        instance_instance_proximity_matrix: Square matrix indexed like
            ``proximity_instances``.
        proximity_instances: Instance labels in matrix order; instances that
            are not listed here are skipped.

    Returns:
        Always ``True``.
    """
    columns = list(instance_types_dicts.keys())

    # Look every label up once instead of calling list.index inside the
    # nested loops; setdefault keeps the first position, like list.index.
    matrix_index = {}
    for position, label in enumerate(proximity_instances):
        matrix_index.setdefault(label, position)

    rows = []
    for c_ID, column in enumerate(columns):
        for instance in instance_types_dicts[column]:
            # add the instance to the csv with each of their relations
            instance_index = matrix_index.get(instance)
            if instance_index is None:
                continue
            for oc_ID, other_column in enumerate(columns):
                if other_column == column:
                    continue
                for other_instance in instance_types_dicts[other_column]:
                    other_instance_index = matrix_index.get(other_instance)
                    if other_instance_index is None:
                        continue
                    if (
                        instance_instance_proximity_matrix[instance_index][
                            other_instance_index
                        ]
                        > config.proximity_min_value
                    ):
                        # build row column by column
                        row = [""] * len(columns)
                        row[c_ID] = instance
                        row[oc_ID] = other_instance
                        rows.append(row)

    # write to csv
    filepath = os.path.join(config.get_output_path(), "instance_relations.csv")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(config.csv_separator.join(columns) + "\n")
        for row in rows:
            f.write(config.csv_separator.join(row) + "\n")
    return True

## Disabled. likely not needed anymore
# if not kg_done:
#     kg_done = knowledge_graph_population(
#         config,
#         instance_types_dicts,
#         property_types_dicts,
#         instance_instance_proximity_matrix,
#         proximity_instances,
#     )

In [None]:
from owlready2 import *
import pandas as pd
import types


# process,software,data item,data model,data format specification,interchange format,data visualization,data validation,inference,source
# process,software,data item,data model,data format specification


def save_as_owl(config: Config, path=None):
    """Assemble an OWL ontology from the exported CSV/JSON files and save it.

    Reads ``classes.csv`` and ``relations.csv`` from ``config.ontology_path``
    and ``instance_relations.csv``, ``paper_instance_occurrence_matrix.csv``,
    ``rules_cross_type.csv`` and ``instance_types_dicts.json`` from ``path``.
    The ontology is written to ``onto.owl`` inside ``config.ontology_path``.

    Args:
        config: Provides ``ontology_path`` and ``get_output_path()``.
        path: Folder holding the exported files; defaults to
            ``config.get_output_path()``.
    """
    onto_path = config.ontology_path
    df_cl = pd.read_csv(os.path.join(onto_path, "classes.csv"))
    # NOTE(review): relations.csv and instance_relations.csv are loaded but
    # not used below; the reads are kept so that missing files still fail early.
    df_re = pd.read_csv(os.path.join(onto_path, "relations.csv"))
    # The backslash is part of the column name; it is escaped because "\R" is
    # an invalid escape sequence.
    df_re = df_re.set_index("Domain\\Range")
    if path is None:
        path = config.get_output_path()
    data_path = os.path.join(path, "instance_relations.csv")
    df = pd.read_csv(data_path)

    df_contributions = pd.read_csv(
        os.path.join(path, "paper_instance_occurrence_matrix.csv")
    )
    df_rules = pd.read_csv(os.path.join(path, "rules_cross_type.csv"))

    with open(
        os.path.join(path, "instance_types_dicts.json"), encoding="utf-8"
    ) as file:
        inst_data = json.load(file)

    onto = get_ontology("http://tib.eu/slr")

    with onto:

        # Classes: one class per row plus a matching "has <Label>" object property
        for ind, row in df_cl.iterrows():
            cl = types.new_class(row["URI"], (Thing,))
            cl.label = row["Label"]
            relation = types.new_class(
                f'has{row["Label"].title().replace(" ", "")}', (ObjectProperty,)
            )
            relation.label = f'has {row["Label"]}'

        # Instances: one individual per item, attached to the class with that label
        for key, value in inst_data.items():
            cl = onto.search_one(label=key)
            if cl:
                for item in value:
                    inst = cl()
                    inst.label = item

        # Statements: one Contribution per row, mentioning every instance
        # whose column holds a truthy value
        Contribution = types.new_class("Contribution", (Thing,))
        mentions = types.new_class("mentions", (ObjectProperty,))
        mentions.label = "mentions"
        for ind, row in df_contributions.iterrows():
            contrib_inst = Contribution()
            # First cell of the row by position; row[0] on a labelled Series
            # is deprecated positional access.
            contrib_inst.label = row.iloc[0]
            for col in df_contributions.columns:
                if row[col]:
                    inst = onto.search_one(label=col)
                    if inst:
                        contrib_inst.mentions.append(inst)

        # Rules: link antecedent and consequent via the "has <class label>"
        # property of the consequent's class
        for ind, row in df_rules.iterrows():
            subj_inst = onto.search_one(label=row["antecedents"])
            obj_inst = onto.search_one(label=row["consequents"])
            if subj_inst and obj_inst:
                obj_cl = obj_inst.is_a[0]
                rel_label = f"has {str(obj_cl.label[0])}"
                rel = onto.search_one(label=rel_label)
                if rel:
                    rel[subj_inst].append(obj_inst)

    output_path = os.path.join(onto_path, "onto.owl")
    onto.save(output_path)
    onto.destroy()


# save_as_owl(config)

In [None]:
# prepare for ORKG
# header:
# paper:title,paper:authors,paper:publication_month,paper:publication_year,paper:published_in,paper:research_field,paper:doi,paper:url,contribution:research_problem,contribution:extraction_method,Property 1,Property 2


def flatten_nested_properties(data, pefix=""):
    """Flatten nested dicts into a single dict with ":"-joined key paths.

    Every value that is not a dict is kept under the path of keys leading to
    it, e.g. ``{"a": {"b": 1}}`` becomes ``{"a:b": 1}``. ``pefix`` is put in
    front of every resulting key. Empty dicts contribute no entry.
    """

    def _leaves(mapping, path):
        # Depth-first walk that yields (flattened key, value) pairs in order.
        for name, item in mapping.items():
            qualified = f"{path}{name}"
            if isinstance(item, dict):
                yield from _leaves(item, f"{qualified}:")
            else:
                yield qualified, item

    return dict(_leaves(data, pefix))


class Paper:
    """Bibliographic metadata of one paper, prepared for the ORKG export.

    The values are read from a BibTeX-like metadata dict: ``title``,
    ``author`` (names separated by " and "), ``year``, ``journal`` or
    ``conference``, ``publication_month``, ``research_field``, ``doi`` and
    ``url``. Missing values become empty strings, except for the research
    field, which falls back to "R195".
    """

    # Order of the paper fields; matches the "paper:" columns of the ORKG header.
    order = [
        "title",
        "authors",
        "publication_month",
        "publication_year",
        "published_in",
        "research_field",
        "doi",
        "url",
    ]

    def __init__(self, paperID, data=None):
        """Extract the paper fields from ``data``.

        Args:
            paperID: Identifier of the paper.
            data: Metadata dict of the paper; ``None`` means no metadata.
        """
        # A fresh dict per call instead of a shared mutable default argument.
        data = {} if data is None else data

        self.paperID: str = paperID
        # Drop the curly braces of the source format; a missing or None title
        # becomes "". Quoting of the title is handled later.
        self.title: str = (data.get("title") or "").replace("{", "").replace("}", "")

        self.authors: "str | list[str]" = data.get("author", "")
        if isinstance(self.authors, str):
            # Split on " and " including the leading blank; splitting on
            # "and " also cut names such as "Roland" or "Anand" apart.
            names = []
            for author in self.authors.split(" and "):
                parts = author.split(",")
                if len(parts) > 1:
                    # "Last, First" -> "First Last"
                    names.append(f"{parts[1].strip()} {parts[0].strip()}")
                else:
                    names.append(parts[0].strip())
            self.authors = "; ".join(names)

        self.publication_month: int = data.get("publication_month", "")
        self.publication_year: int = data.get("year", "")

        # The first key that is present wins.
        # NOTE(review): the original list named "journal" twice; the third key
        # may have been meant to be another field (e.g. "booktitle") - confirm.
        self.published_in = ""
        for key in ("journal", "conference"):
            if key in data:
                self.published_in = data[key]
                break

        self.research_field: str = data.get("research_field", "")
        if not self.research_field:
            # TODO: find a way to get the research field
            self.research_field = "R195"
        self.doi: str = data.get("doi", "")
        self.url: str = data.get("url", "")


class Contribution:
    """Properties of one paper, flattened to ":"-joined keys."""

    def __init__(self, paperID, properties=None):
        """Store the paper ID and the flattened ``properties``.

        Args:
            paperID: Identifier of the paper the contribution belongs to.
            properties: Possibly nested dict of properties; ``None`` means none.
        """
        self.paperID: str = paperID
        # A fresh dict per call instead of a shared mutable default argument.
        self.properties: dict = flatten_nested_properties(
            {} if properties is None else properties
        )


class ORKGComparison:
    """Collects papers and their contributions and exports them as one CSV.

    The CSV has one row per paper: the ``Paper.order`` columns first, followed
    by one column per property value slot.
    """

    def __init__(self):
        self.papers = {}  # row index -> Paper
        # NOTE: the attribute keeps its historical spelling ("contibutions")
        # so existing callers keep working.
        self.contibutions: dict[int, Contribution] = {}  # row index -> Contribution
        # property name -> largest number of values any contribution has for it
        self.properties = {}

    def populate(
        self,
        config: "Config",
        papers,
        instances,
        instance_types_dicts,
        paper_instance_occurrence_matrix,
        papers_metadata,
    ):
        """Create one ``Paper`` and one ``Contribution`` per entry of ``papers``.

        Args:
            config: Not used by this method.
            papers: Iterable of paper identifiers; the iteration index is used
                as row index into ``paper_instance_occurrence_matrix``.
            instances: Sequence of instance labels, indexed like the matrix
                columns.
            instance_types_dicts: Mapping of property name to its candidate
                values.
            paper_instance_occurrence_matrix: Indexed as ``[paper][instance]``;
                a cell equal to ``1`` marks an occurrence.
            papers_metadata: Mapping of paper identifier to metadata dict.

        Returns:
            ``self``, to allow chaining.
        """
        # Per property: how many of its values exist in ``instances``.
        # NOTE(review): the column slicing below assumes that ``instances`` is
        # grouped by property, in the iteration order of
        # ``instance_types_dicts`` - confirm against the caller.
        property_ranges = {
            prop: sum(value in instances for value in values)
            for prop, values in instance_types_dicts.items()
        }

        for paperID, paper in enumerate(papers):
            paper_data = papers_metadata.get(paper, {})
            self.papers[paperID] = Paper(paper, paper_data)

            properties = {prop: [] for prop in property_ranges}
            floor = 0  # first matrix column of the current property
            for prop, prop_range in property_ranges.items():
                for i in range(floor, floor + prop_range):
                    if paper_instance_occurrence_matrix[paperID][i] == 1:
                        properties[prop].append(instances[i])
                floor += prop_range
            self.contibutions[paperID] = Contribution(paper, properties)
        return self

    def populate_properties(self):
        """Record, per property, the maximum number of values over all contributions.

        A non-list value counts as a single value.

        Returns:
            The updated ``self.properties`` mapping.
        """
        for contribution in self.contibutions.values():
            for prop, value in contribution.properties.items():
                len_values = len(value) if isinstance(value, list) else 1
                if prop not in self.properties or self.properties[prop] < len_values:
                    self.properties[prop] = len_values
        return self.properties

    def get(self, key):
        """Return attribute ``key``; ``properties`` is computed first if still empty."""
        if key == "properties" and not self.properties:
            self.populate_properties()
        return getattr(self, key)

    def save(self, config: "Config", path=None, name="orkg_comparison"):
        """Write the comparison to ``<path>/<name>.csv``.

        Args:
            config: Supplies ``csv_separator`` and, when ``path`` is ``None``,
                ``orkg_path``.
            path: Target directory; defaults to ``config.orkg_path``.
            name: File name; ``.csv`` is appended when missing.
        """
        import csv  # local import keeps this block self-contained

        if path is None:
            path = config.orkg_path
        filepath = os.path.join(path, name)
        if not filepath.endswith(".csv"):
            filepath += ".csv"

        properties = self.get("properties")

        # Header: paper columns, then each property repeated once per value slot.
        header = ["paper:" + prop for prop in Paper.order]
        for prop, count in properties.items():
            header += [prop] * count
        rows = [header]

        for paperID, contribution in self.contibutions.items():
            paper = self.papers[paperID]
            row = [getattr(paper, key, "") for key in Paper.order]
            for prop, count in properties.items():
                value = contribution.properties.get(prop, "")
                # Copy before padding: ``+=`` on the stored list would write
                # the padding back into ``contribution.properties``.
                cells = list(value) if isinstance(value, list) else [value]
                cells += [""] * (count - len(cells))
                row += cells
            rows.append(row)

        # csv.writer quotes cells containing the separator, a quote character
        # or a line break, doubles embedded quotes, and stringifies non-string
        # cells. The file is opened in default text mode with "\n" as line
        # terminator so line endings stay what plain ``f.write(... + "\n")``
        # produced.
        with open(filepath, "w", encoding="utf-8") as f:
            writer = csv.writer(
                f, delimiter=config.csv_separator, lineterminator="\n"
            )
            writer.writerows(rows)


# orkg_comparison = ORKGComparison()
# orkg_comparison.populate(
#     config,
#     papers,
#     instances,
#     instance_types_dicts,
#     paper_instance_occurrence_matrix,
#     papers_metadata,
# )
# orkg_comparison.save(config)

# Output

In [None]:
# Save every builder that carries a matrix. Row and column labels are looked
# up only to fail early when a builder offers none; the values themselves are
# used solely by the disabled process_matrix call at the bottom.
for name, builder in director.builder.items():
    if not hasattr(builder, "matrix") or name in ["year_instance_occurrence_matrix"]:
        continue

    # Row labels: the first of these attributes that the builder defines.
    candidates = ["papers", "instances"]
    rows = next(
        (
            getattr(builder, candidate)
            for candidate in candidates
            if hasattr(builder, candidate)
        ),
        [],
    )
    if not rows:
        raise Exception(f"Could not find rows for {name}")
    if isinstance(rows, dict):
        rows = list(rows)

    # Column labels: same lookup with a different attribute preference.
    candidates = ["instances", "literals"]
    cols = next(
        (
            getattr(builder, candidate)
            for candidate in candidates
            if hasattr(builder, candidate)
        ),
        [],
    )
    if not cols:
        raise Exception(f"Could not find cols for {name}")
    if isinstance(cols, dict):
        cols = list(cols)

    builder.save()
    # process_matrix(director.config, builder.matrix, rows, cols, name)

In [None]:
# config.visualize = True

## Functions

## Files

In [None]:
# process_list(config, instances, "instances")

In [None]:
# All Dicts: instance_types_dicts, papers_metadata, instance_piece_gap
# process_dict(config, instance_types_dicts, "instance_types_dicts")
# process_dict(config, papers_metadata, "papers_metadata")

### Rules

### Paper x Instance

In [None]:
# process_matrix(
#     config,
#     paper_instance_occurrence_matrix,
#     rows=papers,
#     columns=instances,
#     name="paper_instance_occurrence_matrix",
# )

In [None]:
# process_matrix(
#     config,
#     error_matrix,
#     rows=error_papers,
#     columns=error_instances,
#     name="error_matrix",
# )

### Instance x Instance

In [None]:
# process_matrix(
#     config,
#     instance_instance_co_occurrence_matrix,
#     rows=instances,
#     columns=instances,
#     name="instance_instance_co_occurrence_matrix",
# )

In [None]:
# process_matrix(
#     config,
#     instance_instance_relative_co_occurrence_matrix,
#     rows=instances,
#     columns=instances,
#     name="instance_instance_relative_co_occurrence_matrix",
# )

In [None]:
# process_matrix(
#     config,
#     instance_instance_proximity_matrix,
#     rows=proximity_instances,
#     columns=proximity_instances,
#     name="instance_instance_proximity_matrix",
#     instance_types_dicts=instance_types_dicts,
# )

# Approach

## Pre-Processing
Using Completion Rating in %

### 80 %: Full Text extraction
* lacking noise removal (Headings, page numbers, ...)
* lacking line-break mending

### 100 %: Bag of Words
* The problem with BoW is that words are looked at separately, so correlations between them are not really clear.


### 99 %: TF-IDF
* tf-idf only on terms

### ? %: Part Of Speech (POS) Tagging, Named Entity Recognition (NER) 
* ready, but not used currently

## Visualize

### 85 % Matrix
* CSV and Dataframe dumps work fine
* Visualizations as PNG or SVG are extremely large.
  * DPI regulation works to somewhat keep this in check, but images still reach 20 MB
* An interactive matrix would be preferred.
  * If you hover over a cell, it shows you the x and y labels and its value.

### 100 % Timeline
* arrange the papers on a timeline and identify the flow of:
  * Processes
  * File formats
  * software
  * ...
* Additional ideas:
  * Compare this to Google Trends

### GraphDB
* Visualize

## Future Work
Using Difficulty ranked (DR) solutions:

### Step 0: Look it up

#### Wikidata linking & more
* https://openrefine.org/

#### More visualization
* https://github.com/JasonKessler/scattertext 
* https://pypi.org/project/yellowbrick/

#### NLP Pipelines:
https://spacy.io/usage/processing-pipelines


#### BLAST: Basic Local Alignment Search Tool
  * starting point: https://academic.oup.com/bioinformatics/article/39/12/btad716/7450067

#### AMIE 3
  * https://luisgalarraga.de/docs/amie3.pdf
  * https://github.com/dig-team/amie

### Step 1: Low hanging fruits

#### 1/5 DR: multi-word detection (n-gram)
Tools:  nltk, spaCy, etc.

### Step 2: Not-too-tricky follow-up

#### 3/5 DR: Acronym Expansion
Tools: spaCy - https://spacy.io/universe/project/neuralcoref

#### 3/5 DR: CoReference resolution
Tools: spaCy - https://spacy.io/universe/project/neuralcoref or https://huggingface.co/coref/ (you can use the model out of the box)

### Step 3: Vector-magic

#### 2-4/5 DR: Word embedding
* Find out that jpeg and png are similar

(depending on your needs) - Tools: gensim - https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/

#### 3/5 DR: document embedding
Tools: gensim - https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

I would also check graph embeddings, sentence embeddings, and recently there is LLM2Vec

### Step 3.1: Reaping the vector-rewards

#### 1/5 DR: clustering
Tools: sklearn

Requirements: Need to have data as numbers first. This is quite possible after generating embeddings

### Step 9: Won't be happening in this paper
* Paper classes
* Subclasses of paper classes
* model which process is a subprocess of another process