# Evaluate Instance occurrence in Papers

## Setup

In [1]:
from bnw_tools.SLR.config import Config

# Notebook-wide settings. The plain dictionary is handed to Config in one go,
# so `config` is a Config object for every following cell.
config = Config(
    {
        "for_git": True,
        "visualize": True,
        "csv_separator": ",",
        "csv_decimal": ".",
        # Should only accepted papers be used for the analysis?
        "only_included_papers": False,
        # Which instance columns actually indicate properties?
        "properties": ["source"],
        "proximity_mode": "sqrt",
        # Paths
        "base_path": "data/",
        "subset_path": "data_subset/",
        "visualization_path": "visualization/",
        "ontology_path": "ontology/",
        "orkg_path": "ORKG/",
        "folder_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR",
        "papers_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/02_nlp",
        "review_path": "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/03_notes",
        "csv_file": "C:/workspace/borgnetzwerk/tools/scripts/SLR/data.csv",
        "obsidian_path": "ontology/obsidian/",
        # Position in Paper settings
        "gap_too_large_threshold": 100,
        "savetime_on_fulltext": False,
        "try_to_save_time": False,
        "recalculate_pos_in_paper": False,
        "debug": True,
        # Wikidata settings
        "wikidata_query_limit": 20,
        # Graph settings
        "proximity_seed": 17,
        "proximity_k_spring": 18,
        "proximity_min_value": 0.1,
    }
)

## Functions

In [None]:
from bnw_tools.SLR.builder import *
# Read the Obsidian vault and take shallow copies of its three registries.
obsidian_folder = ObsidianFolder(config=config)

classes = dict(obsidian_folder.classes.items())
instances = dict(obsidian_folder.instances.items())
papers = dict(obsidian_folder.papers.items())

# Precedence of the top-level classes: an instance that belongs to several of
# them is filed under the one that appears first in this list.
order = ["process", "software", "data item", "data model", "data format specification"]
pos_in_order = dict(zip(order, range(len(order))))
instances_by_class = {class_label: {} for class_label in order}

for label, instance in instances.items():
    instance_of = instance.get("instance_of", [])
    if instance_of:
        positions = [pos_in_order[i] for i in instance_of if i in pos_in_order]
        if positions:
            # Smallest position == highest-precedence class.
            class_ = order[min(positions)]
            instances_by_class[class_][label] = instance
        else:
            print(f"Instance {label} has no class other than {instance_of}")

In [3]:
from owlready2 import *

# Load the OWL file
# NOTE: absolute, machine-specific location of the SLR ontology; adjust when
# running the notebook on another machine.
path = "C:/workspace/borgnetzwerk/tools/scripts/SLR/ontology/slr.owl"
# `onto` is the loaded ontology; later cells index it by name and iterate its
# individuals.
onto = get_ontology(path).load()

In [4]:
# Label of each top-level class -> IRI of the ontology class that represents it.
ontology_class_mapping = {
    "data format specification": 'http://purl.obolibrary.org/obo/IAO_0000098',
    "data item": 'http://purl.obolibrary.org/obo/IAO_0000027',
    "data model": 'https://www.wikidata.org/wiki/Q1172480',
    "process": 'http://purl.obolibrary.org/obo/BFO_0000015',
    "software": 'http://www.ebi.ac.uk/swo/SWO_0000001',
}
# Reverse lookup: IRI -> class label.
ontology_class_mapping_inv = {
    iri: class_label for class_label, iri in ontology_class_mapping.items()
}

# Handles to the five class entities, looked up through the mapping above so
# that every IRI is spelled out only once.
# NOTE(review): these index `onto` with a full IRI - confirm that this resolves
# to the intended entity (and not None) for this ontology's base IRI.
data_format_specification = onto[ontology_class_mapping["data format specification"]]
data_item = onto[ontology_class_mapping["data item"]]
data_model = onto[ontology_class_mapping["data model"]]
process = onto[ontology_class_mapping["process"]]
software = onto[ontology_class_mapping["software"]]

In [None]:
import warnings
# All individuals of the ontology except those typed as "Contribution".
# The class lookup is done once instead of once per individual.
contribution_class = onto['Contribution']
instances_new = {
    individual.name: individual
    for individual in onto.individuals()
    if contribution_class not in individual.is_a
}

# Convert every ontology individual into the notebook's `Instance` form.
#   instances_new_formalized: individual name -> Instance
#   instance_label_dict:      individual name -> label
instances_new_formalized = {}
instance_label_dict = {}
for instance_name, instance_new in instances_new.items():
    # Fall back to the individual's name when it carries no label, instead of
    # failing with an IndexError on `label[0]`.
    labels = instance_new.label
    label = labels[0] if labels else instance_name

    formal_instance = Instance()
    formal_instance.label = label
    formal_instance.source = getattr(instance_new, "source", [])

    # Translate the classes an individual is an instance of into the class
    # labels used by this notebook. `ontology_class_mapping_inv` is keyed by
    # full IRIs, so the entity's `iri` is tried first; `name` is kept as a
    # second attempt so that anything that matched before still matches.
    new_list = []
    for entity in getattr(instance_new, "is_instance_of", []):
        if isinstance(entity, str):
            new_list.append(entity)
            continue
        lookup_keys = (getattr(entity, "iri", None), getattr(entity, "name", None))
        known_key = next(
            (key for key in lookup_keys if key in ontology_class_mapping_inv), None
        )
        if known_key is not None:
            new_list.append(ontology_class_mapping_inv[known_key])
        else:
            warnings.warn(formal_instance.label + " has an unknown instance_of:\n" + str(entity))
    formal_instance.instance_of = new_list

    # NOTE(review): the property is read as "aliase" (not "aliases") - presumably
    # that is how it is named in the ontology; confirm.
    formal_instance.aliases = getattr(instance_new, "aliase", [])
    formal_instance.wikidata_uri = getattr(instance_new, "wikidata_uri", [])
    formal_instance.orkg_uri = getattr(instance_new, "orkg_uri", [])

    # One summary line per instance: "<classes>: <label> (aka <aliases>) [<sources>]".
    # `instance_of` holds only strings at this point; aliases and sources are
    # passed through str() so that a non-string value cannot break the join.
    text = label
    if formal_instance.instance_of:
        text = ", ".join(formal_instance.instance_of) + ": " + label
    if formal_instance.aliases:
        text += " (aka " + " | ".join(map(str, formal_instance.aliases)) + ")"
    if formal_instance.source:
        text += " [" + ", ".join(map(str, formal_instance.source)) + "]"
    print(text)

    instances_new_formalized[instance_name] = formal_instance
    instance_label_dict[instance_name] = label

In [None]:
## Shifting from old to new instances
print("Old instances:", len(instances))
print("New instances:", len(instances_new_formalized))

# From here on the ontology-derived instances replace the vault-derived ones;
# the previous dictionary stays reachable as `instances_old`.
instances_old = instances
instances = instances_new_formalized

# Regroup the new instances by top-level class. An instance that belongs to
# several classes is filed under the one that comes first in `order`.
order = ["process", "software", "data item", "data model", "data format specification"]
pos_in_order = dict(zip(order, range(len(order))))
instances_by_class = {class_label: {} for class_label in order}

for label, instance in instances.items():
    instance_of = instance.get("instance_of", [])
    if instance_of:
        positions = [pos_in_order[i] for i in instance_of if i in pos_in_order]
        if positions:
            class_ = order[min(positions)]
            instances_by_class[class_][label] = instance
        else:
            print(f"Instance {label} has no class other than {instance_of}")

#### Reduce to Reviewed papers

In [7]:
def sort_papers(papers, reverse=False):
    """Return a new dict with the papers ordered by their ``year`` attribute.

    Papers without a ``year`` attribute are sorted as if their year were the
    string "9999". The sort is stable, so papers with equal years keep their
    relative order.

    Args:
        papers: Mapping of paper label to paper object.
        reverse: Sort descending instead of ascending.
    """

    def year_of(entry):
        _, paper = entry
        return getattr(paper, "year", "9999")

    ordered = sorted(papers.items(), key=year_of, reverse=reverse)
    return dict(ordered)

def reduce_to_reviewed_papers(papers, config):
    """Split the papers into included and excluded ones based on review notes.

    Every ``<paper label>.md`` file in ``config.review_path`` is searched for a
    ``review_score:: <n>`` marker. Scores 3-5 mark a paper as included (and are
    stored on the paper as ``review_score``); scores 0-2 mark it as excluded.

    Args:
        papers: Mapping of paper label to paper object.
        config: Object providing ``review_path`` and ``only_included_papers``.

    Returns:
        Tuple ``(papers, included_papers, excluded_papers)``, each sorted with
        ``sort_papers``. ``papers`` is reduced to the included papers when
        ``config.only_included_papers`` is set, otherwise it holds all papers.
    """
    # TODO: sort by review score + average rank
    # TODO: Make this a function that imports more data from the review files
    review_path = config.review_path
    included_identifier = {
        3: "review_score:: 3",
        4: "review_score:: 4",
        5: "review_score:: 5",
    }
    excluded_identifier = {
        2: "review_score:: 2",
        1: "review_score:: 1",
        0: "review_score:: 0",
    }
    included_papers, excluded_papers = {}, {}

    for file in os.listdir(review_path):
        if not file.endswith(".md"):
            continue
        paper_name = file[:-3]
        if paper_name not in papers:
            continue
        if paper_name in included_papers or paper_name in excluded_papers:
            continue
        paper = papers[paper_name]

        with open(os.path.join(review_path, file), "r", encoding="utf8") as f:
            content = f.read()

        # The first matching marker (checked in dictionary order) decides the score.
        included_score = next(
            (score for score, text in included_identifier.items() if text in content),
            None,
        )
        if included_score is not None:
            paper.review_score = included_score
            included_papers[paper_name] = paper
        # The excluded markers are checked independently of the included ones.
        if any(text in content for text in excluded_identifier.values()):
            excluded_papers[paper_name] = paper

    if config.only_included_papers:
        papers = dict(included_papers)
    papers = sort_papers(papers)
    if included_papers:
        included_papers = sort_papers(included_papers)
    if excluded_papers:
        excluded_papers = sort_papers(excluded_papers)
    return papers, included_papers, excluded_papers


# Apply the review filter to the papers loaded from the vault.
papers, included_papers, excluded_papers = reduce_to_reviewed_papers(papers, config)

In [8]:
# Snapshot the attributes of every paper in `papers` as papers.json
# (written to the current working directory).
output = {name: instance.__dict__ for name, instance in papers.items()}
with open("papers.json", 'w', encoding='utf8') as json_file:
    json.dump(output, json_file, ensure_ascii=False, indent=4)

## Instance Setup done.
Proceeding to:

## Matrix calculations

### Instance Occurrence Matrix

In [9]:
# def count_occurrences( papers, instances):
#     occurrences = np.zeros((len(papers), len(instances)), dtype=int)

#     for p, paperpath in enumerate(papers.values()):
#         if isinstance(paperpath, dict) or isinstance(paperpath, Instance):
#             paperpath = paperpath.get("nlp_path", None)
#         with open(paperpath, "r", encoding="utf8") as f:
#             paper = json.load(f)
#             for i, instance in enumerate(instances):
#                 present = True
#                 pieces = split_string(instance)
#                 for piece in pieces:
#                     if piece.lower() not in paper["bag_of_words"]:
#                         present = False
#                         break

#                 if present:
#                     occurrences[p][i] = 1
#     return occurrences


def count_occurrences(papers, instances:dict[str,Instance]):
    """Build the binary paper x instance occurrence matrix.

    A cell is 1 when, for at least one name of the instance (its label or one
    of its aliases), every piece of that name (as produced by ``split_string``)
    is contained, lower-cased, in the paper's ``bag_of_words``.

    Args:
        papers: Mapping of paper id to either the path of the paper's NLP JSON
            file or an object whose ``get("nlp_path")`` returns that path.
        instances: Mapping of instance key to Instance.

    Returns:
        DataFrame of ints with the papers as rows and the instances as columns.

    Raises:
        ValueError: If an instance has no label.
    """
    df = pd.DataFrame(0, index=papers.keys(), columns=instances.keys(), dtype=int)

    for paper_id, paperpath in papers.items():
        if isinstance(paperpath, (dict, Instance)):
            paperpath = paperpath.get("nlp_path", None)
        with open(paperpath, "r", encoding="utf8") as f:
            paper = json.load(f)

        for instance_iri, instance in instances.items():
            label = instance.get("label", None)
            instance_label = instance.get("label", instance_iri)
            aliases = instance.get("aliases", [])
            candidates = {label, instance_label, *aliases}

            # A missing label shows up as None among the candidates.
            if None in candidates:
                print("Instance without label found:")
                print(json.dumps(instance.__dict__, indent=4))
                raise ValueError("Instance without label found")

            # One fully matching name is enough to mark the instance as present.
            found = any(
                all(
                    piece.lower() in paper["bag_of_words"]
                    for piece in split_string(name)
                )
                for name in candidates
            )
            if found:
                df.at[paper_id, instance_iri] = 1
    return df


# Occurrence matrix of the current instances over the (possibly filtered) papers.
occurrence_matrix = count_occurrences(papers, instances)

In [10]:
def remove_zeros(matrix:pd.DataFrame, columns=True, rows=True):
    """Drop the all-zero columns and/or rows of a DataFrame.

    Columns are handled first, so the row check runs on the already reduced
    frame.

    Args:
        matrix: Frame to clean.
        columns: Drop columns that contain only zeros.
        rows: Drop rows that contain only zeros.

    Returns:
        Tuple ``(matrix, [deleted_columns, deleted_rows])``. The deleted labels
        are empty NumPy arrays for an axis that was not processed.
    """
    deleted_columns = np.array([])
    deleted_rows = np.array([])

    if columns:
        empty_columns = (matrix == 0).all(axis=0)
        deleted_columns = matrix.columns[empty_columns]
        matrix = matrix.loc[:, ~empty_columns]

    if rows:
        empty_rows = (matrix == 0).all(axis=1)
        deleted_rows = matrix.index[empty_rows]
        matrix = matrix.loc[~empty_rows, :]

    return matrix, [deleted_columns, deleted_rows]

def sort_instances(instances, instances_by_class, matrix:pd.DataFrame):
    """Order the instances by class and, within a class, by occurrence count.

    The per-instance occurrence totals (column sums of ``matrix``) are written
    to ``instance_occurrences.json`` in ``config.get_output_path()``, sorted
    descending. Instances that are not listed in any class of
    ``instances_by_class`` are not part of the result.

    Args:
        instances: Mapping of instance key to Instance; every key must be a
            column of ``matrix``.
        instances_by_class: Mapping of class label to the instances of that
            class; its iteration order defines the class order.
        matrix: Occurrence matrix with the instances as columns.

    Returns:
        Tuple ``(instances, matrix)`` with the reordered instance dict and the
        matrix reindexed to the same column order.
    """
    totals = {label: sum(matrix[label]) for label in instances.keys()}
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    sorted_instances = {label: float(total) for label, total in ranked}

    filepath = os.path.join(config.get_output_path(), "instance_occurrences")
    with open(filepath + ".json", "w", encoding="utf-8") as f:
        json.dump(sorted_instances, f, ensure_ascii=False, indent=4)

    # Class by class, pick the class members in descending occurrence order.
    reordered = {}
    for members in instances_by_class.values():
        for label in sorted_instances:
            if label in members:
                reordered[label] = instances[label]

    # Bring the matrix columns into the same order.
    matrix = matrix.reindex(columns=reordered)

    return reordered, matrix


# Reorder the instances and the occurrence matrix columns accordingly.
instances, occurrence_matrix = sort_instances(instances, instances_by_class, occurrence_matrix)

In [None]:
# Display the class-sorted occurrence matrix (papers x instances) as cell output.
occurrence_matrix

In [None]:
# Column sums of the occurrence matrix: the number of papers each instance was found in.
occurrence_matrix.sum()

In [13]:
def remove_zeros_np(matrix, columns=True, rows=True):
    """Drop the all-zero columns and/or rows of a 2-D NumPy array.

    Columns are handled first, so the row check runs on the already reduced
    array.

    Args:
        matrix: 2-D array to clean.
        columns: Drop columns that contain only zeros.
        rows: Drop rows that contain only zeros.

    Returns:
        Tuple ``(matrix, [deleted_columns, deleted_rows])``. For a processed
        axis the entry is a boolean mask (True = removed); for an axis that
        was not processed it is an empty array.
    """
    deleted_columns = np.array([])
    deleted_rows = np.array([])

    if columns:
        deleted_columns = (matrix == 0).all(axis=0)
        matrix = matrix[:, ~deleted_columns]

    if rows:
        deleted_rows = (matrix == 0).all(axis=1)
        matrix = matrix[~deleted_rows]

    return matrix, [deleted_columns, deleted_rows]

def reorder_matrix_np(matrix: np.ndarray, new_order, cols=True):
    """Return a copy of ``matrix`` with its columns (or rows) rearranged.

    Args:
        matrix: 2-D array to reorder; it is not modified.
        new_order: Sequence of positions; entry ``i`` names the old column
            (or row) that ends up at position ``i``.
        cols: Reorder the columns when True, the rows otherwise.

    Returns:
        The reordered array. (The function used to return None, which made the
        reordering unobservable for the caller.)
    """
    # NOTE(review): the previous version additionally called remove_zeros_np()
    # here and threw its result away, which had no effect. If stripping
    # all-zero rows/columns after reordering is wanted, apply remove_zeros_np()
    # to the returned array.
    if cols:
        return matrix[:, new_order]
    return matrix[new_order, :]

In [14]:
# get all text files
def get_paper_full_text(directory, papers: "dict[str, Instance]" = None):
    """Find the plain-text version of each paper below ``directory``.

    Every sub-folder of ``directory`` is searched for a ``.txt`` file; the
    first one (in alphabetical order, so the choice is reproducible) is
    registered under its file name without the extension.

    Args:
        directory: Folder that contains one sub-folder per paper.
        papers: Optional mapping of paper label to paper object. When given
            (even if empty), every paper gets a ``full_text_path`` attribute
            that is the matching file path or None.

    Returns:
        ``papers`` (updated in place) when it was passed, otherwise a dict
        mapping the text file's base name to its path.
    """
    paper_full_text = {}
    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".txt"):
                paper_full_text[file[:-4]] = os.path.join(folder_path, file)
                break

    # Test for None explicitly: an empty papers dict must be handed back as
    # is, not replaced by the path mapping.
    if papers is None:
        return paper_full_text

    for paper_label, paper in papers.items():
        setattr(paper, "full_text_path", paper_full_text.get(paper_label))
    return papers

# Attach `full_text_path` to every paper; the texts are looked up below the
# (machine-specific) PDF folder of the review project.
papers = get_paper_full_text(
    "G:/Meine Ablage/SE2A-B42-Aerospace-knowledge-SWARM-SLR/00_PDFs", papers
)

In [15]:
# def replace_nan(df:pd.DataFrame, col, what):
#     # if isinstance(col, list):
#     #     for c in col:
#     #         replace_nan(df, c, what)
#     #     return df
#     # nans = df[col].isnull()
#     # df.loc[nans, col] = [what for isnan in nans.values if isnan]
#     idx = df.isna()
#     if idx.empty:
#         return df
#     df.where(idx, what, inplace=True)
#     # df.loc[idx, col] = [what] * idx.sum()
#     # df.iloc[idx, col] = what
#     return df

from ast import literal_eval


def replace_nan_with_empty_list(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which every missing cell is a new empty list.

    Cells that already hold a container (for example a list of occurrences)
    are passed through untouched. ``pd.isna`` is only asked about scalar
    cells, because for a list it returns an array whose truth value is
    ambiguous and raises a ValueError.
    """

    def fill(cell):
        if pd.api.types.is_scalar(cell) and pd.isna(cell):
            return []
        return cell

    # DataFrame.map only exists in newer pandas versions; applymap is the
    # element-wise equivalent in older ones.
    elementwise = getattr(df, "map", None) or df.applymap
    return elementwise(fill)


class PosInPaper:
    """Locate instance names in paper full texts and measure word proximity.

    Every name of an instance is split into words. For each paper the class
    records where each word occurs in the full text, derives the smallest gap
    (in characters) within which all words of a name occur together, and
    finally the smallest such gap per instance.

    Sentinel values used for distances:
        -1  the word combination / instance does not occur in the paper
        -2  or NaN: not calculated yet

    Attributes:
        mode: "lower" (default) compares lower-cased text and words.
        words: Sorted list of all search words.
        papers: Paper label -> paper object, sorted with ``sort_papers``.
        instances: Instance key -> Instance.
        matches: Word -> set of text snippets in which the word was found
            (including matches that were rejected as parts of longer words).
        word_occurrences_in_papers: papers x words frame; each cell is a list
            of ``[position, word]`` entries.
        word_combinations: frozenset of words -> running number.
        word_combination_min_distance: papers x word combinations frame.
        instance_word_combinations: Instance key -> list of combination numbers.
        instance_min_distance_in_papers: papers x instances frame.
    """

    # Punctuation and quote characters that are deleted from every search word.
    _STRIPPED_CHARS = str.maketrans("", "", "()[]{}:;,.'\"’‘”“´`")

    def __init__(
        self,
        config,
        papers: "dict[str, Instance]" = None,
        instances: "dict[str, Instance]" = None,
        save=True,
    ):
        """Set up the empty state, restore saved data and register the input.

        Args:
            config: Configuration object; ``get_output_path()`` is used for
                all files and an optional ``search_in_text_mode`` is read.
            papers: Papers to register (label -> paper object).
            instances: Instances to register (key -> Instance).
            save: Write the state to disk once everything is registered.
        """
        self.mode = getattr(config, "search_in_text_mode", "lower")
        self.config = config
        self.words = []
        self.papers = {}
        self.instances = {}

        self.matches = {}

        self.word_occurrences_in_papers = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.words, dtype=object
        )

        self.word_combinations = {}
        self.word_combination_min_distance = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.word_combinations.keys(), dtype=int
        )

        self.instance_word_combinations = {k: [] for k in self.instances.keys()}
        self.instance_min_distance_in_papers = pd.DataFrame(
            [[]], index=self.papers.keys(), columns=self.instances.keys(), dtype=int
        )

        self.load()
        if papers:
            self.update_papers(papers)
        if instances:
            self.update_instances(instances)

        if save:
            self.save()

    def save(self):
        """Write the state to the output folder.

        Non-empty DataFrames go to their own ``<attribute>.json`` (frames whose
        cells are lists) or ``<attribute>.csv`` file; everything else except
        ``papers``, ``instances`` and ``config`` is collected in
        ``pos_in_paper.json``.
        """
        data = {}
        for key, value in self.__dict__.items():
            if key in ("papers", "instances", "config"):
                continue
            if isinstance(value, pd.DataFrame):
                self._save_frame(key, value)
            # The two key-specific cases must be tested before the generic
            # dict case; otherwise they are never reached and the word
            # combinations are silently stored as an empty object.
            elif key == "word_combinations":
                # frozensets cannot be JSON keys: store them as a list whose
                # order equals the running numbers; load() rebuilds the dict.
                data[key] = [list(combination) for combination in value]
            elif key == "matches":
                data[key] = {word: list(found) for word, found in value.items()}
            elif isinstance(value, dict):
                # NOTE(review): only Instance and set values are stored; other
                # values (e.g. the lists in instance_word_combinations) are
                # dropped, as before - confirm that this is intended.
                data[key] = {}
                for k, v in value.items():
                    if isinstance(v, Instance):
                        data[key][k] = v.__dict__
                    if isinstance(v, set):
                        data[key][k] = list(v)
            else:
                data[key] = value
        path = f"{self.config.get_output_path()}/pos_in_paper.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    def _save_frame(self, key, frame):
        """Write one non-empty DataFrame attribute to its own JSON or CSV file."""
        if frame.empty:
            return
        base = f"{self.config.get_output_path()}/{key}"
        # Best effort: a frame that cannot be written must not prevent the
        # remaining state from being saved.
        try:
            if isinstance(frame.iloc[0, 0], list):
                parsed = json.loads(frame.to_json(orient="split"))
                with open(base + ".json", "w", encoding="utf-8") as f:
                    json.dump(parsed, f, ensure_ascii=False)
            else:
                with open(base + ".csv", "w", encoding="utf-8") as f:
                    frame.to_csv(f, lineterminator='\n')
        except Exception as e:
            print(f"Error saving {key}: {e}")

    def load(self):
        """Restore the state stored in ``pos_in_paper.json``, if that file exists.

        Only attributes that are still empty are filled; recorded matches are
        merged into the existing ones. DataFrames, papers and instances are
        never restored.
        """
        # TODO: Load Dataframes, sets, Instance, dicts of anything
        # The former per-attribute loading loop was removed: it only assigned
        # to a local variable (so nothing was ever restored) and accessed
        # `data` before it existed, raising a NameError as soon as a matching
        # file was found.
        path = f"{self.config.get_output_path()}/pos_in_paper.json"
        if not os.path.exists(path):
            return
        with open(path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Error loading data: {e}")
                return

        for key, value in data.items():
            if not hasattr(self, key):
                # Entry written by a different version of this class.
                continue
            if key == "word_combinations":
                value = {frozenset(x): i for i, x in enumerate(value)}
            current = getattr(self, key)
            if isinstance(current, pd.DataFrame):
                continue
            if isinstance(current, dict):
                if key in ("papers", "instances"):
                    continue
                if key == "matches":
                    for word, found in value.items():
                        self.matches.setdefault(word, set()).update(found)
                    continue
            if not current:
                setattr(self, key, value)

    def reindex(self):
        """Align all three DataFrames with the current papers, words,
        word combinations and instances."""
        self.word_occurrences_in_papers = self.word_occurrences_in_papers.reindex(
            self.papers.keys(),
            columns=self.words
        )
        self.word_combination_min_distance = self.word_combination_min_distance.reindex(
            self.papers.keys(),
            columns=self.word_combinations.keys()
        )
        self.instance_min_distance_in_papers = self.instance_min_distance_in_papers.reindex(
            self.papers.keys(),
            columns=self.instances.keys()
        )

    def update_papers(self, papers: "dict[str, Instance]"):
        """Register papers that are not known yet.

        Returns:
            True if at least one paper was added.
        """
        changes = False
        for paper_label, paper in papers.items():
            if paper_label not in self.papers:
                self.papers[paper_label] = paper
                changes = True
        if papers:
            # Sort and reindex once for the whole batch instead of once per paper.
            self.papers = sort_papers(self.papers)
            self.reindex()
        return changes

    def update_instances(self, instances: "dict[str, Instance]"):
        """Register instances that are not known yet.

        If anything was added, the word combinations of all passed instances
        are (re-)derived from ``instance.get_all_names()``.

        Returns:
            True if at least one instance was added.
        """
        changes = False
        for instance_label, instance in instances.items():
            if instance_label not in self.instances:
                self.instances[instance_label] = instance
                changes = True
        if changes:
            for instance_label, instance in instances.items():
                candidates = instance.get_all_names()
                candidate_words = self.update_words(candidates)
                for words in candidate_words.values():
                    frozenset_words = frozenset(words)
                    # Reuse the number of a known combination, otherwise
                    # assign the next free one.
                    pos = self.word_combinations.setdefault(
                        frozenset_words, len(self.word_combinations)
                    )
                    combinations = self.instance_word_combinations.setdefault(
                        instance_label, []
                    )
                    if pos not in combinations:
                        combinations.append(pos)
            self.reindex()
        return changes

    def update_words(self, words):
        """Split names into search words and register the new ones.

        Each name is split with ``split_string``; the pieces are lower-cased
        (in "lower" mode) and stripped of punctuation and quote characters.

        Args:
            words: Iterable of names.

        Returns:
            Dict mapping every name to the list of its search words.
        """
        res = {candidate: [] for candidate in words}
        known = set(self.words)
        for candidate in words:
            for word in split_string(candidate):
                if self.mode == "lower":
                    word = word.lower()
                word = word.translate(self._STRIPPED_CHARS)
                if word not in known:
                    known.add(word)
                    self.words.append(word)
                res[candidate].append(word)
        self.words.sort()
        # Only add the missing entries: matches that were loaded or recorded
        # earlier must not be reset to empty sets.
        for word in self.words:
            self.matches.setdefault(word, set())
        return res

    def find_occurrences_in_texts(self, save=True):
        """Record the positions of every search word in every paper full text.

        Before searching, each text is reduced to ASCII letters, digits and
        single spaces (and lower-cased in "lower" mode). A hit only counts
        when it is not directly preceded or followed by a letter. The
        surrounding text of every hit - counted or not - is collected in
        ``matches``. Paper/word cells that already hold occurrences are
        skipped.

        Args:
            save: Write the state to disk afterwards.
        """
        self.word_occurrences_in_papers = replace_nan_with_empty_list(
            self.word_occurrences_in_papers
        )
        total = len(self.papers)
        # Report roughly every 10 %; at least 1 so that fewer than ten papers
        # do not cause a division by zero.
        step = max(1, total // 10)
        for pID, (paper_label, paper) in enumerate(self.papers.items()):
            if pID % step == 0:
                print(f"Progress: {pID}/{total} papers")
            full_text_path = getattr(paper, "full_text_path", None)
            if not full_text_path:
                continue
            with open(full_text_path, "r", encoding="utf8") as f:
                full_text = f.read()

            # Pre-processing:
            # Reduce text to only letters and numbers
            full_text = re.sub(r"[^a-zA-Z0-9 ]", " ", full_text)
            # Reduce multiple whitespaces to single whitespace
            full_text = re.sub(r"\s+", " ", full_text)
            if self.mode == "lower":
                full_text = full_text.lower()
            text_length = len(full_text)

            # TODO: Work with the idea of word based lookup, not character based lookup
            # If we look up words, we lose "Wolfram&Heart" occurrences of "&" and "engineer" in "engineers"
            # If we look up characters, we need to keep track of the length of the words to have a meaningful distance
            for word in self.words:
                if self.word_occurrences_in_papers.at[paper_label, word]:
                    # already searched earlier
                    continue
                if self.mode == "lower":
                    word = word.lower()
                occurrences = self.word_occurrences_in_papers.at[paper_label, word]
                contexts = self.matches.setdefault(word, set())

                pos = full_text.find(word)
                while pos != -1:
                    end = pos + len(word)
                    preceded_by_letter = pos != 0 and full_text[pos - 1].isalpha()
                    followed_by_letter = end != text_length and full_text[end].isalpha()
                    if not (preceded_by_letter or followed_by_letter):
                        # match is valid
                        occurrences.append([pos, word])

                    # Context of the hit: from the space before it up to the
                    # next space. find()/rfind() return -1 when there is no
                    # such space; the text boundary is used instead.
                    context_end = full_text.find(" ", pos + 1)
                    if context_end == -1:
                        context_end = text_length
                    context_start = max(full_text.rfind(" ", 0, pos), 0)
                    contexts.add(full_text[context_start:context_end])

                    pos = full_text.find(word, pos + 1)

        if save:
            self.save()

    def find_min_distance_by_id(self, paper_label, word_combination):
        """Return the smallest gap within which all words of a combination occur.

        The gap is measured in characters between the end of the first word
        and the start of the last word of the tightest window that contains
        every word of the combination; it is 0 for single words and for
        adjacent or overlapping words. The result is cached in
        ``word_combination_min_distance``.

        Args:
            paper_label: Row label of the paper.
            word_combination: frozenset of words (a column label of
                ``word_combination_min_distance``).

        Returns:
            The gap, or -1 if at least one word does not occur in the paper.
        """
        distance = self.word_combination_min_distance.at[paper_label, word_combination]
        if distance == -1:
            # word combination not found in paper
            return -1
        if not (distance == -2 or pd.isna(distance)):
            # already calculated
            return distance

        words = list(word_combination)
        occurrences = [
            self.word_occurrences_in_papers.at[paper_label, word] for word in words
        ]

        if any(not isinstance(hits, list) or not hits for hits in occurrences):
            # at least one word is missing (or was never searched for)
            best = -1
        elif len(words) == 1:
            best = 0
        else:
            slot_of = {word: i for i, word in enumerate(words)}
            word_lengths = [len(word) for word in words]
            # Most recent position of every word; starts at the first occurrences.
            latest = [hits[0][0] for hits in occurrences]
            best = float("inf")
            # Walk through all occurrences in text order and evaluate the
            # window spanned by the most recent position of every word. Each
            # occurrence has to be processed: skipping the later ones (as the
            # previous version did) only ever measured the first occurrences.
            for position, word in sorted(hit for hits in occurrences for hit in hits):
                latest[slot_of[word]] = position
                window_start = min(latest)
                gap = max(latest) - window_start - word_lengths[latest.index(window_start)]
                best = min(gap, best)
                if best <= 0:
                    best = 0
                    break

        self.word_combination_min_distance.at[paper_label, word_combination] = best
        return best

    def find_all_combinations(self, save=True):
        """Calculate the minimal distance of every word combination in every paper.

        Args:
            save: Write the state to disk afterwards.
        """
        for paper_label in self.papers:
            for combination in self.word_combinations:
                self.find_min_distance_by_id(paper_label, combination)
        if save:
            self.save()

    def update_instance_min_distances(self, save=True):
        """Store, per paper and instance, the smallest distance of any of the
        instance's word combinations (-1 if none of them occurs).

        Args:
            save: Write the state to disk afterwards.
        """
        combination_list = list(self.word_combinations.keys())
        for paper_label in self.papers:
            for instance_label, combinations in self.instance_word_combinations.items():
                dist = -1
                for combination in combinations:
                    distance = self.find_min_distance_by_id(
                        paper_label, combination_list[combination]
                    )
                    if distance < 0:
                        # Not found: must not override a combination that was
                        # found (-1 is smaller than every real distance).
                        continue
                    if dist == -1 or distance < dist:
                        dist = distance
                self.instance_min_distance_in_papers.at[paper_label, instance_label] = dist
        if save:
            self.save()
# Build the position index for the current papers and instances. With the
# default save=True the constructor also writes its state to the output folder.
pos_in_paper = PosInPaper(config, papers, instances)

In [None]:
# Step 1: record where each search word occurs in each full text (saving is deferred).
pos_in_paper.find_occurrences_in_texts(save=False)

In [17]:
# Step 2: minimal distance of every word combination per paper (saving is deferred).
pos_in_paper.find_all_combinations(save=False)

In [18]:
# Step 3: reduce the combination distances to one distance per instance and paper.
pos_in_paper.update_instance_min_distances(save=False)

In [None]:
# Display the papers x instances distance frame as cell output.
pos_in_paper.instance_min_distance_in_papers

In [20]:
# Persist the results of the three steps above in one go.
pos_in_paper.save()

### Error Matrix

In [None]:
# director.paper_full_text = paper_full_text

# director.builder["error_matrix_builder"] = ErrorMatrixBuilder(director, pos_in_paper)
# director.builder["error_matrix_builder"].build()

# director.builder["error_matrix_builder"].save()
# director.sort_instances()
# director.builder["occurrence_matrix"].save()


def build_error_matrix(
    config: Config, occurrence_matrix: pd.DataFrame, pos_in_paper: PosInPaper
):
    """Correct ``occurrence_matrix`` in place and return the matching error matrix.

    Every (paper, instance) cell is compared with the minimum distance stored
    in ``pos_in_paper.instance_min_distance_in_papers``:

    * distance above ``config.gap_too_large_threshold``: the occurrence is
      cleared and ``round(log10(distance), 1)`` is written to the error matrix,
    * distance ``-1`` (not found in the full text): the occurrence is cleared
      and ``-1`` is written to the error matrix,
    * any other distance: a missing occurrence is set to ``1``,
    * missing distance (``None`` / NaN): the cell is left untouched.

    Args:
        config: Object providing ``gap_too_large_threshold``.
        occurrence_matrix: Integer papers x instances frame; modified in place.
        pos_in_paper: Object providing ``instance_min_distance_in_papers``,
            indexed by the same paper and instance labels.

    Returns:
        A float DataFrame with the index and columns of ``occurrence_matrix``;
        cells that were not flagged stay ``0``.

    Raises:
        ValueError: If a cell of ``occurrence_matrix`` is not an integer.
    """
    initial_occurrences = occurrence_matrix.sum().sum()
    removed = 0
    added = 0
    error_matrix = pd.DataFrame(
        np.zeros(occurrence_matrix.shape, dtype=float),
        index=occurrence_matrix.index,
        columns=occurrence_matrix.columns,
    )
    min_distances = pos_in_paper.instance_min_distance_in_papers
    threshold = config.gap_too_large_threshold

    for paper_label in occurrence_matrix.index:
        for instance_label in occurrence_matrix.columns:
            min_distance = min_distances.at[paper_label, instance_label]
            current = occurrence_matrix.at[paper_label, instance_label]
            # Accept every integer width: the default integer dtype is platform dependent.
            if not isinstance(current, (int, np.integer)):
                print(f"Wrong data type ({type(current)}) at {instance_label} : {paper_label}.")
                print(f"Data: {current}")
                raise ValueError("Wrong data in occurrence_matrix")
            # Without a distance there is nothing to judge the cell by.
            if pd.isna(min_distance):
                continue
            if min_distance > threshold:
                if current:
                    occurrence_matrix.at[paper_label, instance_label] = 0
                    removed += 1
                # Store the order of magnitude (log base 10) of the gap.
                error_matrix.at[paper_label, instance_label] = round(
                    np.log10(min_distance), 1
                )
            elif min_distance == -1:
                # Some pieces may not be found in the full text; no gap is stored for these.
                if current:
                    occurrence_matrix.at[paper_label, instance_label] = 0
                    removed += 1
                error_matrix.at[paper_label, instance_label] = min_distance
            elif not current:
                occurrence_matrix.at[paper_label, instance_label] = 1
                added += 1
    final_occurrences = occurrence_matrix.sum().sum()

    print(f"Corrected occurrence_matrix from {initial_occurrences} to {final_occurrences} occurrences.")
    print(f"Removed {removed} and added {added} instance occurrences.")

    return error_matrix


error_matrix = build_error_matrix(config, occurrence_matrix, pos_in_paper)

In [22]:
# def reduce_to_existing(input_dict, matrix, axis=0, name=""):
#     before = len(input_dict)
#     presumed_name = ["columns", "rows"]
#     if not name:
#         name = presumed_name[axis]

#     if axis == 0:
#         # columns, likely instances
#         input_dict = {k: v for k, v in input_dict.items() if k in matrix.columns}
#     else:
#         # rows, likely papers
#         input_dict = {k: v for k, v in input_dict.items() if k in matrix.index}
#     after = len(input_dict)
#     if before != after:
#         print(f"Removed {before - after} {name}")
#     return input_dict

# instances = reduce_to_existing(instances, occurrence_matrix, axis=0, name="instances")
# papers = reduce_to_existing(papers, occurrence_matrix, axis=1, name="papers")

In [None]:
occurrence_matrix.sum()

In [None]:
occurrence_matrix

In [25]:
# sort_instances hands back the instances and the occurrence matrix (presumably reordered by class; verify against its definition).
instances, occurrence_matrix = sort_instances(instances, instances_by_class, occurrence_matrix)
# Give the error matrix the same row and column labels, in the same order, as the occurrence matrix.
error_matrix = error_matrix.reindex(index=occurrence_matrix.index, columns=occurrence_matrix.columns)

In [None]:
occurrence_matrix.sum()

In [None]:
occurrence_matrix

In [28]:
# occurrence_matrix, deletions = remove_zeros(occurrence_matrix)
# error_matrix, deletions = remove_zeros(error_matrix)

### Instance_instance Co-occurrence Matrix

In [29]:
# (instances x papers) . (papers x instances): for a 0/1 occurrence matrix, cell (i, j) counts the papers
# containing both instances and the diagonal holds the number of papers per instance.
instance_instance_co_occurrence_matrix = occurrence_matrix.T.dot(occurrence_matrix)

In [None]:
instance_instance_co_occurrence_matrix

In [31]:
# check if nan values are present in instance_instance_co_occurrence_matrix
def clean_nans(matrix:pd.DataFrame):
    """Replace NaN cells of ``matrix`` with 0 in place and return the same frame.

    When at least one NaN is present, the number of NaN cells is printed
    before they are filled; a frame without NaNs is returned untouched.
    """
    nan_count = matrix.isna().sum().sum()
    if not nan_count:
        return matrix
    print(f"{nan_count} NaN values present in matrix")
    matrix.fillna(0, inplace=True)
    return matrix

instance_instance_co_occurrence_matrix = clean_nans(instance_instance_co_occurrence_matrix)

In [32]:
# check if nan values are present in instance_instance_co_occurrence_matrix
def clean_nans(matrix:pd.DataFrame):
    """Zero out NaN cells of ``matrix`` in place, reporting how many were found.

    Returns the frame that was passed in; nothing is printed or changed when
    the frame holds no NaN.
    """
    missing_cells = matrix.isna().sum().sum()
    has_missing = bool(missing_cells)
    if has_missing:
        print(f"{missing_cells} NaN values present in matrix")
        matrix.fillna(0, inplace=True)
    return matrix

# Make sure the absolute co-occurrence matrix holds no NaN before normalising it.
instance_instance_co_occurrence_matrix = clean_nans(instance_instance_co_occurrence_matrix)

# The diagonal is used as divisor; zeros are swapped for ones so the division never divides by zero.
diag_elements = np.diag(instance_instance_co_occurrence_matrix)
diag_elements = np.where(diag_elements == 0, 1, diag_elements)

# Column j is divided by diagonal entry j, then rounded to three decimals and cleared of NaNs.
instance_instance_relative_co_occurrence_matrix = clean_nans(
    (instance_instance_co_occurrence_matrix / diag_elements).round(3)
)

In [None]:
instance_instance_relative_co_occurrence_matrix

## Additional Visualizations

In [34]:
def build_year_paper_occurrence_matrix(
    paper_instance_occurrence_matrix: pd.DataFrame, papers, is_error_matrix=False
):
    """Aggregate a papers x instances matrix into a years x instances matrix.

    Papers are grouped by ``int(paper.year)``; papers without a ``year``
    attribute are ignored. Every year between the earliest and the latest
    one gets a row, years without papers stay all zero.

    Args:
        paper_instance_occurrence_matrix: Frame indexed by paper label with one
            column per instance.
        papers: Mapping of paper label to paper object.
        is_error_matrix: When truthy, every non-zero cell is counted as ``1``
            before summing, so the result counts flagged cells per year.

    Returns:
        ``(year_instance_occurrence_matrix, year_papers)`` where
        ``year_papers`` maps each year (ascending) to a dict of
        ``{paper label: paper}``; years without papers map to an empty dict.
        If no paper has a year, an empty frame and an empty dict are returned.
    """
    year_papers = {}
    for paper, instance in papers.items():
        if hasattr(instance, "year"):
            year = int(getattr(instance, "year"))
            year_papers.setdefault(year, {})[paper] = instance

    if is_error_matrix:
        # Stay a DataFrame: index, columns and .loc are still needed below.
        paper_instance_occurrence_matrix = (
            paper_instance_occurrence_matrix != 0
        ).astype(int)

    columns = paper_instance_occurrence_matrix.columns
    if not year_papers:
        return pd.DataFrame(columns=columns, dtype=int), year_papers

    earliest = min(year_papers)
    latest = max(year_papers)
    # Dense, ascending year range; gaps get an empty group of the same type as filled years.
    year_papers = {
        year: year_papers.get(year, {}) for year in range(earliest, latest + 1)
    }

    year_instance_occurrence_matrix = pd.DataFrame(
        np.zeros((len(year_papers), len(columns)), dtype=int),
        index=list(year_papers.keys()),
        columns=columns,
    )

    known_papers = paper_instance_occurrence_matrix.index
    for year, members in year_papers.items():
        # Papers that are missing from the matrix contribute nothing.
        labels = [paper for paper in members if paper in known_papers]
        if labels:
            year_instance_occurrence_matrix.loc[year] = (
                paper_instance_occurrence_matrix.loc[labels].sum(axis=0).to_numpy()
            )

    return year_instance_occurrence_matrix, year_papers


year_instance_occurrence_matrix, year_papers = build_year_paper_occurrence_matrix(
    occurrence_matrix, papers
)

# year_instance_occurrence_matrix, year_papers = create_year_paper_occurrence_matrix(
#     papers_metadata, paper_instance_occurrence_matrix, papers
# )

In [None]:
year_instance_occurrence_matrix

# Setup Complete

We now have:

| Variable                          | Type    | Size         | Comments |
|-----------------------------------|---------|--------------|----------|
| error_instances                   | list    | 165          | Comments |
| error_matrix                      | ndarray | (999, 165)   | Comments |
| error_papers                      | list    | 999          | Comments |
| gap_too_large_threshold           | int     | n.a.         | Comments |
| instance_piece_gap                | dict    | 151          | Comments |
| instance_types_dicts              | dict    | 5            | Comments |
| instances                         | list    | 315          | Comments |
| paper_full_text                   | dict    | 1029         | Comments |
| paper_instance_occurrence_matrix  | ndarray | (1003, 315)  | Comments |
| papers                            | list    | 1003         | Comments |
| pos_in_paper                      | dict    | 1003         | Comments |

Consisting of:
* The paper_instance_occurrence_matrix, binary listing if a term (instance) is present in a paper
  * papers x instances
* The error_matrix, of all instances that were dropped from the paper_instance_occurrence_matrix
  * error_papers x error_instances

And some leftover variables:
* instance_types_dicts, listing all instance types ("process", "software", ...) and their respective instance sets ("Curation", "Knowledge Work", ...)
* paper_full_text, containing each paper's full text
  * pos_in_paper, listing for each paper and each instance every position of that instance in the paper's full text.
* instance_piece_gap, a dict listing all instances made up of compound words (e.g. "Knowledge Work") and their minimum distance in each paper's full text
  * gap_too_large_threshold, defining how far apart "Knowledge" and "Work" may be found while still qualifying as "Knowledge Work"

In [36]:
# ~3 min | {( len(papers) * len(instances) ) / (3 * 1000) } seconds to compare the proximity of all instances with one another
# ~8 min right now.
# 3 min 30 sec with 164 papers and 339 instances
class ProximityMatrixBuilder(MatrixBuilder):
    """Builds an instances x instances matrix weighted by how close two instances appear in papers.

    For every pair of instances and every paper, the minimum distance of the
    pair is queried from ``pos_in_paper`` and converted into a weight according
    to ``mode``; the weights of all papers are summed symmetrically into
    ``self.matrix``.
    """

    def __init__(self, director:Director, instances = None, papers = None, pos_in_paper = None, mode = "sqrt"):
        """Store the data to work on, falling back to the director's attributes.

        Args:
            director: Passed to the ``MatrixBuilder`` constructor and used as
                fallback source for ``instances``, ``papers`` and ``pos_in_paper``.
            instances: Instances to compare (falsy: ``director.instances``).
            papers: Papers to consider (falsy: ``director.papers``).
            pos_in_paper: Position index (falsy: ``director.pos_in_paper``).
            mode: Distance weighting, one of "sqrt", "linear", "binary", "log".
        """
        super().__init__(director)

        self.instances:dict[str,Instance] = instances or director.instances
        self.papers:dict[str,Instance] = papers or director.papers
        self.pos_in_paper:PosInPaper = pos_in_paper or director.pos_in_paper

        self.mode = mode

    def build(self):
        """Build the matrix, then run the inherited zero removal and deletion handling."""
        self.build_matrix()
        self.remove_zeros()
        self.instances = self.handle_deletions(self.instances)

    @time_function
    def build_matrix(self,
        try_to_save_time=False,
    ):
        """Fill ``self.matrix`` with the summed proximity weights of all instance pairs.

        Distances of 0 and 1 always count as weight 1. Larger distances are
        weighted by ``self.mode``:

        * "sqrt"   - 1 / sqrt(distance)
        * "linear" - 1 / distance
        * "binary" - 1 if distance < gap_too_large_threshold, 0 otherwise
        * "log"    - 1 / log(distance)

        Negative distances (pair not found in the paper) are skipped.

        Args:
            try_to_save_time: Currently unused.
        """
        # TODO: Optimize this function.
        # each instance needs to have its occurrences as pieces clustered together,
        # so that only those below max distance are considered

        # Use the index handed to this builder instead of a module-level variable of the same name.
        pos_in_paper = self.pos_in_paper

        indexed_instances = {instance: i for i, instance in enumerate(self.instances)}

        self.matrix = np.zeros(
            (len(self.instances), len(self.instances)), dtype=float
        )

        # There is a chance that pos_in_paper papers and instances are out of sync
        # with the current papers and instances, so map between both numberings.
        paperIDs = [
            paperID for paperID, name in enumerate(pos_in_paper.papers) if name in self.papers
        ]
        lID_map = {
            indexed_instances[name]: instanceID
            for instanceID, name in enumerate(pos_in_paper.literals)
            if name in self.instances
        }

        for id1 in range(len(self.instances)):
            for id2 in range(id1 + 1, len(self.instances)):
                # FIXME: this resulted in a matrix which was not symmetric.
                # That hints at a problem with the calculation, [id1][id2] and [id2][id1] should be the same
                wcID = pos_in_paper.word_combination_index_literal_literal[lID_map[id1]][
                    lID_map[id2]
                ]
                for paperID in paperIDs:
                    distance = pos_in_paper.find_min_distance_by_id(paperID, wcID)

                    if distance < 0:
                        # pair not found in this paper
                        continue
                    result = 0.0
                    if distance == 0:
                        result = 1
                    elif distance == 1:
                        result = 1
                    elif self.mode == "sqrt":
                        result = 1 / np.sqrt(distance)
                    elif self.mode == "linear":
                        result = 1 / distance
                    elif self.mode == "binary":
                        # NOTE(review): reads the module-level `config`; confirm whether the builder
                        # carries its own config that should be used instead.
                        result = 1 if distance < config.gap_too_large_threshold else 0
                    elif self.mode == "log":
                        result = 1 / np.log(distance)
                    else:
                        print("Error: unknown mode")
                        break
                    if result > 0.0:
                        self.matrix[id1][id2] += result
                        self.matrix[id2][id1] += result

        # TODO rest doesnt seem to work, short fix implemented:
        # create a copy of labels that only contains instances that are in the proximity matrix

        # instance_instance_proximity_matrix, deletions = remove_zeros(
        #     instance_instance_proximity_matrix
        # )
        # proximity_instances = handle_deletions(instances, deletions, rows=False)



In [37]:
# director.pos_in_paper = pos_in_paper
# director.builder['proximity_matrix'] = ProximityMatrixBuilder(director)
# director.builder['proximity_matrix'].build()
# # instance_instance_proximity_matrix, proximity_instances = calculate_proximity_matrix(
# #     config, pos_in_paper, instances
# # )

# Knowledge Graph creation

In [38]:
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori


def get_rules(matrix: pd.DataFrame, columns):
    """Mine association rules from an occurrence matrix, strongest lift first.

    Args:
        matrix: Occurrence matrix; anything that is not a DataFrame is wrapped
            into one using ``columns`` as column labels (``apriori`` needs a frame).
        columns: Column labels, only used when ``matrix`` is not a DataFrame.

    Returns:
        The frame produced by ``association_rules`` for the frequent itemsets
        (minimum support 0.4, at most two items each), sorted by descending "lift".
    """
    frame = matrix if isinstance(matrix, pd.DataFrame) else pd.DataFrame(matrix, columns=columns)

    # Frequent itemsets on the boolean view, ordered by support with a fresh index.
    itemsets = apriori(frame.astype(bool), min_support=0.4, use_colnames=True, max_len=2)
    itemsets = itemsets.sort_values(by="support", ascending=False).reset_index(drop=True)

    # Lift is probably the most important measure, so it defines the order.
    return association_rules(itemsets).sort_values(by="lift", ascending=False)


rules = get_rules(occurrence_matrix, list(instances.keys()))

In [None]:
from bnw_tools.SLR.export import process_dataframe
process_dataframe(config, rules, "rules")

In [40]:
def identify_cross_type_rules(rules, instances: dict[str, Instance]):
    """Return a copy of the rules whose antecedent and consequent have different types.

    The type of a label is the first entry of its instance's "instance_of"
    list. Rules where either side has no known type (unknown label, missing
    or empty "instance_of") are not considered cross-type.

    Args:
        rules: Frame with "antecedents" and "consequents" columns holding
            either a label string or a single-item collection of labels.
        instances: Mapping of label to a dict-like instance.

    Returns:
        A copy of the matching rows of ``rules`` with all columns kept.

    Raises:
        ValueError: If an antecedent or consequent collection does not hold
            exactly one label.
    """

    def _single_label(itemset):
        # Rules come with one-item sets; plain strings are passed through.
        if isinstance(itemset, str):
            return itemset
        (label,) = itemset
        return label

    def _first_type(label):
        # Missing or empty "instance_of" means the type is unknown.
        instance_of = instances.get(label, {}).get("instance_of") or [None]
        return instance_of[0]

    cross_type = []
    for antecedent, consequent in zip(rules.antecedents, rules.consequents):
        type1 = _first_type(_single_label(antecedent))
        type2 = _first_type(_single_label(consequent))
        cross_type.append(bool(type1 and type2 and type1 != type2))

    # An index-aligned boolean Series keeps the columns even when there are no rules at all.
    mask = pd.Series(cross_type, index=rules.index, dtype=bool)
    rules_cross_type = rules[mask].copy()
    return rules_cross_type


rules_cross_type = identify_cross_type_rules(rules, instances)

In [None]:
process_dataframe(config, rules_cross_type, "rules_cross_type")

# Output

In [43]:
# for name, builder in director.builder.items():
#     if not hasattr(builder, "matrix"):
#         continue
#     if name in ["year_instance_occurrence_matrix"]:
#         continue
#     rows = []
#     candidates = ["papers", "instances"]
#     for candidate in candidates:
#         if hasattr(builder, candidate):
#             rows = getattr(builder, candidate)
#             break
#     if not rows:
#         raise Exception(f"Could not find rows for {name}")
#     if isinstance(rows, dict):
#         rows = list(rows.keys())
    
#     cols = []
#     candidates = ["instances", "literals"]
#     for candidate in candidates:
#         if hasattr(builder, candidate):
#             cols = getattr(builder, candidate)
#             break
#     if not cols:
#         raise Exception(f"Could not find cols for {name}")
#     if isinstance(cols, dict):
#         cols = list(cols.keys())
    
#     builder.save()
#     # process_matrix(director.config, builder.matrix, rows, cols, name)

## Functions

## Files

### Paper x Instance

In [44]:
def matrix_to_csv(
    config: Config, matrix: pd.DataFrame, name: str = "some_matrix", path=None
) -> None:
    """Write ``matrix`` to ``<path>/<name>.csv``.

    Args:
        config: Provides ``csv_separator``, ``csv_decimal`` and, when ``path``
            is ``None``, the target folder via ``get_output_path()``.
        matrix: Frame to export.
        name: File name without extension.
        path: Target folder; ``None`` selects the configured output path.
    """
    target_dir = config.get_output_path() if path is None else path
    target_file = os.path.join(target_dir, name) + ".csv"
    matrix.to_csv(target_file, sep=config.csv_separator, decimal=config.csv_decimal)


def get_figsize(
    config: Config,
    size_x: float = None,
    size_y: float = None,
    column_len: int = None,
    row_len: int = None,
) -> tuple[float, float]:
    """Derive a figure size and dpi for a matrix plot.

    NOTE: the two exits have different shapes. When both ``size_x`` and
    ``size_y`` are truthy, the pair ``(size_x, size_y)`` is returned as is.
    Otherwise the result is ``([size_x, size_y], dpi)``, computed from
    ``column_len`` and ``row_len`` (both required on this path).

    Args:
        config: ``config.for_git`` selects the smaller dpi range (96..200
            instead of 300..600).
        size_x: Preset width.
        size_y: Preset height.
        column_len: Number of matrix columns.
        row_len: Number of matrix rows.
    """
    if size_x and size_y:
        return size_x, size_y

    # Start / upper bound of the resolution.
    dpi, max_dpi = (96, 200) if config.for_git else (300, 600)

    # Area budget: 5 % of the square of the largest side (2**16 pixels) at the
    # starting dpi, to produce smaller files.
    largest_side = 2**16 / dpi
    area_budget = largest_side * largest_side * 0.05

    # Experience value of space required per cell.
    cell = 0.18
    width = 2 + column_len * cell
    height = 3 + row_len * 0.8 * cell
    area = width * height

    # Small figures: raise the dpi in 5 % steps while the budget shrinks with it.
    while area < area_budget and dpi < max_dpi:
        dpi /= 0.95
        area_budget *= 0.95

    dpi = min(dpi, max_dpi)

    # Large figures: lower the dpi in 5 % steps until the figure fits the budget.
    while area > area_budget:
        dpi *= 0.95
        area_budget /= 0.95

    return [width, height], dpi


def visualize_matrix(
    config: Config,
    matrix: pd.DataFrame,
    name: str = "some_matrix",
    format=".png",
    path=None,
    instance_label_dict: dict[str, str] = None,
) -> None:
    """Render ``matrix`` as an annotated heat map and save it to disk.

    Rows and columns whose sum is 0 are left out. The figure is written once
    per requested format and additionally as ``<name>.pdf``.

    Args:
        config: Passed to ``get_figsize``; also provides the output folder
            when ``path`` is falsy.
        matrix: Numeric frame to plot.
        name: Plot title and file name (without extension).
        format: File extension or list of extensions, with or without leading dot.
        path: Target folder; falsy selects the configured visualization path.
        instance_label_dict: Optional mapping from row/column label to display label.
    """
    if not path:
        path = config.get_output_path(path, visualization=True)

    # Drop rows and columns that sum to zero.
    matrix = matrix.loc[(matrix.sum(axis=1) != 0), (matrix.sum(axis=0) != 0)]

    rows = list(matrix.index)
    columns = list(matrix.columns)

    # Rename labels to their display labels.
    if instance_label_dict:
        rows = [instance_label_dict.get(row, row) for row in rows]
        columns = [instance_label_dict.get(col, col) for col in columns]

    values = matrix.values
    figsize, dpi = get_figsize(config, column_len=len(columns), row_len=len(rows))
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)

    ax.matshow(values, cmap="viridis")

    ax.set_xticks(range(len(columns)))
    ax.set_xticklabels(columns, fontsize=10, rotation=90)
    ax.set_yticks(range(len(rows)))
    ax.set_yticklabels(rows, fontsize=10)

    # Cell values are currently always drawn, regardless of the config flag.
    force_cell_values = True

    if getattr(config, "visualize_cell_values", False) or force_cell_values:
        for i, row in enumerate(values):
            for j, value in enumerate(row):
                if value == 0:
                    continue

                # Make sure the text is at most 3 digits and a dot.
                decimals = 2
                if value > 99:
                    decimals = 0
                elif value > 9:
                    decimals = 1
                cell_text = round(value, decimals)
                if decimals == 0:
                    cell_text = int(cell_text)
                ax.text(
                    j, i, cell_text, ha="center", va="center", color="white", fontsize=4
                )

    try:
        fig.tight_layout()
    except Exception as e:
        print(f"Error during tight_layout: {e}")

    ax.set_title(name)

    # Build the PDF name from the base name instead of cutting a fixed number of
    # characters, which only worked for four-character extensions such as ".png".
    base = os.path.join(path, name)
    formats = format if isinstance(format, list) else [format]
    targets = []
    for extension in formats:
        if not extension.startswith("."):
            extension = "." + extension
        targets.append(base + extension)
    targets.append(base + ".pdf")

    # dict.fromkeys removes duplicates while keeping the order.
    for target in dict.fromkeys(targets):
        fig.savefig(target)


def visualize_matrix_graph(
    config: Config,
    matrix,
    instance_types_dicts,
    instances: list[str] = None,
    name="some_matrix_graph",
    path=None,
    node_size_mode="sqrt",
    raise_mode="prune",
    instance_label_dict: dict[str, str] = None,
):
    """Draw an instances x instances matrix as a weighted graph and save it.

    The diagonal is cleared, the matrix is scaled to a maximum of 1, small
    weights are pruned or raised, nodes without any remaining connection are
    dropped, and the graph is laid out with a spring layout. The figure is
    written as ``.png``, ``.svg`` and ``.pdf`` and then shown.

    Nothing is drawn when there are no instances, no positive weights, or no
    connected node.

    Args:
        config: Provides ``proximity_seed``, ``proximity_k_spring``,
            ``proximity_min_value`` and, when ``path`` is falsy, the output folder.
        matrix: Square DataFrame or array; for a DataFrame the column labels
            are used as instances.
        instance_types_dicts: Mapping of type name to a container of instance
            labels, used to colour the nodes.
        instances: Node labels when ``matrix`` is not a DataFrame. The list is
            not modified.
        name: File name without extension.
        path: Target folder; falsy selects the configured visualization path.
        node_size_mode: "log", "sqrt" or "linear" scaling of the column sums;
            any other value behaves like "linear".
        raise_mode: "prune" removes weights below the minimum, "sqrt" takes
            square roots until all non-zero weights reach it.
        instance_label_dict: Optional mapping from label to display label.

    Raises:
        ValueError: If ``raise_mode`` is unknown.
    """
    if isinstance(matrix, pd.DataFrame):
        instances = list(matrix.columns)
        matrix = matrix.values
    elif instances:
        # Work on a copy so the caller's list is not shortened below.
        instances = list(instances)
    if not instances:
        return

    if not path:
        path = config.get_output_path(path, visualization=True)

    SEED = config.proximity_seed or 17
    K_SPRING = config.proximity_k_spring or 18
    MIN_VALUE = config.proximity_min_value or 0.01

    matrix = matrix.copy()

    np.fill_diagonal(matrix, 0)

    # Normalize the matrix; without any positive weight there is nothing to draw.
    peak = matrix.max()
    if not peak > 0:
        print(f"No connections to draw for {name}")
        return
    matrix = matrix / peak

    # Make sure the matrix is not completely stretched out.
    if matrix.min() < MIN_VALUE:
        if raise_mode == "prune":
            # Remove every value that is below MIN_VALUE.
            matrix = np.where(matrix < MIN_VALUE, 0, matrix)

        elif raise_mode == "sqrt":
            while np.min(matrix[np.nonzero(matrix)]) < MIN_VALUE:
                matrix = np.sqrt(matrix)

        else:
            raise ValueError("Unknown raise mode")

    # Node size from the column sums:
    # "linear" - take the sum as is
    # "sqrt" - sqrt(sum)
    # "log" - log(sum + 1)
    column_sums = [matrix[:, i].sum() for i in range(len(instances))]
    if node_size_mode == "log":
        nodesize_map = [np.log(total + 1) for total in column_sums]
    elif node_size_mode == "sqrt":
        nodesize_map = [np.sqrt(total) for total in column_sums]
    else:
        nodesize_map = column_sums

    # Drop all nodes that have no connections.
    keep = [i for i, size in enumerate(nodesize_map) if size != 0]
    if not keep:
        print(f"No connections to draw for {name}")
        return
    matrix = matrix[np.ix_(keep, keep)]
    nodesize_map = [nodesize_map[i] for i in keep]
    instances = [instances[i] for i in keep]

    nodesize_map = np.array(nodesize_map) / max(nodesize_map) * 1000

    # Figure size grows with the number of nodes, in a 16:9 ratio.
    scale = len(instances) * 0.15
    x = scale / 10 * 16
    y = scale / 10 * 9
    fig = plt.figure(figsize=(x, y))

    G = nx.from_numpy_array(matrix)

    # Seeded for reproducibility.
    pos = nx.spring_layout(
        G, seed=SEED, k=K_SPRING / math.sqrt(G.order())
    )

    color = {
        "process": "#1f77b4",  # muted blue
        "software": "#ff7f0e",  # safety orange
        "data item": "#2ca02c",  # cooked asparagus green
        "data model": "#d62728",  # brick red
        "data format specification": "#9467bd",  # muted purple
    }

    # First matching type decides the colour; unknown instances and types are grey.
    color_map = []
    for instance in instances:
        node_color = "grey"
        for instance_type in instance_types_dicts:
            if instance in instance_types_dicts[instance_type]:
                node_color = color.get(instance_type, "grey")
                break
        color_map.append(node_color)

    options = {
        "edge_color": "grey",
        "linewidths": 0.5,
        "width": 0.5,
        "with_labels": True,
        "labels": {i: label for i, label in enumerate(instances)},
        "node_color": color_map,
        "node_size": nodesize_map,
    }

    if instance_label_dict:
        options['labels'] = {i: instance_label_dict.get(label, label) for i, label in enumerate(instances)}

    nx.draw(G, pos, **options, ax=fig.add_subplot(111))

    # Make the graph more spacious.
    fig.subplots_adjust(bottom=0.1, top=0.9, left=0.1, right=0.9)

    # One legend patch per colour.
    patches = [mpatches.Patch(color=color[key], label=key) for key in color]
    plt.legend(handles=patches, loc="upper right", fontsize="x-large")

    # Save before showing, so the files never depend on what the backend does on show().
    filepath = os.path.join(path, name)
    fig.savefig(filepath + ".png")
    fig.savefig(filepath + ".svg")
    fig.savefig(filepath + ".pdf")

    plt.show()

    # nx.get_edge_attributes(G, 'weight')


def process_matrix(
    config: Config,
    matrix: pd.DataFrame,
    instances_by_class=None,
    name="some_matrix",
    path=None,
    instances=None,
    mode=None,
    instance_label_dict: dict[str, str] = None,
):
    """Export ``matrix`` as CSV and, if enabled, as heat map and graph.

    The CSV is always written. With ``config.visualize`` set, the heat map is
    drawn into the visualization path; the graph is drawn in addition when
    ``instances_by_class`` is truthy. Errors in either drawing step are
    printed and do not abort the export.

    Args:
        config: Provides output paths, ``visualize`` and ``proximity_mode``.
        matrix: Frame to export.
        instances_by_class: Mapping of class to instances for the graph colours.
        name: Base name of all written files.
        path: Output folder; falsy selects the configured output path.
        instances: Not used.
        mode: When truthy, the graph's node size mode is taken from
            ``config.proximity_mode``; otherwise the graph default applies.
        instance_label_dict: Optional mapping from label to display label.
    """
    if not path:
        path = config.get_output_path(path)
    matrix_to_csv(config, matrix, name, path)

    if not config.visualize:
        return

    path = config.get_output_path(path, visualization=True)
    try:
        visualize_matrix(
            config, matrix, name, path=path, instance_label_dict=instance_label_dict
        )
    except Exception as e:
        print(f"Error during visualization: {e}")

    if not instances_by_class:
        return

    try:
        graph_options = {
            "name": name + "_graph",
            "path": path,
            "instance_label_dict": instance_label_dict,
        }
        if mode:
            graph_options["node_size_mode"] = config.proximity_mode
        visualize_matrix_graph(config, matrix, instances_by_class, **graph_options)
    except Exception as e:
        print(f"Error during graph visualization: {e}")

In [45]:
config.visualize = True

In [None]:
# from bnw_tools.SLR.export import process_dict, process_matrix

process_matrix(
    config,
    occurrence_matrix,
    name="paper_instance_occurrence_matrix",
    instance_label_dict=instance_label_dict,
)

In [None]:
process_matrix(
    config,
    error_matrix,
    name="error_matrix",
    instance_label_dict=instance_label_dict,
)

### Instance x Instance

In [None]:
process_matrix(
    config,
    instance_instance_co_occurrence_matrix,
    instances_by_class=instances_by_class,
    name="instance_instance_co_occurrence_matrix",
    instance_label_dict=instance_label_dict,
)

In [None]:
process_matrix(
    config,
    instance_instance_relative_co_occurrence_matrix,
    instances_by_class=instances_by_class,
    name="instance_instance_relative_co_occurrence_matrix",
    instance_label_dict=instance_label_dict,
)

In [51]:
# process_matrix(
#     config,
#     instance_instance_proximity_matrix,
#     name="instance_instance_proximity_matrix",
#     instance_types_dicts=instance_types_dicts,
# )

In [52]:
# visualize timeline
import numpy as np
import matplotlib.pyplot as plt
import math


def _plot_timeline_page(
    years,
    yearly_papers,
    max_papers,
    type_name,
    type_instances_sorted,
    type_matrix,
    start_index,
    partition_size,
    filepath,
    error_matrix=None,
    error_instances=None,
    instance_label_dict=None,
):
    """Plot one page of the timeline for a single instance type and save it as PNG.

    Starting at ``type_instances_sorted[start_index]``, instances are drawn as
    lines over the per-year paper bars until ``partition_size`` lines have been
    drawn (an error overlay counts as an additional line) or the list is
    exhausted. A small inset below the main axes repeats the same instances
    relative to the number of papers per year.

    Returns:
        The index of the first instance that was NOT plotted on this page.
    """
    ALPHA_ERROR_LINE = 0.3
    ALPHA_ERROR_ZONE = 0.2
    ALPHA_PAPER_BAR = 0.3

    x_positions = range(len(years))

    def display_label(instance):
        # Fall back to the raw instance key when no label mapping is given.
        if instance_label_dict:
            return instance_label_dict.get(instance, instance)
        return instance

    def occurrences_plus_errors(instance, yearly_occurrences):
        # None means "no error overlay for this instance".
        if error_matrix is None or instance not in error_instances:
            return None
        # NOTE(review): error_matrix is indexed NumPy-style ([:, column]) with
        # columns ordered like error_instances — confirm this still holds now
        # that the occurrence matrix is a DataFrame.
        return yearly_occurrences + error_matrix[:, error_instances.index(instance)]

    size_x = 2 + len(years) / 6
    size_y = max(2 + max_papers / 15, 2 + partition_size / 2)
    fig, ax = plt.subplots(figsize=(size_x, size_y), dpi=300)

    ax.set_xticks(x_positions)
    # Only label years that actually contain papers.
    years_labels = [
        year if paper_count > 0 else ""
        for year, paper_count in zip(years, yearly_papers)
    ]
    ax.set_xticklabels(years_labels, fontsize=10, rotation=90)

    step_size = max(1, math.ceil(max_papers / 10))
    y_ticks = np.arange(0, max_papers + 1, step=step_size)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels([str(int(tick)) for tick in y_ticks], fontsize=10)
    ax.set_ylabel("absolute", fontsize=10)

    ax.bar(
        x_positions,
        yearly_papers,
        color="black",
        alpha=ALPHA_PAPER_BAR,
        label=f"Total papers ({sum(yearly_papers)})",
        zorder=0,
    )

    # (instance, yearly occurrences, occurrences + errors or None) per plotted
    # instance, reused for the relative inset so nothing is computed twice.
    page = []
    line_count = 0
    stop_index = start_index
    while line_count < partition_size and stop_index < len(type_instances_sorted):
        instance = type_instances_sorted[stop_index]
        yearly_occurrences = type_matrix[instance]
        i_total_occurrences = yearly_occurrences.sum()
        instance_label = display_label(instance)
        line = ax.plot(
            x_positions,
            yearly_occurrences,
            label=f"{instance_label} ({i_total_occurrences})",
            zorder=3,
        )[0]
        line_count += 1

        errors_plus = occurrences_plus_errors(instance, yearly_occurrences)
        if errors_plus is not None:
            color = line.get_color()
            line.set_label(
                f"{instance_label} ({i_total_occurrences}-{sum(errors_plus)})"
            )
            # Half-transparent line for occurrences + errors ...
            ax.plot(
                x_positions,
                errors_plus,
                color=color,
                alpha=ALPHA_ERROR_LINE,
                label=f"{instance_label} (w/o proximity)",
                zorder=2,
            )
            line_count += 1
            # ... and a shaded band between it and the normal line.
            ax.fill_between(
                x_positions,
                yearly_occurrences,
                errors_plus,
                color=color,
                alpha=ALPHA_ERROR_ZONE,
                zorder=1,
            )

        page.append((instance, yearly_occurrences, errors_plus))
        stop_index += 1

    ax.legend()
    ax.set_title(
        f"Number of papers covering {type_name} instances (#{start_index+1} to #{stop_index} of {len(type_instances_sorted)})"
    )

    # Inset for relative values. The canvas is drawn first so that the axes
    # limits and position queried below reflect the rendered main plot.
    fig.canvas.draw()
    x_lim = ax.get_xlim()
    bbox = ax.get_position()
    ax_inset = plt.axes(
        [bbox.x0, 0.05, bbox.width, 0.15],
        alpha=ALPHA_PAPER_BAR,
        facecolor="lightgrey",
    )

    for instance, yearly_occurrences, errors_plus in page:
        values_relative = [
            occurrences / paper_count if paper_count > 0 else 0
            for occurrences, paper_count in zip(yearly_occurrences, yearly_papers)
        ]
        instance_label = display_label(instance)
        line_relative = ax_inset.plot(
            x_positions,
            values_relative,
            label=f"{instance_label} (relative)",
            zorder=3,
        )[0]

        if errors_plus is None:
            continue

        color = line_relative.get_color()
        errors_relative = [
            error / paper_count if paper_count > 0 else 0
            for error, paper_count in zip(errors_plus, yearly_papers)
        ]
        if max(errors_relative) > 1:
            # More occurrences than papers in a year should never happen;
            # report it but keep plotting.
            print(f"Error: {instance_label} has a relative error > 1")

        ax_inset.plot(
            x_positions,
            errors_relative,
            alpha=ALPHA_ERROR_LINE,
            color=color,
            label=f"{instance_label} (error, relative)",
            zorder=2,
        )
        ax_inset.fill_between(
            x_positions,
            values_relative,
            errors_relative,
            alpha=ALPHA_ERROR_ZONE,
            color=color,
            zorder=1,
        )

    ax_inset.set_xlim(x_lim)
    ax_inset.set_xticks([])
    relative_ticks = np.arange(0, 1.1, step=0.5)
    ax_inset.set_yticks(relative_ticks)
    ax_inset.set_yticklabels([f"{int(x*100)}%" for x in relative_ticks], fontsize=8)
    ax_inset.set_ylabel("relative", fontsize=10)

    plt.subplots_adjust(bottom=0.3)

    # Zero-pad the page bounds so the files sort in page order.
    part_appendix = f"{str(start_index + 1).zfill(3)}_to_{str(stop_index).zfill(3)}"
    plt.savefig(f"{filepath}_{type_name.replace(' ', '_')}_{part_appendix}.png")
    plt.close()

    return stop_index


def visualize_timeline(
    config: "Config",
    year_instance_occurrence_matrix: "pd.DataFrame",
    year_papers,
    # instances,
    instance_types_dicts,
    name="some_timeline",
    path=None,
    recursion_depth=0,
    start_index=0,
    error_matrix=None,
    error_instances=None,
    instance_label_dict: "dict[str, str] | None" = None,
):
    """Plot per-year instance occurrences, one series of PNG pages per instance type.

    For every key in ``instance_types_dicts`` the matching columns of
    ``year_instance_occurrence_matrix`` are ranked by their column sum
    (descending, zero-sum columns dropped) and plotted in pages of up to 10
    lines on top of a bar chart of papers per year. Each page is written to
    ``<path>/<name>_<type>_<first>_to_<last>.png``.

    Args:
        config: Only used to resolve the output directory when ``path`` is empty
            (``config.get_output_path(path, visualization=True)``).
        year_instance_occurrence_matrix: DataFrame with one column per instance;
            rows are plotted against the keys of ``year_papers`` in order, so
            both presumably share the same year ordering — TODO confirm.
        year_papers: Mapping year -> collection of papers; its lengths give the
            bar heights and the denominators of the relative inset.
        instance_types_dicts: Mapping type name -> container of instance keys.
        name: Base file name of the generated images.
        path: Output directory; resolved through ``config`` when falsy.
        recursion_depth: Unused; kept so existing callers keep working (paging
            used to be implemented through recursion).
        start_index: Rank of the first instance to plot for the FIRST type;
            all following types start at 0.
        error_matrix: Optional error counts, indexed ``[:, column]``.
        error_instances: Sequence mapping ``error_matrix`` columns to instances.
        instance_label_dict: Optional instance key -> display label mapping.

    Returns:
        None. Images are written to disk as a side effect.
    """
    if not year_papers:
        # Without any year there is neither an x axis nor a paper count to
        # scale against (max() of an empty list would raise).
        print("visualize_timeline: no years to plot, skipping.")
        return

    if not path:
        path = config.get_output_path(path, visualization=True)

    years = list(year_papers.keys())
    instances = list(year_instance_occurrence_matrix.columns)
    yearly_papers = [len(year_papers[year]) for year in years]
    max_papers = max(yearly_papers)
    filepath = os.path.join(path, name)

    PARTITION_SIZE = 10

    for type_name in instance_types_dicts:
        type_members = instance_types_dicts[type_name]
        type_instances = [
            instance for instance in instances if instance in type_members
        ]
        total_occurrences = [
            year_instance_occurrence_matrix[instance].sum()
            for instance in type_instances
        ]
        # Stable sort: instances with equal totals keep their column order.
        ranked = sorted(
            zip(total_occurrences, type_instances),
            key=lambda pair: pair[0],
            reverse=True,
        )
        type_instances_sorted = [
            instance for occurrence, instance in ranked if occurrence != 0
        ]

        column_indices = [
            year_instance_occurrence_matrix.columns.get_loc(instance)
            for instance in type_instances_sorted
        ]
        type_matrix = year_instance_occurrence_matrix.iloc[:, column_indices]

        # Emit pages until every ranked instance has been plotted. At least one
        # page is always written, even when the type has no instances.
        page_start = start_index
        while True:
            page_stop = _plot_timeline_page(
                years,
                yearly_papers,
                max_papers,
                type_name,
                type_instances_sorted,
                type_matrix,
                page_start,
                PARTITION_SIZE,
                filepath,
                error_matrix=error_matrix,
                error_instances=error_instances,
                instance_label_dict=instance_label_dict,
            )
            if page_stop >= len(type_instances_sorted):
                break
            page_start = page_stop

        # A caller-supplied start_index only applies to the first type.
        start_index = 0


# Render the timelines only when visualization is enabled. instances_by_class
# is passed as the type -> instances grouping; the yearly error overlay is
# currently switched off (see the commented-out lines below).
if config.visualize:
    # yearly_error_matrix, year_error_papers = create_year_paper_occurrence_matrix(
        # papers_metadata, error_matrix, error_papers, is_error_matrix=True
    # )
    visualize_timeline(
        config,
        year_instance_occurrence_matrix,
        year_papers,
        instances_by_class,
        # instance_types_dicts,
        name="year_instance_occurrence_matrix",
        # error_matrix=yearly_error_matrix,
        # error_instances=error_instances,
        instance_label_dict=instance_label_dict,
    )

# Approach

## Pre-Processing
Using Completion Rating in %

### 80 %: Full Text extraction
* lacking noise removal (Headings, page numbers, ...)
* lacking line-break mending

### 100 %: Bag of Words
* The problem with BoW is that the words are looked at separately, so the correlation between them is not really clear.


### 99 %: TF-IDF
* tf-idf only on terms

### ? %: Part Of Speech (POS) Tagging, Named Entity Recognition (NER) 
* ready, but not used currently

## Visualize

### 85 % Matrix
* CSV and Dataframe dumps work fine
* Visualizations as PNG or SVG are extremely large.
  * DPI regulation works to somewhat keep this in check, but images still reach 20 MB
* An interactive matrix would be preferred.
  * If you hover over a cell, it shows you the x and y labels and its value.

### 100 % Timeline
* arrange the papers on a timeline and identify the flow of:
  * Processes
  * File formats
  * software
  * ...
* Additional ideas:
  * Compare this to Google Trends

### GraphDB
* Visualize

## Future Work
Using Difficulty ranked (DR) solutions:

### Step 0: Look it up

#### Wikidata linking & more
* https://openrefine.org/

#### More visualization
* https://github.com/JasonKessler/scattertext 
* https://pypi.org/project/yellowbrick/

#### NLP Pipelines:
https://spacy.io/usage/processing-pipelines


#### BLAST: Basic Local Alignment Search Tool
  * starting point: https://academic.oup.com/bioinformatics/article/39/12/btad716/7450067

#### AMIE 3
  * https://luisgalarraga.de/docs/amie3.pdf
  * https://github.com/dig-team/amie

### Step 1: Low hanging fruits

#### 1/5 DR: multi-word detection (n-gram)
Tools:  nltk, spaCy, etc.

### Step 2: Not-too-tricky follow-up

#### 3/5 DR: Acronym Expansion
Tools: spaCy - https://spacy.io/universe/project/neuralcoref

#### 3/5 DR: CoReference resolution
Tools: spaCy - https://spacy.io/universe/project/neuralcoref or https://huggingface.co/coref/ (you can use the model out of the box)

### Step 3: Vector-magic

#### 2-4/5 DR: Word embedding
* Find out that jpeg and png are similar

(depending on your needs) - Tools: gensim - https://www.analyticsvidhya.com/blog/2023/07/step-by-step-guide-to-word2vec-with-gensim/

#### 3/5 DR: document embedding
Tools: gensim - https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

I would also check graph embeddings, sentence embeddings, and recently there is LLM2Vec

### Step 3.1: Reaping the vector-rewards

#### 1/5 DR: clustering
Tools: sklearn

Requirements: Need to have data as numbers first. This is quite possible after generating embeddings

### Step 9: Won't be happening in this paper
* Paper classes
* Subclasses of paper classes
* model which process is a subprocess of another process