## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random


In [2]:
# packages used by helper functions
import uuid

In [3]:
# packages for prompting definitions
import sys
sys.path.append("..")
import json

In [4]:
from langchain_community.llms import Ollama

In [5]:
import ollama
from ollama import Client
client = Client(host='http://192.168.178.39:11434')
modelo="gemma:latest"

## Prompt definitions (function to extract triplets in JSON-format for knowledge graph from text chunks)

In [6]:
#################################
# Definition of used LLM
#################################
##########################################################################
def graphPrompt(input: str, metadata={}, model=modelo):
    if model == None:
        model = modelo
    
    chunk_id = metadata.get('chunk_id', None)

    # model_info = client.show(model_name=model)
    # print( chalk.blue(model_info))

    SYS_PROMPT = ("You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include Anforderung, requirement (AFO), rfc, organization, date, duration, \n"
            "\tcondition, concept, object, entity  etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them like the follwing. NEVER change the value of the chunk_ID as defined in this prompt: \n"
        "[\n"
        "   {\n"
        '       "chunk_id": "CHUNK_ID_GOES_HERE",\n'
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n' 
        "   }, {...}\n"
        "]"
    )
    SYS_PROMPT = SYS_PROMPT.replace('CHUNK_ID_GOES_HERE', chunk_id)

    USER_PROMPT = f"context: ```{input}``` \n\n output: "

    response = client.generate(model=modelo, system=SYS_PROMPT, prompt=USER_PROMPT)

    aux1 = response['response']
    # Find the index of the first open bracket '['
    start_index = aux1.find('[')
    # Slice the string from start_index to extract the JSON part and fix an unexpected problem with insertes escapes (WHY ?)
    json_string = aux1[start_index:]
    json_string = json_string.replace('\\\\\_', '_')
    json_string = json_string.replace('\\\\_', '_')
    json_string = json_string.replace('\\\_', '_')
    json_string = json_string.replace('\\_', '_')
    json_string = json_string.replace('\_', '_')
    json_string.lstrip() # eliminate eventual leading blank spaces
#####################################################
    print("json-string:\n" + json_string)
#####################################################         
    try:
        result = json.loads(json_string)
        result = [dict(item) for item in result]
    except:
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    print("§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§")

    return result

## Functions

In [7]:
def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df

In [8]:
def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.apply(
        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list

def graph2Df(nodes_list) -> pd.DataFrame:
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())

    return graph_dataframe

In [9]:
def graph2Df(nodes_list) -> pd.DataFrame:
    # Erstellen des DataFrames aus der Knotenliste
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    # Entfernen aller Zeilen, in denen 'node_1' oder 'node_2' NaN ist
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    
    # Sicherstellen, dass 'node_1' und 'node_2' als Strings behandelt werden,
    # um den TypeError abzufangen
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: str(x).lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: str(x).lower())

    return graph_dataframe

def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2

In [10]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    # Umwandlung des DataFrames
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    
    # Self-Join mit 'chunk_id' als Schlüssel
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    
    # Entfernung von Schleifen
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    
    # Stellen Sie sicher, dass 'chunk_id' als String behandelt wird
    dfg2["chunk_id"] = dfg2["chunk_id"].astype(str)
    
    # Gruppierung und Zählung von Kanten
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    
    # Entfernen von Kanten mit nur einem Vorkommen
    dfg2 = dfg2[dfg2["count"] > 1]
    dfg2["edge"] = "contextual proximity"
    
    return dfg2

## Variables

In [11]:
import pdfplumber
## Input data directory
##########################################################
input_file_name = "gemSpec_VZD_V1.14.0.pdf"
##########################################################
data_dir = "./HotG_Data/"+input_file_name
inputdirectory = Path(f"./{data_dir+'.txt'}")

with pdfplumber.open(data_dir) as pdf:
    # Leeren String zum Speichern des Texts
    text = ''
    
    # Über alle Seiten iterieren
    for page in pdf.pages:
        # Text der aktuellen Seite extrahieren und an den Gesamttext anhängen
        text += page.extract_text()

# Text in eine TXT-Datei schreiben
with open(data_dir+'.txt', 'w', encoding='utf-8') as txt_file:
    txt_file.write(text)

## This is where the output csv files will be written
outputdirectory = Path(f"./data_output")

print(inputdirectory)
print(input_file_name)

HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt
gemSpec_VZD_V1.14.0.pdf


In [12]:
input_file_name=input_file_name+'.txt'
output_graph_file_name = f"graph_{input_file_name[:-4]}.csv"
output_graph_file_with_path = outputdirectory/output_graph_file_name

output_chunks_file_name = f"chunks_{input_file_name[:-4]}.csv"
output_chunks_file_with_path = outputdirectory/output_chunks_file_name

output_context_prox_file_name = f"graph_contex_prox_{input_file_name[:-4]}.csv"
output_context_prox_file_with_path = outputdirectory/output_context_prox_file_name

print(output_graph_file_with_path)
print(output_chunks_file_with_path)
print(output_context_prox_file_with_path)

data_output/graph_gemSpec_VZD_V1.14.0.pdf.csv
data_output/chunks_gemSpec_VZD_V1.14.0.pdf.csv
data_output/graph_contex_prox_gemSpec_VZD_V1.14.0.pdf.csv


## Load Documents

In [13]:
#loader = TextLoader("./HotG_Data/Hanse.txt")
loader = TextLoader(inputdirectory)
Document = loader.load()
# clean unnecessary line breaks
Document[0].page_content = Document[0].page_content.replace("\n", " ")

In [14]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(Document)
print(Document[0])
Document2 = ''.join(str(item) for item in Document[0])
print(Document2)
from mattsollamatools import chunk_text_by_sentences
#pages = chunk_text_by_sentences(source_text=Document2, sentences_per_chunk=7, overlap=0 )

print("Number of chunks = ", len(pages))
print(pages[1].page_content)

page_content='Elektronische Gesundheitskarte und Telematikinfrastruktur Spezifikation Verzeichnisdienst Version: 1.14.0 Revision: 434138 Stand: 31.01.2022 Status: freigegeben Klassifizierung: öffentlich Referenzierung: gemSpec_VZD gemSpec_VZD_V1.docx Spezifikation Seite 1 von 72 Version: 1.14.0 © gematik – öffentlich Stand: 31.01.2022Spezifikation Verzeichnisdienst Dokumentinformationen Änderungen zur Vorversion Anpassungen des vorliegenden Dokumentes im Vergleich zur Vorversion können Sie der nachfolgenden Tabelle entnehmen. Bitte beachten Sie die Hinweise zur Einführung der Benennungen \'WANDA Basic\' und \'WANDA Smart\' (siehe Dokumentenhistorie). Dokumentenhistorie Version Stand Kap./ Grund der Änderung, besondere Hinweise Bearbeitung Seite 1.2.0 17.07.15 Nutzer der Schnittstelle gematik I_Directory_Maintenance geändert 1.3.0 24.08.16 Anpassungen zum Online-Produktivbetrieb gematik (Stufe 1) 1.4.0 28.10.16 Einarbeitung lt. Änderungsliste gematik 1.5.0 19.04.17 Anpassung nach Änderu

## Create a dataframe of all the chunks

In [15]:
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(308, 3)


Unnamed: 0,text,source,chunk_id
0,Elektronische Gesundheitskarte und Telematikin...,HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt,1b42cf2e35834b15be1b92a83f65586e
1,zur Vorversion können Sie der nachfolgenden Ta...,HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt,982090fa14804745a97e241d19d33a4f
2,Einarbeitung lt. Änderungsliste gematik 1.5.0 ...,HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt,187f305ecd93445689fccaae2ef93846
3,aus Systemdesign R4.0.0 1.11.0 12.11.20 Anpass...,HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt,17652a6a484d4c51a036e59dceadaea7
4,KIM 1.5.1 gemSpec_VZD_V1.docx Spezifikation Se...,HotG_Data/gemSpec_VZD_V1.14.0.pdf.txt,178f90cc581c492cae72eac890c1e661


## Extract Concepts

In [16]:
## To regenerate the graph with LLM, set this to True
##################
regenerate = True  # toggle to True if the time-consuming (re-)generation of the knowlege extraction is required
##################
if regenerate:
#########################################################    
    concepts_list = df2Graph(df, model=modelo)
#########################################################
    dfg1 = graph2Df(concepts_list)
    
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(output_graph_file_with_path, sep=";", index=False)
    df.to_csv(output_chunks_file_with_path, sep=";", index=False)
else:
    dfg1 = pd.read_csv(output_graph_file_with_path, sep=";")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

json-string:
[
   {
       "chunk_id": "1b42cf2e35834b15be1b92a83f65586e",
       "node_1": "Elektronische Gesundheitskarte",
       "node_2": "Telematikinfrastruktur",
       "edge": "Verzeichnisdienst Version: 1.14.0"
   },
   {
       "chunk_id": "1b42cf2e35834b15be1b92a83f65586e",
       "node_1": "Verzeichnisdienst",
       "node_2": "Version: 1.14.0",
       "edge": "Revision: 434138 Stand: 31.01.2022"
   },
   {
       "chunk_id": "1b42cf2e35834b15be1b92a83f65586e",
       "node_1": "Version: 1.14.0",
       "node_2": "Stand: 31.01.2022",
       "edge": "Status: freigegeben Klassifizierung: öffentlich"
   },
   {
       "chunk_id": "1b42cf2e35834b15be1b92a83f65586e",
       "node_1": "Klassifizierung: öffentlich",
       "node_2": "Referenzierung: gemSpec_VZD gemSpec_VZD_V1.docx",
       "edge": "Spezifikation Seite 1 von 72"
   }
]
```

**Explanation:**

* The extracted terms from the context are "Elektronische Gesundheitskarte", "Telematikinfrastruktur", "Verzeichnisdienst", "

ValueError: need at least one array to concatenate

## Calculating contextual proximity

In [None]:
print(dfg1)
dfg2 = contextual_proximity(dfg1)

dfg2.to_csv(output_context_prox_file_with_path, sep=";", index=False)
dfg2.tail()


NameError: name 'dfg1' is not defined

### Merge both the dataframes

dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

In [None]:

# Zusammenführen von dfg1 und dfg2
dfg = pd.concat([dfg1, dfg2], axis=0)

# Konvertieren von 'chunk_id' und 'edge' zu Strings, um den TypeError zu vermeiden
dfg['chunk_id'] = dfg['chunk_id'].astype(str)
dfg['edge'] = dfg['edge'].astype(str)

# Gruppierung und Aggregation
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)

dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,['a concept from extracted ontology'],['implementing'],"9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,9fci...",contextual proximity,2
1,['a concept from extracted ontology'],['related concept from extracted ontology'],9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,"relationship between the two concepts, node_1 ...",4
2,['a_sov'],['fachdaten'],context,one-to-one,4
3,"['anforderung', 'requirement']","['http statuscode', 'tls']",1ffdc7d8cb0b8e4a5,"['related to', 'in one or two sentences']",4
4,['betriebsstätte geburtshilfe'],"['betriebsstätte gesundheits-, kranken- und al...","fe8dfcbf7e6b53c,fe8dfcbf7e6b53c,fe8dfcbf7e6b53...",contextual proximity,4
...,...,...,...,...,...
124,vzd,tabelle3,"b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,b1bd5...",contextual proximity,3
125,vzd,tabelle6: tab_tuc_vzd_february2,"b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,b1bd5...",contextual proximity,3
126,vzd,tabelle7: tab_tuc_vzd_gravity3,"b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,b1bd5...",contextual proximity,6
127,zahnärztekammer,kassenärztliche bundesvereinigung,cc,Separate entities but related by profession,4


## Calculate the NetworkX Graph

In [None]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(78,)

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [None]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  26
[["['a concept from extracted ontology']", "['implementing']", "['muss']", "['related concept from extracted ontology']", "['schnittstelle']", "['testunterstützung']", "['vzd']"], ["['a_sov']", "['fachdaten']"], ["['add_vzd_features', 'add_external_id_extensions']", "['umsetzung soap', 'tip1-a_urfz']"], ["['add_vzd_features', 'tip1-a_urfz']", "['umsetzung add_directory_fa-attributes (soap)', 'add_external_id_extensions (soap)']"], ["['anforderung', 'requirement']", "['http statuscode', 'tls']"], ["['aufgebaut', 'built-up or established']", "['id.fd.tls-c', 'identifier of fd.tls-c protocol version']"], ["['betriebsstätte geburtshilfe']", "['betriebsstätte gesundheits-, kranken- und altenpflege']"], ["['betriebsstätte mobile einrichtung rettungsdienst']", "['bundeswehrapotheke']"], ["['case tuc_vzd_urgent4']", "['directorymaintenance.wsdl']", "['directorymaintenance.xsd']", "['dokumente']", "['modify_directory_entry']", "['webservice']"], ["['empfangen', 'empf

### Create a dataframe for community colors

In [None]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,['a concept from extracted ontology'],#63db57,1
1,['implementing'],#63db57,1
2,['muss'],#63db57,1
3,['related concept from extracted ontology'],#63db57,1
4,['schnittstelle'],#63db57,1
...,...,...,...
73,federal agency for technical education and inf...,#db578c,24
74,föhlerhaft ausgeführt,#a0db57,25
75,mouseout,#a0db57,25
76,interface i_directory_query,#dbd957,26


### Add colors to the graph

In [None]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [None]:
from pyvis.network import Network

#graph_output_directory = "./docs/index.html"

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="800px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)

# net.show(graph_output_directory)
net.show_buttons(filter_=['physics'])
net.show("knowledge_graph.html")

knowledge_graph.html


In [None]:
# DETAILED STEPS OF TERM PROXIMITY CALCULATION (same as function, only step by step to better understand the process)

In [None]:
## Melt the dataframe into a list of nodes
dfg_long = pd.melt(
    dfg1, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
)

In [None]:
dfg_long.tail(5)

Unnamed: 0,chunk_id,variable,node
99,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,node_2,tabelle6: tab_tuc_vzd_february2
100,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,node_2,tab_tuc_vzd_february2
101,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,node_2,tab_tuc_vzd_february2
102,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,node_2,tab_tuc_vzd_february2
103,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,node_2,tab_tuc_vzd_february2


In [None]:
dfg_long.drop(columns=["variable"], inplace=True)
# Self join with chunk id as the key will create a link between terms occuring in the same text chunk.

In [None]:
dfg_long.tail(5)

Unnamed: 0,chunk_id,node
99,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,tabelle6: tab_tuc_vzd_february2
100,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,tab_tuc_vzd_february2
101,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,tab_tuc_vzd_february2
102,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,tab_tuc_vzd_february2
103,b1bd5a9cc9de1bfunkcrimpierrefe5d9cpubnre,tab_tuc_vzd_february2


In [None]:
dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

In [None]:
dfg_wide.head()

Unnamed: 0,chunk_id,node_1,node_2
0,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,an entry
1,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,valid certificates
2,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,deleted
3,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,deleted
4,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,valid certificates,an entry


In [None]:
# drop self loops
self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
dfgraph2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)

In [None]:
dfgraph2.head()

Unnamed: 0,chunk_id,node_1,node_2
0,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,valid certificates
1,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,deleted
2,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,an entry,deleted
3,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,valid certificates,an entry
4,6fe1f54e-7d8b-4dc9-b2c-e03d80a28687,valid certificates,deleted


## Group and count edges.
dfgraph2 = (
    dfgraph2.groupby(["node_1", "node_2"])
    .agg({"chunk_id": [",".join, "count"]})
    .reset_index()
)

In [None]:
dfgraph2["chunk_id"] = dfgraph2["chunk_id"].astype(str)

# Gruppierung und Aggregation unter der Annahme, dass alle 'chunk_id' Werte nun Strings sind
dfgraph2 = (
    dfgraph2.groupby(["node_1", "node_2"])
    .agg(chunk_ids=("chunk_id", ",".join), count=("chunk_id", "count"))
    .reset_index()
)

# Umbenennen der Spalten für Klarheit (optional, basierend auf Ihrer Präferenz)
dfgraph2.columns = ["node_1", "node_2", "chunk_id", "count"]

In [None]:
dfgraph2.head()

Unnamed: 0,node_1,node_2,chunk_id,count
0,['a concept from extracted ontology'],['implementing'],"9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,9fci...",2
1,['a concept from extracted ontology'],['muss'],9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,1
2,['a concept from extracted ontology'],['related concept from extracted ontology'],9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,1
3,['a concept from extracted ontology'],['schnittstelle'],9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,1
4,['a concept from extracted ontology'],['testunterstützung'],9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,1


In [None]:
dfgraph2.columns = ["node_1", "node_2", "chunk_id", "count"]
dfgraph2.replace("", np.nan, inplace=True)
dfgraph2.dropna(subset=["node_1", "node_2"], inplace=True)
# Drop edges with 1 count
dfgraph2 = dfg2[dfg2["count"] != 1]
dfgraph2["edge"] = "contextual proximity"

In [None]:
dfgraph2.head()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
0,['a concept from extracted ontology'],['implementing'],"9fcisr6bafnrosicmbnbdfd7a6fleshrodfrudder,9fci...",2,contextual proximity
13,['betriebsstätte geburtshilfe'],"['betriebsstätte gesundheits-, kranken- und al...","fe8dfcbf7e6b53c,fe8dfcbf7e6b53c,fe8dfcbf7e6b53...",4,contextual proximity
14,"['betriebsstätte gesundheits-, kranken- und al...",['betriebsstätte geburtshilfe'],"fe8dfcbf7e6b53c,fe8dfcbf7e6b53c,fe8dfcbf7e6b53...",4,contextual proximity
18,['case tuc_vzd_urgent4'],['dokumente'],"7a19d6f0-d58-4ea2-b73e-e8dfae1bda,7a19d6f0-d58...",2,contextual proximity
19,['case tuc_vzd_urgent4'],['modify_directory_entry'],"7a19d6f0-d58-4ea2-b73e-e8dfae1bda,7a19d6f0-d58...",3,contextual proximity


In [None]:
dfgraph2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 94 entries, 0 to 291
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   node_1    94 non-null     object
 1   node_2    94 non-null     object
 2   chunk_id  94 non-null     object
 3   count     94 non-null     int64 
 4   edge      94 non-null     object
dtypes: int64(1), object(4)
memory usage: 4.4+ KB
