In  this notebook , we will develop and optimize llm term extraction based pipelines

prompt used :

You are an helpful assistant helping building an ontology from technical documentation.

Extract the most meaningful keywords of the following text. Keep only keywords that could be concepts and not relations. Write them as a python list of string with double quotes.

Here is an example. Text: This python package is about ontology learning. I do not know a lot about this field. ["python package", "ontology learning", "field"]

Text: {corpus}

In [162]:
import spacy
from typing import Set, List
import pandas as pd
from olaf import Pipeline
from olaf.commons.logging_config import logger
from olaf.data_container import CandidateTerm, Relation, Concept
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.pipeline.pipeline_component.term_extraction import (
    POSTermExtraction,
    TFIDFTermExtraction,
    ManualCandidateTermExtraction
    )
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    CTsToRelationExtraction,
    SynonymRelationExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringRelationExtraction,
    AgglomerativeClusteringConceptExtraction
)
from olaf.commons.spacy_processing_tools import is_not_punct, is_not_stopword, select_on_pos

from olaf.pipeline.pipeline_component.candidate_term_enrichment import SemanticBasedEnrichment

from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

In [163]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from matplotlib_venn import venn2, venn3

In [164]:
nlp = spacy.load("en_core_web_lg")

In [165]:
import torch, gc
def free_gpu():
    gc.collect()
    torch.cuda.empty_cache()

# Select and create relevent concepts from the corpus.


In [166]:
expected_concepts = []
with open("concepts_v2.txt", 'r') as f:
    lines = f.readlines()
    expected_concepts = [concept.rstrip("\n") for concept in lines]
    expected_concepts = [Concept(concept) for concept in expected_concepts]
    f.close()

print(expected_concepts)


[Punching, Welding line, Crescent Gap, Water spot, Oil spot, Silk spot, Inclusion, Rolled pit, Crease, Waist folding, metal surface defect, mechanical failure, drying, mechanical lubricant, temperature, pressure, work roll damage, tension roll damage, local yield, low-carbon, steel strip, folds, rough pockmarked surfaces, metal surface defect, deformation]


In [167]:
corpus_path = "GC10-DET_doc.txt"
corpus_loader = TextCorpusLoader(corpus_path)

In [168]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
sentence_transformer_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Testing concept ratio function

In [169]:


def hg_lm_similaritiry(embedding_a : str, embedding_b: str, threshold=.8):
    return util.pytorch_cos_sim(embedding_a, embedding_b) > threshold

def create_concepts_embedings(concepts: List[Concept], model=sentence_transformer_model) -> List[np.ndarray]:
    concept_labels = [concept.label for concept in concepts]
    concept_embedings = model.encode(concept_labels)
    return concept_embedings


def get_unexpected_concepts(concepts: List[Concept], expected_concepts : List[Concept]):
    unexpected_concepts = []
    concepts = list(concepts)
    concepts_embedings = create_concepts_embedings(concepts)
    expected_concepts_embeding = create_concepts_embedings(expected_concepts)
    for idc, concept_embeding in enumerate(concepts_embedings):
        if not any([hg_lm_similaritiry(concept_embeding, expected_concept_embeding) > 0.7 for expected_concept_embeding in expected_concepts_embeding]):
            unexpected_concepts.append(concepts[idc])
    return unexpected_concepts

def get_concept_ratio(pipeline : Pipeline, expected_concepts : List[Concept], comparator = hg_lm_similaritiry, comparator_args:dict={}, verbose=False) -> tuple:
    """
    Calculate the ratio of expected and unexpected concepts in a given pipeline.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline object containing concepts.
    expected_concepts : List[Concept]
        A list of expected concepts.kwargs

    Returns
    -------
    Tuple[float, float]: A tuple containing:
        The percentage of expected concepts found in the pipeline.
        The percentage of unexpected concepts in the pipeline.
    """
    
    
    found_concepts = list(pipeline.kr.concepts)
    if len(found_concepts) == 0:
        pipeline.run()
        found_concepts = list(pipeline.kr.concepts)
    
    if len(found_concepts) > 0:
        found_concepts_embedings = create_concepts_embedings(found_concepts)
        expected_concepts_embeding = create_concepts_embedings(expected_concepts)
        expected_concept_occ = 0
        for idx, expected_concept_embeding in enumerate(expected_concepts_embeding):
            if verbose:
                print()
                print(f"{expected_concepts[idx]} : ", end="")
            for idc, found_concept_embeding in enumerate(found_concepts_embedings):
                if comparator(expected_concept_embeding, found_concept_embeding, **comparator_args):
                    expected_concept_occ += 1
                    if verbose:
                        print(f"{found_concepts[idc]} ", end="")
                    break 

        recall = expected_concept_occ/len(expected_concepts)
        precision = expected_concept_occ/len(found_concepts)
        f1 = 2*(precision * recall)/(precision+recall)
        return (precision, recall, f1)
    else:
        return (0, 0, 0)
    
comparator_args = {"threshold":.7}

In [170]:
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_concepts import CTsToConceptExtraction

from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_relations import CTsToRelationExtraction


# concept extraction component
concepts = [
    "defect type", 
    "steel strip surface", 
    "punching", 
    "mechanical failure", 
    "production line", 
    "product specifications", 
    "punching defects", 
    "welding line", 
    "coil", 
    "weld line", 
    "crescent gap", 
    "cutting", 
    "water spot", 
    "drying", 
    "product", 
    "processes", 
    "oil spot", 
    "contamination", 
    "mechanical lubricant", 
    "appearance", 
    "silk spot", 
    "plaque", 
    "strip surface", 
    "density", 
    "temperature", 
    "roller", 
    "pressure", 
    "inclusion", 
    "metal surface", 
    "spots", 
    "fish scale shape", 
    "strip shape", 
    "block irregular distribution", 
    "rough pockmarked surfaces", 
    "plate", 
    "rolled pit", 
    "bulges", 
    "pits", 
    "steel plate", 
    "work roll", 
    "tension roll", 
    "damage", 
    "crease", 
    "fold", 
    "spacing", 
    "local yield", 
    "uncoiling process", 
    "waist folding", 
    "wrinkles", 
    "local deformation", 
    "low-carbon"
]


relations = [
    "described",
    "explaining",
    "appears",
    "leads",
    "resulting",
    "changed",
    "produced",
    "drying",
    "caused",
    "affect",
    "appearing",
    "lies",
    "distributed",
    "accompanied",
    "showing",
    "pressed",
    "occurred",
    "circumvented",
    "detected",
    "tracked",
    "results",
    "like",
    "mainly",
    "uncoiling"
]

ct_concept_label = { concept : {concept} for concept in concepts}

manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)


# Usefull function

In [207]:
def display_concept(kr: KnowledgeRepresentation) -> None:
    print("Concepts in KR:")
    for concept in kr.concepts:
        print(concept.label)


def display_relation(kr: KnowledgeRepresentation) -> None:
    print("Relations in KR:")
    for relation in kr.relations:
        if (
            relation.source_concept is not None
            or relation.destination_concept is not None
        ):
            print(
                (
                    relation.source_concept.label,
                    relation.label,
                    relation.destination_concept.label,
                )
            )

def display_metareslation(kr: KnowledgeRepresentation) -> None:
    print(f"{len(kr.metarelations)} MetaRelations in KR:")
    for metarelation in kr.metarelations:
        print(metarelation.source_concept,  metarelation, metarelation.destination_concept)

def describe_pipeline(pipeline: Pipeline) -> None:
    print(pipeline.__class__.__name__)
    for component in pipeline.pipeline_components:
        print(f"\t {component.__class__.__name__}")

# Creating pipelines

In [172]:
from olaf.pipeline.pipeline_component.term_extraction import (
    ManualCandidateTermExtraction,
    POSTermExtraction,
    TFIDFTermExtraction,
    CvalueTermExtraction
)

from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringConceptExtraction
)


In [173]:
concept_extraction_components = ["CandidatToConcept", "SynonymToConcept", "AgglomerativeClustering"]
term_extraction_components = ["LLM Term Extraction", ]
results = pd.DataFrame(
    index=concept_extraction_components,
    columns=term_extraction_components
    )

multi_index = pd.MultiIndex.from_product([
   concept_extraction_components, 
    ["Precision", "Rappel", "F1"]
    ])
pipelines_scores = pd.DataFrame(index=term_extraction_components, columns=multi_index)

pipelines_scores

Unnamed: 0_level_0,CandidatToConcept,CandidatToConcept,CandidatToConcept,SynonymToConcept,SynonymToConcept,SynonymToConcept,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,,,,,,,,,


## LLM Term  Extraction

In [174]:
llm_pipelines = []
llm_results = []

### LLM Term  Extraction and Candidat To Concept Extraction

In [175]:


ct_concept_label = {concept : {concept} for concept in concepts}


llm_pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            CTsToConceptExtraction(),
        ],
        corpus_loader=corpus_loader
    )
)
current_pipeline = llm_pipelines[-1]
# current_pipeline.run()


llm_results += list( 
    results:=get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)
    )
print(results)

(0.5, 1.0, 0.6666666666666666)


### LLM Term  Extraction and Synonym Concept Extraction

In [176]:

llm_pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            SemanticBasedEnrichment(
                threshold=.9
            ),
            SynonymConceptExtraction(),
        ],
            corpus_loader=corpus_loader
    )
)
current_pipeline = llm_pipelines[-1]


llm_results += list(
    results:=get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)
    )
print(results)

(0.5, 1.0, 0.6666666666666666)


### LLM Term  Extraction and Agglomerative Clustering Concept Extraction

In [177]:

llm_pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.3
            ),
        ],
        corpus_loader=corpus_loader
    )
    
)
current_pipeline = llm_pipelines[-1]



llm_results += list(
    results:=get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)
    )
print(results)



(0.5319148936170213, 1.0, 0.6944444444444444)


In [178]:
free_gpu()

## debugging LLMPipeline

In [179]:
agg_pipeline = llm_pipelines[-1]
get_concept_ratio(agg_pipeline, expected_concepts, comparator_args=comparator_args, verbose=True)


Punching : punching 
Welding line : weld line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : fold 
metal surface defect : metal surface 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : temperature 
pressure : pressure 
work roll damage : work roll 
tension roll damage : tension roll 
local yield : local yield 
low-carbon : low-carbon 
steel strip : steel strip surface 
folds : fold 
rough pockmarked surfaces : rough pockmarked surfaces 
metal surface defect : metal surface 
deformation : local deformation 

(0.5319148936170213, 1.0, 0.6944444444444444)

In [180]:
syn_pipeline = llm_pipelines[1]
get_concept_ratio(syn_pipeline, expected_concepts, comparator_args=comparator_args, verbose=True)


Punching : punching 
Welding line : welding line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : waist folding 
metal surface defect : metal surface 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : temperature 
pressure : pressure 
work roll damage : work roll 
tension roll damage : tension roll 
local yield : local yield 
low-carbon : low-carbon 
steel strip : steel strip surface 
folds : waist folding 
rough pockmarked surfaces : rough pockmarked surfaces 
metal surface defect : metal surface 
deformation : local deformation 

(0.5, 1.0, 0.6666666666666666)

In [181]:
for concept in agg_pipeline.kr.concepts:
    print(concept.label, concept.linguistic_realisations)

block irregular distribution {block irregular distribution}
fold {fold}
crescent gap {crescent gap}
local deformation {local deformation}
oil spot {oil spot}
uncoiling process {uncoiling process}
tension roll {tension roll}
work roll {work roll}
density {density}
pressure {pressure}
processes {processes}
low-carbon {low-carbon}
spots {spots}
punching defects {punching defects}
mechanical failure {mechanical failure}
mechanical lubricant {mechanical lubricant}
product specifications {product specifications}
water spot {water spot}
weld line {weld line, welding line}
drying {drying}
metal surface {metal surface}
spacing {spacing}
waist folding {waist folding}
punching {punching}
contamination {contamination}
temperature {temperature}
bulges {bulges}
strip shape {strip shape}
wrinkles {wrinkles}
product {product}
cutting {cutting}
steel strip surface {steel strip surface, strip surface}
plaque {plaque}
pits {pits}
crease {crease}
defect type {defect type}
local yield {local yield}
fish sc

In [182]:
ct_pipeline = llm_pipelines[0]
get_concept_ratio(ct_pipeline, expected_concepts, comparator_args=comparator_args, verbose=True)

for concept in agg_pipeline.kr.concepts:
    print(concept.label, concept.linguistic_realisations)


Punching : punching 
Welding line : weld line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : waist folding 
metal surface defect : metal surface 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : temperature 
pressure : pressure 
work roll damage : work roll 
tension roll damage : tension roll 
local yield : local yield 
low-carbon : low-carbon 
steel strip : steel strip surface 
folds : waist folding 
rough pockmarked surfaces : rough pockmarked surfaces 
metal surface defect : metal surface 
deformation : local deformation block irregular distribution {block irregular distribution}
fold {fold}
crescent gap {crescent gap}
local deformation {local deformation}
oil spot {oil spot}
uncoiling process {uncoiling process}
tension roll {tension roll}
work roll {work roll}
density {density}
pres

In [183]:
get_unexpected_concepts(agg_pipeline.kr.concepts, expected_concepts)

[block irregular distribution,
 local deformation,
 uncoiling process,
 work roll,
 density,
 processes,
 spots,
 punching defects,
 product specifications,
 metal surface,
 spacing,
 contamination,
 bulges,
 strip shape,
 wrinkles,
 product,
 cutting,
 plaque,
 pits,
 defect type,
 fish scale shape,
 production line,
 roller,
 damage,
 steel plate,
 appearance]

### Score des pipelines

In [184]:
pipelines_scores.loc[term_extraction_components[0]] = llm_results
pipelines_scores

Unnamed: 0_level_0,CandidatToConcept,CandidatToConcept,CandidatToConcept,SynonymToConcept,SynonymToConcept,SynonymToConcept,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,0.5,1.0,0.666667,0.5,1.0,0.666667,0.531915,1.0,0.694444


In [185]:
def create_bar_chart(index_name, data):
    precision = data.loc[index_name].xs('Precision', level=1)
    rappel = data.loc[index_name].xs('Rappel', level=1)
    f1 = data.loc[index_name].xs('F1', level=1)

    fig = go.Figure(data=[
        go.Bar(name='Précision', x=data.columns.levels[0], y=precision),
        go.Bar(name='Rappel', x=data.columns.levels[0], y=rappel),
        go.Bar(name='F1', x=data.columns.levels[0], y=f1)
    ])
    
    # Modifier la disposition du graphique
    fig.update_layout(
        title=f'Scores de Précision, Rappel et F1 pour {index_name}',
        xaxis_title='Composants',
        yaxis_title='Scores',
        barmode='group'
    )
    
    fig.show()

data_long = pipelines_scores.reset_index().melt(id_vars='index', var_name=['Composant', 'Métrique'], value_name='Score')
data_long.rename(columns={'index': 'Extraction'}, inplace=True)
data_long = data_long.dropna()

def create_bar_chart(index_name, data):
    df = data[data['Extraction'] == index_name]
    fig = px.bar(df, x='Composant', y='Score', color='Métrique', barmode='group',
                 title=f'Scores de Précision, Rappel et F1 pour {index_name}')
    
    fig.update_layout(
        xaxis_title='Composants',
        yaxis_title='Scores'
    )
    fig.update_layout(width=1000, height=600)
    fig.show()

# Créer un diagramme en barres pour chaque index
for idx in term_extraction_components:
    create_bar_chart(idx, data_long)

# Hierarchisation

In [187]:
from olaf.pipeline.pipeline_component.concept_relation_hierarchy import SubsumptionHierarchisation

## usefull fonctions

## subsumption hierarchy

In [241]:
subsumption_threshold = 0.99

In [242]:
llm_pipelines = [
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            CTsToConceptExtraction(),
            SubsumptionHierarchisation(threshold=subsumption_threshold)
        ],
        corpus_loader=corpus_loader
    ),
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            SemanticBasedEnrichment(
                threshold=.9
            ),
            SynonymConceptExtraction(),
            SubsumptionHierarchisation(threshold=subsumption_threshold)
        ],
            corpus_loader=corpus_loader
    ),
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.3
            ),
            SubsumptionHierarchisation(threshold=subsumption_threshold)
        ],
        corpus_loader=corpus_loader
    )
]

for pipeline in llm_pipelines:
    free_gpu()
    pipeline.run()

ct_pipeline, syn_pipeline, agg_pipeline = llm_pipelines


100%|██████████| 1225/1225 [00:00<00:00, 100735.66it/s]
100%|██████████| 1225/1225 [00:00<00:00, 35601.35it/s]
100%|██████████| 1081/1081 [00:00<00:00, 91220.88it/s]


In [247]:
display_metareslation(agg_pipeline.kr)

35 MetaRelations in KR:
punching defects is_generalised_by punching
metal surface is_generalised_by spots
rolled pit is_generalised_by steel plate
bulges is_generalised_by steel plate
defect type is_generalised_by strip surface
plaque is_generalised_by strip surface
fish scale shape is_generalised_by inclusion
rough pockmarked surfaces is_generalised_by inclusion
block irregular distribution is_generalised_by inclusion
metal surface is_generalised_by inclusion
strip shape is_generalised_by spots
strip shape is_generalised_by inclusion
product specifications is_generalised_by punching
production line is_generalised_by product
contamination is_generalised_by product
appearance is_generalised_by product
mechanical failure is_generalised_by product
punching defects is_generalised_by product
product specifications is_generalised_by product
mechanical lubricant is_generalised_by oil spot
mechanical lubricant is_generalised_by product
spacing is_generalised_by crease
fish scale shape is_gener

## llm hierarchisation

In [270]:
from spacy.tokens import Doc

from olaf.data_container.metarelation_schema import Metarelation

doc_context_max_len = 4000

def find_concept_by_label( label: str, concepts: Set[Concept]) -> Concept:
    """Find a concept based on its label.

    Parameters
    ----------
    label: str
        The label of the wanted concept.
    concepts: Set[Concept]
        The set of concepts to be searched.

    Returns
    -------
    Concept
        The concept with the wanted label.
    """
    selected_concept = None
    for concept in concepts:
        if concept.label == label:
            selected_concept = concept
            break
    return selected_concept

def create_concepts_description(concepts: Set[Concept]) -> str:
        """Create concepts textual description.

        Parameters
        ----------
        concepts: Set[Concept]
            Concepts to describe.

        Returns
        -------
        str
            Textual description of the concepts.
        """
        concepts_description = "Concepts:\n"
        for concept in concepts:
            lrs = [
                lr.label
                for lr in concept.linguistic_realisations
                if not (lr.label == concept.label)
            ]
            if len(lrs):
                concepts_description += f"{concept.label} ({', '.join(lrs)})\n"
            else:
                concepts_description += f"{concept.label}\n"
        return concepts_description

def generate_doc_context( popular_docs: Set[Doc]) -> str:
        """Create context from documents with a fix size.

        Parameters
        ----------
        popular_docs: Set[Doc]
            spaCy docs to fill the context with.

        Returns
        -------
        str
            Concatenation of document contents up to a fixed size.
        """
        context = ""
        for doc in popular_docs:
            if len(doc.text) < doc_context_max_len - len(context):
                context += doc.text
                context += " "
            else:
                context += doc.text[: doc_context_max_len - len(context)]
                break
        return context

def create_hierarchisation_prompt(agg_pipeline):
    popular_docs = set(agg_pipeline.corpus)
    context = generate_doc_context(popular_docs)
    concepts_description = create_concepts_description(pipeline.kr.concepts)


    prompt = """You are an helpful assistant helping building an ontology from technical documentation.
        Based on the context given, define if there is a hierarchy between the listed concepts.
        The result should be given as a python list of list of string with double quotes.

        Here is an example. Concepts: animal, mammal, dog(canine), flower
        [["mammal","is_generalised_by","animal"], ["dog","is_generalised_by","mammal"], ["dog","is_generalised_by","animal"]]
    
        Context: {doc_context}
        {concepts_description}""".format(doc_context=context, concepts_description=concepts_description)
    return prompt

def create_metarelations(
     llm_output: str, concepts: Set[Concept]
) -> Set[Metarelation]:
    """Create metarelations based on the LLM output.

    Parameters
    ----------
    llm_output: str
        Answer of the LLM for the hierarchy.
    concepts: Set[Concept]
        The set of existing concepts.

    Returns
    -------
    Set[Metarelation]
        The metarelations created.
    """
    metarelations = set()
    try:
        for meta_tuple in llm_output:
            source_concept = find_concept_by_label(meta_tuple[0], concepts)
            destination_concept = find_concept_by_label(
                meta_tuple[2], concepts
            )
            if source_concept is not None and destination_concept is not None:
                new_metarelation = Metarelation(
                    source_concept, destination_concept, "is_generalised_by"
                )
                metarelations.add(new_metarelation)
    except (SyntaxError, ValueError):
        logger.error(
            """LLM generator output is not in the expected format. 
            The metarelations can not be extracted."""
        )
    return metarelations

In [266]:
ct_prompt = create_hierarchisation_prompt(ct_pipeline)
print(ct_prompt)

You are an helpful assistant helping building an ontology from technical documentation.
        Based on the context given, define if there is a hierarchy between the listed concepts.
        The result should be given as a python list of list of string with double quotes.

        Here is an example. Concepts: animal, mammal, dog(canine), flower
        [["mammal","is_generalised_by","animal"], ["dog","is_generalised_by","mammal"], ["dog","is_generalised_by","animal"]]
    
        Context:     Water spot: A water spot is produced by drying in production. Under different products and processes, the requirements for this defect are different. However, because the water spots are generally with low contrast, and are similar to other defects such as oil spots, they are usually detected by mistake.
     Oil spot: An oil spot is usually caused by the contamination of mechanical lubricant, which will affect the appearance of the product.
     Crescent gap: In the production of steel strip, 

In [271]:
import ast



llm_output = [
    ["weld line", "is_generalised_by", "welding line"],
    ["crease", "is_generalised_by", "defect"],
    ["mechanical lubricant", "is_part_of", "mechanical system"],
    ["roller", "is_part_of", "production line"],
    ["pressure", "is_related_to", "roller"],
    ["product specifications", "is_part_of", "product"],
    ["damage", "is_generalised_by", "defect"],
    ["weld line", "is_part_of", "welding process"],
    ["drying", "is_part_of", "production process"],
    ["product", "is_related_to", "production line"]
]

create_metarelations(llm_output, ct_pipeline.kr.concepts)

{meta : (product, is_generalised_by, production line),
 meta : (weld line, is_generalised_by, welding line),
 meta : (pressure, is_generalised_by, roller),
 meta : (product specifications, is_generalised_by, product),
 meta : (roller, is_generalised_by, production line)}