In this notebook we aim to develop a protocole to evaluate OLAF relation extraction  components:


To achieve this task , we will follow this steps:

- Select a corpus.
- Select and create relevent concepts from the corpus.
- Create several pipelines with different components and parameters.
- Run all the pipelines.
- Find concepts involved in complete triples (relation with no null source and destination concepts) for each pipeline.
- Etablish the matching percentage of found concepts compared to selected concepts on step 2.


In [1]:
import spacy
from typing import Set, List
import pandas as pd
from olaf import Pipeline
from olaf.commons.logging_config import logger
from olaf.data_container import CandidateTerm, Relation, Concept
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.pipeline.pipeline_component.term_extraction import (
    POSTermExtraction,
    TFIDFTermExtraction,
    ManualCandidateTermExtraction
    )
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    CTsToRelationExtraction,
    SynonymRelationExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringRelationExtraction,
    AgglomerativeClusteringConceptExtraction,
    LLMBasedRelationExtraction
)
from olaf.commons.spacy_processing_tools import is_not_punct, is_not_stopword, select_on_pos

from olaf.pipeline.pipeline_component.candidate_term_enrichment import SemanticBasedEnrichment

from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

  from tqdm.autonotebook import tqdm, trange


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from matplotlib_venn import venn2, venn3

In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
import torch, gc
def free_gpu():
    """
    frees up the GPU cache for other calculations. 
    """
    gc.collect()
    torch.cuda.empty_cache()

free_gpu()

# Select Corpus

In [5]:
corpus_path = "../data/GC10-DET_doc.txt"
corpus_loader = TextCorpusLoader(corpus_path)

# Select manually and create relevent relation from the corpus.


In [6]:
import json
import re

def format_concept_or_label(text: str) -> str:
    """
    Replace underscores with spaces and convert to lower case
    """
    
    return text.replace('_', ' ').lower()

def format_camel_case(text: str) -> str:
    """Replace underscores with spaces and convert to lower case, 
    add spaces between words in camel case.
    """
    text = text.replace('_', ' ')
    return re.sub(r'(?<!^)(?=[A-Z])', ' ', text).lower()

expected_concepts = []
with open("concepts.txt", 'r') as f:
    lines = f.readlines()
    expected_concepts = [concept.rstrip("\n") for concept in lines]
    expected_concepts = [Concept(concept) for concept in expected_concepts]
    f.close()


with open("../data/relations.json", 'r', encoding='utf-8') as file:
    expected_relations = json.load(file)

expected_relations = [(format_concept_or_label(concept_source),
                       format_camel_case(relation_label),
                       format_concept_or_label(concept_destination))
                      for concept_source, relation_label, concept_destination in expected_relations]


expected_relations = { Relation(relation[1], Concept(relation[0]), Concept(relation[2])) for relation in expected_relations}
expected_relations

{(steel strip, has abnormal, appearance),
 (product, has abnormal, appearance),
 (crease, has appearance, vertical),
 (rolled pit, has appearance, periodic bulges or pits),
 (defect, has appearance, appearance),
 (crescent gap, has appearance, half circle),
 (silk spot, has appearance, wave like plaque),
 (inclusion, has appearance, spot),
 (waist folding, has appearance, wrinkles like),
 (punching, is caused by, mechanical failure),
 (crease, is caused by, local yield),
 (crescent gap, is caused by, cutting),
 (oil spot, is caused by, mechanical lubricant),
 (waist folding, is caused by, low carbon),
 (defect, is caused by, cause),
 (water spot, is caused by, drying),
 (roller, is part of, machine),
 (machine, is part of, factory),
 (factory, is part of, factory),
 (production line, is part of, factory),
 (steel strip, is produced by, machine),
 (product, is produced by, factory)}

In [7]:
labels = set([relation.label for relation in expected_relations])
labels

{'has abnormal',
 'has appearance',
 'is caused by',
 'is part of',
 'is produced by'}

# Testing relation ratio function

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util




sentence_transformer_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
comparator_args={"threshold": 0.6}

def hg_lm_similaritiry(embedding_a : np.ndarray, embedding_b: np.ndarray, threshold :float=.8):
    """compute the cosine similarity between two vectors
    """
    return util.pytorch_cos_sim(embedding_a, embedding_b) > threshold

def create_concepts_embedings(concepts: List[Concept], model=sentence_transformer_model) -> List[np.ndarray]:
    """
    Create a embedding list of concepts label from a list of Concepts.
    """
    concept_labels = [concept.label for concept in concepts]
    return model.encode(concept_labels)


def create_relations_embedings(relations: List[Relation], model=sentence_transformer_model) -> List[np.ndarray]:
    """Create a embedding list of relation relation  source concept 
    and destination concept from a list of Relation.
    """
    return [
            (
                model.encode(relation.source_concept.label), 
                # model.encode(relation.label), 
                model.encode(relation.destination_concept.label)
            ) for relation in relations
            ]


def get_unexpected_concepts(concepts: List[Concept], expected_concepts : List[Concept]):
    """
        Return all the concepts obtained that do not appear in those expected.
    """
    concepts = list(concepts)
    concepts_embedings = create_concepts_embedings(concepts)
    expected_concepts_embeding = create_concepts_embedings(expected_concepts)
    return [
        concepts[idc]
        for idc, concept_embeding in enumerate(concepts_embedings)
        if all(
            hg_lm_similaritiry(concept_embeding, expected_concept_embeding)
            <= 0.7
            for expected_concept_embeding in expected_concepts_embeding
        )
    ]

def is_valid_relation(relation : Relation):
    """Check relation validity
    """
    return relation.source_concept is not None and relation.destination_concept is not None


def are_equivalent(rel_embeding_a : np.ndarray, rel_embeding_b : np.ndarray) -> bool:
    return all(hg_lm_similaritiry(embeding_a, embeding_b) for embeding_a, embeding_b in zip(rel_embeding_a, rel_embeding_b)) \
        or all(hg_lm_similaritiry(embeding_a, embeding_b) for embeding_a, embeding_b in zip(rel_embeding_b, rel_embeding_a))


def get_relation_ratio(pipeline : Pipeline, expected_relations : List[Relation], comparator = hg_lm_similaritiry, comparator_args:dict={}, verbose=False) -> tuple:
    """
    Calculates pipeline performance using precision, recall and f-score
    
    Parameters
    ----------
    pipeline : Pipeline
        The pipeline object containing relations.
    expected_relations : List[Relation]
        A list of expected relations.kwargs

    Returns
    -------
    Tuple[float, float, float]: A tuple containing:
        precision
        recall
        f-score
    """
    
    
    found_relations = pipeline.kr.relations
    if len(found_relations) <= 0:
        return (0, 0, 0)
    found_relations = [relation for relation in found_relations if is_valid_relation(relation)]
    expected_relations = list(expected_relations)
    cooccurrence_count = 0

    found_relations_embedings =  create_relations_embedings(found_relations)
    expected_relations_embeding =  create_relations_embedings(expected_relations)

    for idx1, r1 in enumerate(expected_relations_embeding):
        if verbose:
            print(f"\n{str(expected_relations[idx1])} : ", end= "")
        for idx2, r2 in enumerate(found_relations_embedings):
            if are_equivalent(r1, r2) :
                if verbose:
                    print(f"{found_relations[idx2]}, ", end= "")
                cooccurrence_count += 1
    if cooccurrence_count == 0:
        return (0, 0, 0)
    recall = cooccurrence_count/len(expected_relations)
    precision = cooccurrence_count/len(found_relations)
    f1 = 2*(precision * recall)/(precision+recall)
    return (precision, recall, f1)

In [10]:
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_concepts import CTsToConceptExtraction

from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_relations import CTsToRelationExtraction


# a list of candidate terms by a LLM to create concepts
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

""" prompt
    You are an helpful assistant helping building an ontology of technical documentation of quality defects.
    Extract the most meaningful words describing defects ans their causes, appearance of products. 
    we will use this list of relations to extract relations between concepts : ['has abnormal', 'has appearance', 'is caused by', 'is part of', 'is produced by']
    Keep only words that could be relations and not concepts.
    Write them as a python list of string with double quotes.
    
    Text: 
"""

# a list of candidate terms by a LLM to create relation
relations = [
    "needs to be punched",
    "may lead to",
    "resulting in",
    "fold",
    "moving",
    "produced by",
    "are different",
    "detected by mistake",
    "described in detail",
    "explaining",
    "appears",
    "showing",
    "accompanied by",
    "fall off",
    "pressed into",
    "changed",
    "weld",
    "needs to be detected",
    "tracked",
    "circumvented",
    "indicating",
    "caused by",
    "affect",
    "distributed",
    "cutting",
    "appear"
]


ct_concept_label = { concept : {concept} for concept in concepts}

manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)

concept_extraction = CTsToConceptExtraction(
)


# Usefull function

In [11]:
def display_concept(kr : KnowledgeRepresentation) -> None:
    """
    Display the concepts contained in a KnowledgeRepresentation

    Parameters
    ----------
    kr : KnowledgeRepresentation
        The Knowledge Representation containing the concepts and relations.

    Returns
    -------
    None.
    """
    print("Concepts in KR:")
    for concept in kr.concepts:
        print(concept.label)


def display_relation(kr: KnowledgeRepresentation) -> None:
    """
    Display the relations contained in a KnowledgeRepresentation

    Parameters
    ----------
    kr : KnowledgeRepresentation
        The Knowledge Representation containing the concepts and relations.

    Returns
    -------
    None.
    """
    print("Relations in KR:")
    for relation in kr.relations:
        if (
            relation.source_concept is not None
            or relation.destination_concept is not None
        ):
            print(
                (
                    relation.source_concept.label,
                    relation.label,
                    relation.destination_concept.label,
                )
            )

def describe_pipeline(pipeline: Pipeline) -> None:
    """
    Display all pipeline components class names

    Parameters
    ----------
    pipeline : Pipeline
        the pipeline to describe.

    Returns
    -------
    None.
    """
    print(pipeline.__class__.__name__)
    for component in pipeline.pipeline_components:
        print(f"\t {component.__class__.__name__}")

In [12]:


def create_bar_chart(index_name, pipelines_scores):
    data = pipelines_scores.reset_index().melt(id_vars='index', var_name=['Composant', 'Métrique'], value_name='Score')
    data.rename(columns={'index': 'Extraction'}, inplace=True)

    df = data[data['Extraction'] == index_name]
    fig = px.bar(df, x='Composant', y='Score', color='Métrique', barmode='group',
                 title=f'Scores de Précision, Rappel et F1 pour {index_name}')
    
    fig.update_layout(
        xaxis_title='Composants',
        yaxis_title='Scores'
    )
    fig.update_layout(width=1000, height=600)
    fig.show()



# Creating pipelines

In [13]:
relation_extraction_components = ["CandidatToRelation", "SynonymToRelation", "AgglomerativeClustering"]
term_extraction_components = ["LLM Term Extraction", "POStag Term Extraction", "TFIDF Term Extraction"]
results = pd.DataFrame(
    index=relation_extraction_components,
    columns=term_extraction_components
    )

multi_index = pd.MultiIndex.from_product([
    relation_extraction_components, 
    ["Precision", "Rappel", "F1"]
    ])
pipelines_scores = pd.DataFrame(index=term_extraction_components, columns=multi_index)

pipelines_scores

Unnamed: 0_level_0,CandidatToRelation,CandidatToRelation,CandidatToRelation,SynonymToRelation,SynonymToRelation,SynonymToRelation,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,,,,,,,,,
POStag Term Extraction,,,,,,,,,
TFIDF Term Extraction,,,,,,,,,


## LLM Term  Extraction

In [14]:
llm_pipelines = [None, None, None]
llm_results = np.zeros(9)


### LLM Term  Extraction and Candidat To Relation Extraction

In [15]:
idx = 0

In [17]:

# a list of candidate terms by a LLM to create concepts
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

ct_concept_label = { concept : {concept} for concept in concepts}



llm_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            ManualCandidateTermExtraction(
                ct_label_strings_map={ relation : {relation} for relation in relations}
            ),
            CTsToRelationExtraction(
                concept_max_distance=5
            )
        ],
        corpus_loader=corpus_loader
    )


free_gpu()
current_pipeline = llm_pipelines[idx]
current_pipeline.run()


llm_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args, verbose=True)
    )

print(results)




is caused by : 
has appearance : 
has appearance : 
is caused by : 
has appearance : 
is part of : 
has appearance : 
has appearance : 
is caused by : 
has abnormal : 
has appearance : 
is produced by : 
is part of : 
has appearance : 
is caused by : caused by, 
has abnormal : 
is part of : 
is caused by : 
is caused by : 
is part of : 
is produced by : 
is caused by : produced by, (0.4, 0.09090909090909091, 0.14814814814814814)


In [19]:
display_relation(current_pipeline.kr)

Relations in KR:
('water spot', 'produced by', 'drying')
('oil spot', 'caused by', 'mechanical lubricant')
('metal surface', 'showing', 'water spot')
('mechanical failure', 'may lead to', 'punching')
('punching', 'resulting in', 'punching')


In [None]:
idx += 1

### LLM Term  Extraction and Synonym Relation Extraction

In [None]:


llm_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
           ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            ManualCandidateTermExtraction(
                ct_label_strings_map={ relation : {relation} for relation in relations}
            ),
           SynonymRelationExtraction(
               concept_max_distance=5
           )
        ],
        corpus_loader=corpus_loader
    )



free_gpu()
current_pipeline = llm_pipelines[idx]
current_pipeline.run()


llm_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)



(0.4, 0.09090909090909091, 0.14814814814814814)


In [None]:
idx += 1

### LLM Term  Extraction and Agglomerative Clustering Reltation Extraction

In [None]:
llm_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            ManualCandidateTermExtraction(
                ct_label_strings_map={ relation : {relation} for relation in relations}
            ),
            AgglomerativeClusteringRelationExtraction(
                concept_max_distance=8
            )
        ],
        corpus_loader=corpus_loader
    )

free_gpu()
current_pipeline = llm_pipelines[idx]
current_pipeline.run()


llm_results[3*idx: 3*idx + 3] = list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)



(0.2857142857142857, 0.09090909090909091, 0.13793103448275862)


### Score des pipelines

In [None]:
llm_results

array([0.4       , 0.09090909, 0.14814815, 0.4       , 0.09090909,
       0.14814815, 0.28571429, 0.09090909, 0.13793103])

In [None]:
pipelines_scores.loc[term_extraction_components[0]] = llm_results
pipelines_scores

Unnamed: 0_level_0,CandidatToRelation,CandidatToRelation,CandidatToRelation,SynonymToRelation,SynonymToRelation,SynonymToRelation,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,0.4,0.090909,0.148148,0.4,0.090909,0.148148,0.285714,0.090909,0.137931
POStag Term Extraction,0.0,0.0,0.0,0.090909,0.045455,0.060606,0.1,0.045455,0.0625
TFIDF Term Extraction,0.090909,0.136364,0.109091,0.242424,0.363636,0.290909,0.058824,0.045455,0.051282


In [None]:
create_bar_chart("LLM Term Extraction", pipelines_scores)

## POS tag Term Extraction

In [None]:
postag_pipelines = [None, None, None]
pos_results = np.ones(9)
idx = 0

### POS tag Term  extraction and Candidat To Concept Extraction

In [None]:

postag_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
            ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            POSTermExtraction(
                pos_selection=["VERB", "ADJ"]
            ),
            CTsToRelationExtraction(
                concept_max_distance=8
            )
        ],
        corpus_loader=corpus_loader
    )


free_gpu()
current_pipeline = postag_pipelines[idx]
current_pipeline.run()


pos_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

               By default the system will use the entire content of the document.]


(0.08333333333333333, 0.09090909090909091, 0.08695652173913043)


In [None]:
display_relation(current_pipeline.kr)

Relations in KR:
('crease', 'vertical', 'waist folding')
('inclusion', 'typical', 'metal surface')
('mechanical failure', 'unwanted', 'punching')
('rolled pit', 'rolled', 'pits')
('pits', 'periodic', 'pits')
('pits', 'periodic', 'bulges')
('mechanical failure', 'lead', 'punching')
('crease', 'transverse', 'waist folding')
('rolled pit', 'rolled', 'bulges')
('spots', 'produced', 'drying')
('punching', 'resulting', 'punching')
('roller', 'uneven', 'pressure')
('metal surface', 'showing', 'spots')


In [None]:
idx += 1

### POS tag Term  extraction and Synonym Concept Extraction

In [None]:
postag_pipelines[idx] = Pipeline(
    spacy_model=nlp,
    pipeline_components=[
        ManualCandidateTermExtraction(
            ct_label_strings_map=ct_concept_label
        ),
        AgglomerativeClusteringConceptExtraction(
            distance_threshold=.4
        ),
        POSTermExtraction(
            pos_selection=["VERB", "ADJ"]
        ),
        SynonymRelationExtraction(
            concept_max_distance=8
        )
    ],
    corpus_loader=corpus_loader
)

free_gpu()
current_pipeline = postag_pipelines[idx]
current_pipeline.run()


pos_results[3*idx: 3*idx + 3]= list(
    result:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

               By default the system will use the entire content of the document.]


(0.08333333333333333, 0.09090909090909091, 0.08695652173913043)


In [None]:
idx += 1

### POS tag Term  extraction and Agglomerative clustering Extraction

In [None]:
postag_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            POSTermExtraction(
                pos_selection=["VERB", "ADJ"]
            ),
            AgglomerativeClusteringRelationExtraction(
               concept_max_distance=8
           )
        ],
        corpus_loader=corpus_loader
    )



free_gpu()
current_pipeline = postag_pipelines[idx]
current_pipeline.run()


pos_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

               By default the system will use the entire content of the document.]


(0.10526315789473684, 0.09090909090909091, 0.0975609756097561)


In [None]:
pos_results

array([0.08333333, 0.09090909, 0.08695652, 0.08333333, 0.09090909,
       0.08695652, 0.10526316, 0.09090909, 0.09756098])

### Score des pipelines

In [None]:
pipelines_scores.loc[term_extraction_components[1]] = pos_results
pipelines_scores

Unnamed: 0_level_0,CandidatToRelation,CandidatToRelation,CandidatToRelation,SynonymToRelation,SynonymToRelation,SynonymToRelation,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,0.4,0.090909,0.148148,0.4,0.090909,0.148148,0.285714,0.090909,0.137931
POStag Term Extraction,0.083333,0.090909,0.086957,0.083333,0.090909,0.086957,0.105263,0.090909,0.097561
TFIDF Term Extraction,0.090909,0.136364,0.109091,0.242424,0.363636,0.290909,0.058824,0.045455,0.051282


In [None]:
create_bar_chart("POStag Term Extraction", pipelines_scores)

## TFIDF  Term Extraction

In [33]:
tfidf_pipelines = [None, None, None]
tfidf_results = np.ones(9)
idx = 0

In [21]:
from spacy.matcher import Matcher

def relation_postprocessor(relations : Set[Relation], nlp=nlp) -> Set[Relation]:
    """Filters relationships based on a set of spacy patterns.

    Parameters
    ----------
    relations : Set[Relation]
        A set of relation to filter
    nlp : spacy.Language
        the spacy language

    Returns
    -------
    Set[Relation]
        A set of Relation
    """
    correct_relations = set()
    relation_patterns = [
        [{"POS": "AUX", "DEP": "ROOT"}],
        [{"POS": "AUX", "OP": "?"}, {"POS": "ADV", "OP": "?"},{"POS": "VERB"}, {"POS": "ADP", "OP": "?"}],
        [{"POS": "AUX"}, {"POS": "ADJ", "OP": "+"}, {"POS": "ADP"}],
        [{"POS": "AUX"}, {"POS": "VERB", "OP": "+"}, {"POS": "ADP", "OP": "?"}],
        ]
    matcher = Matcher(nlp.vocab)

    matcher.add("REALTION_PATTERN", relation_patterns)

    for relation in relations:
        relation_doc = nlp(relation.label)
        matches = matcher(relation_doc)
        if any(
            len(relation_doc[start_idx:end_idx]) == len(relation_doc)
            for _, start_idx, end_idx in matches
        ):
            correct_relations.add(relation)
    
    return correct_relations


### TFIDF Term Extraction and Candidat To Concept Extraction

In [34]:

tfidf_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
           ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            TFIDFTermExtraction(max_term_token_length=4, cts_post_processing_functions=[relation_postprocessor]),
            CTsToRelationExtraction(
               concept_max_distance=8
           )
        ],
        corpus_loader=corpus_loader
    )


free_gpu()
current_pipeline = tfidf_pipelines[idx]
current_pipeline.run()


tfidf_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

                By default the system will use the entire content of the document.]

The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



(0.13725490196078433, 0.3181818181818182, 0.19178082191780824)


In [35]:
display_relation(current_pipeline.kr)

Relations in KR:
('metal surface', 'usually showing', 'fish scale shape')
('defect type', 'explaining', 'steel strip surface')
('metal surface', 'showing', 'spots')
('spots', 'is produced by', 'drying')
('pits', 'are', 'bulges')
('pits', 'bulges', 'pits')
('metal surface', 'usually showing', 'spots')
('inclusion', 'surface', 'spots')
('mechanical failure', 'may lead', 'punching')
('pits', 'are', 'pits')
('mechanical failure', 'lead', 'punching')
('rolled pit', 'rolled', 'pits')
('punching', 'resulting in', 'punching')
('bulges', 'surface', 'steel plate')
('spots', 'is produced', 'drying')
('work roll', 'roll', 'work roll')
('crease', 'crease', 'waist folding')
('mechanical failure', 'may', 'punching')
('rolled pit', 'are', 'pits')
('spots', 'scale', 'block irregular distribution')
('oil spot', 'is usually caused', 'mechanical lubricant')
('spots', 'produced', 'drying')
('mechanical failure', 'may lead to', 'punching')
('mechanical failure', 'resulting', 'punching')
('spots', 'are', 'sp

: 

In [23]:
idx += 1

### TFIDF Term Extraction and Synonym Relation Extraction

In [None]:

tfidf_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            TFIDFTermExtraction(
                max_term_token_length=4,
                candidate_term_threshold=.01,
                cts_post_processing_functions=[relation_postprocessor]
            ),
            SynonymRelationExtraction(
                concept_max_distance=8
            )
        ],
        corpus_loader=corpus_loader
    )


free_gpu()
current_pipeline = tfidf_pipelines[idx]
current_pipeline.run()


tfidf_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

                By default the system will use the entire content of the document.]


(0.13725490196078433, 0.3181818181818182, 0.19178082191780824)


In [25]:
idx += 1

### TFIDF Term Extraction and Agglomerative clustering Relation Extraction

In [26]:

tfidf_pipelines[idx] = Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(
                ct_label_strings_map=ct_concept_label
            ),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.4
            ),
            TFIDFTermExtraction(
                max_term_token_length=4, 
                candidate_term_threshold=.01,
                cts_post_processing_functions=[relation_postprocessor]
            ),
            AgglomerativeClusteringRelationExtraction(
                concept_max_distance=6
            )
        ],
        corpus_loader=corpus_loader
    )


free_gpu()
current_pipeline = tfidf_pipelines[idx]
current_pipeline.run()


tfidf_results[3*idx: 3*idx + 3]= list(
    results:=get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)
    )

print(results)

                By default the system will use the entire content of the document.]


(0.11764705882352941, 0.09090909090909091, 0.10256410256410256)


In [27]:
display_relation(current_pipeline.kr)

Relations in KR:
('inclusion', 'defect', 'metal surface')
('fish scale shape', 'shape', 'block irregular distribution')
('tension roll', 'roll', 'damage')
('rolled pit', 'are', 'pits')
('crease', 'crease', 'waist folding')
('water spot', 'is produced by', 'drying')
('tension roll', 'roll', 'tension roll')
('metal surface', 'showing', 'water spot')
('metal surface', 'usually showing', 'fish scale shape')
('rolled pit', 'are', 'bulges')
('mechanical failure', 'lead to', 'punching')
('punching', 'resulting', 'punching')
('pits', 'are', 'bulges')
('pits', 'surface', 'steel plate')
('pits', 'bulges', 'pits')
('bulges', 'surface', 'steel plate')
('oil spot', 'caused by', 'mechanical lubricant')


In [28]:
idx

2

### Scores des pipelines

In [29]:
pipelines_scores.loc[term_extraction_components[2]] = tfidf_results
pipelines_scores

Unnamed: 0_level_0,CandidatToRelation,CandidatToRelation,CandidatToRelation,SynonymToRelation,SynonymToRelation,SynonymToRelation,AgglomerativeClustering,AgglomerativeClustering,AgglomerativeClustering
Unnamed: 0_level_1,Precision,Rappel,F1,Precision,Rappel,F1,Precision,Rappel,F1
LLM Term Extraction,,,,,,,,,
POStag Term Extraction,,,,,,,,,
TFIDF Term Extraction,0.235294,0.545455,0.328767,0.137255,0.318182,0.191781,0.117647,0.090909,0.102564


In [31]:
create_bar_chart("TFIDF Term Extraction", pipelines_scores)

# last attempt

In this attempt we submit the concepts, the relation labels would like to have and the corpus .

In [None]:
llm_output = [
    ["water spot", "is produced by", "drying"],
    ["water spot", "is produced by", "production"],
    ["oil spot", "is caused by", "contamination"],
    ["contamination", "is caused by", "mechanical lubricant"],
    ["oil spot", "has appearance", "product"],
    ["crescent gap", "is caused by", "cutting"],
    ["weld line", "is part of", "strip"],
    ["inclusion", "has appearance", "small spots"],
    ["inclusion", "has appearance", "fish scale shape"],
    ["inclusion", "has appearance", "strip shape"],
    ["inclusion", "has appearance", "block irregular distribution"],
    ["inclusion", "is part of", "upper surface"],
    ["inclusion", "is part of", "lower surface"],
    ["inclusion", "is accompanied by", "rough pockmarked surfaces"],
    ["crease", "has appearance", "vertical transverse fold"],
    ["crease", "has abnormal", "spacing"],
    ["crease", "is caused by", "local yield"],
    ["crease", "is part of", "strip"],
    ["silk spot", "has appearance", "plaque"],
    ["silk spot", "is caused by", "uneven temperature"],
    ["silk spot", "is caused by", "uneven pressure"],
    ["waist folding", "has appearance", "obvious folds"],
    ["waist folding", "has appearance", "wrinkles"],
    ["waist folding", "is caused by", "local deformation"],
    ["waist folding", "is caused by", "low-carbon"],
    ["punching", "is produced by", "production line"],
    ["punching", "is produced by", "strip"],
    ["punching", "is caused by", "mechanical failure"],
    ["punctate", "is part of", "rolled pit"],
    ["rolled pit", "has appearance", "bulges"],
    ["rolled pit", "has appearance", "pits"],
    ["rolled pit", "is caused by", "work roll"],
    ["rolled pit", "is caused by", "tension roll"],
    ["rolled pit", "is part of", "steel plate"]
]


In [None]:
expected_relations

{(product, has abnormal, appearance),
 (steel strip, has abnormal, appearance),
 (crease, has appearance, vertical),
 (waist folding, has appearance, wrinkles like),
 (silk spot, has appearance, wave like plaque),
 (defect, has appearance, appearance),
 (inclusion, has appearance, spot),
 (crescent gap, has appearance, half circle),
 (rolled pit, has appearance, periodic bulges or pits),
 (defect, is caused by, cause),
 (punching, is caused by, mechanical failure),
 (crescent gap, is caused by, cutting),
 (crease, is caused by, local yield),
 (waist folding, is caused by, low carbon),
 (oil spot, is caused by, mechanical lubricant),
 (water spot, is caused by, drying),
 (factory, is part of, factory),
 (roller, is part of, machine),
 (machine, is part of, factory),
 (production line, is part of, factory),
 (product, is produced by, factory),
 (steel strip, is produced by, machine)}

In [None]:
found_relations = {
   Relation(
       relation[1], 
       Concept(relation[0]), 
       Concept(relation[2])
       ) for relation in llm_output
}


pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=[
        ManualCandidateTermExtraction(
            ct_label_strings_map=ct_concept_label
        ),
        AgglomerativeClusteringConceptExtraction(
            distance_threshold=.4
        )
    ],
    corpus_loader=corpus_loader
)

pipeline.run()

pipeline.kr.relations = found_relations

get_relation_ratio(current_pipeline, expected_relations, comparator_args=comparator_args)



(0.058823529411764705, 0.045454545454545456, 0.05128205128205128)