In this notebook we aim to develop a protocol to evaluate OLAF pipelines.


To achieve this, we will follow these steps:

- Select a corpus.
- Select and create relevant concepts from the corpus.
- Create several pipelines with different components and parameters.
- Run all the pipelines.
- Find concepts involved in complete triples (relation with no null source and destination concepts) for each pipeline.
- Establish the matching percentage of found concepts compared to the concepts selected in step 2.


In [45]:
import spacy
from typing import Set, List

from olaf import Pipeline
from olaf.data_container.relation_schema import Relation, Concept
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    CTsToRelationExtraction,
    SynonymRelationExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringRelationExtraction,
    AgglomerativeClusteringConceptExtraction
)
from olaf.pipeline.pipeline_component.term_extraction.tfidf_term_extraction import (
    TFIDFTermExtraction,
)
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

In [46]:
# spaCy model shared by the whole notebook: it is the default `nlp` of
# `is_similar`, parses the corpus, and is passed to every Pipeline.
nlp = spacy.load("en_core_web_lg")

# Select Corpus

In [47]:
corpus_path = "GC10-DET_doc.txt"
# NOTE(review): `_read_corpus` is a private method of TextCorpusLoader; prefer the
# loader's public entry point if it offers one.
raw_corpus = TextCorpusLoader(corpus_path)._read_corpus()
# Strip only the trailing newline. Slicing with [:-1] would also cut the last
# real character of a final line that has no newline terminator.
corpus = [doc.rstrip("\n") for doc in raw_corpus]
corpus

['Each defect type is described in detail, explaining how it appears on the steel strip surface and the reasons behind its occurrence:',
 '    Punching: In the production line of the strip, the steel strip needs to be punched according to the product specifications; mechanical failure may lead to unwanted punching, resulting in punching defects.',
 '    Welding line: When the strip is changed, it is necessary to weld the two coils of the strip, and the weld line is produced. Strictly speaking, this is not a defect, but it needs to be automatically detected and tracked to be circumvented in subsequent cuts.',
 '    Crescent gap: In the production of steel strip, cutting sometimes results in defects, just like half a circle.',
 '    Water spot: A water spot is produced by drying in production. Under different products and processes, the requirements for this defect are different. However, because the water spots are generally with low contrast, and are similar to other defects such as oi

# Select and create relevant concepts from the corpus


In [48]:
"knfjre\n".rstrip("\n")

'knfjre'

In [49]:
# Read one expected concept label per line of concepts.txt.
# Blank lines are skipped so they do not become empty-label concepts, and the
# `with` block closes the file on exit (no explicit close() needed).
with open("concepts.txt", 'r') as f:
    expected_concepts = [
        Concept(line.rstrip("\n")) for line in f if line.strip()
    ]

print(expected_concepts)


[Punching, Welding line, Crescent Gap, Water spot, Oil spot, Silk spot, Inclusion, Rolled pit, Crease, Waist folding, metal surface defect, mechanical failure, drying, mechanical lubricant, temperature, pressure, work roll damage, tension roll damage, local yield, low-carbon]


# Testing concept ratio function

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util


def is_similar(concept_a : str, concept_b: str, nlp =nlp, threshold=.8):
    """
    Tell whether two labels are similar according to their spaCy vectors.

    Parameters
    ----------
    concept_a, concept_b : str
        The labels to compare.
    nlp : callable
        Called on each label; the ``.vector`` of the result is compared.
    threshold : float
        The cosine similarity must be strictly greater than this value.

    Returns
    -------
    bool
        True when the cosine similarity of the two vectors exceeds ``threshold``.
    """
    vector_a, vector_b = nlp(concept_a).vector, nlp(concept_b).vector
    # cosine_similarity returns a 1x1 matrix for a single pair of vectors;
    # unwrap it so the caller gets a plain bool like `is_equal` returns.
    score = cosine_similarity([vector_a], [vector_b])[0][0]
    return bool(score > threshold)

def is_equal(concept_a : str, concept_b: str):
    """Return True when the two labels are identical once both are lower-cased."""
    lowered_a = concept_a.lower()
    lowered_b = concept_b.lower()
    return lowered_a == lowered_b

def hg_lm_similaritiry(concept_a : str, concept_b: str, model=SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"), threshold=.8):
    """
    Tell whether two labels are similar according to sentence embeddings.

    Parameters
    ----------
    concept_a, concept_b : str
        The labels to compare.
    model : SentenceTransformer
        Model used to encode both labels. The default is created once, when
        this function is defined, and reused by every call.
    threshold : float
        The cosine similarity must be strictly greater than this value.

    Returns
    -------
    bool
        True when the cosine similarity of the two embeddings exceeds ``threshold``.
    """
    embedding_a, embedding_b = model.encode(concept_a), model.encode(concept_b)
    # pytorch_cos_sim yields a single-element tensor for one pair of embeddings;
    # .item() unwraps it so the caller gets a plain bool like `is_equal` returns.
    score = util.pytorch_cos_sim(embedding_a, embedding_b).item()
    return bool(score > threshold)

def get_concept_ratio(pipeline : Pipeline, expected_concepts : List[Concept], comparator = hg_lm_similaritiry, comparator_args:dict=None) -> tuple:
    """
    Score the concepts found by a pipeline against a list of expected concepts.

    An expected concept counts as matched as soon as ``comparator`` returns a
    truthy value for its label and the label of at least one found concept.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline whose ``kr.concepts`` are evaluated.
    expected_concepts : List[Concept]
        The reference concepts; only their ``label`` is compared.
    comparator : callable
        Called as ``comparator(expected_label, found_label, **comparator_args)``.
    comparator_args : dict, optional
        Extra keyword arguments forwarded to ``comparator``
        (e.g. ``{"threshold": 0.7}``). None means no extra arguments.

    Returns
    -------
    Tuple[float, float, float]
        ``(expected_ratio, found_ratio, f1)`` where
        ``expected_ratio`` is matched / number of expected concepts,
        ``found_ratio`` is matched / number of found concepts and
        ``f1`` is the harmonic mean of the two ratios.
        A ratio whose denominator is zero is reported as 0.0, and ``f1`` is
        0.0 when both ratios are 0.
    """
    comparator_kwargs = comparator_args or {}
    found_labels = [concept.label for concept in pipeline.kr.concepts]
    expected_labels = [concept.label for concept in expected_concepts]

    # any() stops at the first matching found label, so each expected concept
    # is counted at most once.
    matched = sum(
        1
        for expected_label in expected_labels
        if any(
            comparator(expected_label, found_label, **comparator_kwargs)
            for found_label in found_labels
        )
    )

    # NOTE(review): these two values used to be named `precision` and `recall`,
    # but matched/expected is the recall-like quantity and matched/found the
    # precision-like one. The tuple order is kept unchanged for existing callers.
    expected_ratio = matched / len(expected_labels) if expected_labels else 0.0
    found_ratio = matched / len(found_labels) if found_labels else 0.0
    ratio_sum = expected_ratio + found_ratio
    # Guard the harmonic mean: with no match at all both ratios are 0.
    f1 = 2 * (expected_ratio * found_ratio) / ratio_sum if ratio_sum else 0.0
    return (expected_ratio, found_ratio, f1)

In [51]:
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_concepts import CTsToConceptExtraction

from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_relations import CTsToRelationExtraction


# Labels of the manually selected candidate terms. Each one becomes a key of
# `ct_concept_label`, which is handed to ManualCandidateTermExtraction.
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

# Manually selected relation labels.
# NOTE(review): not referenced by the pipeline built in this cell — confirm
# whether a later cell uses it.
relations = [
    "described",
    "explaining",
    "appears",
    "leads",
    "resulting",
    "changed",
    "produced",
    "drying",
    "caused",
    "affect",
    "appearing",
    "lies",
    "distributed",
    "accompanied",
    "showing",
    "pressed",
    "occurred",
    "circumvented",
    "detected",
    "tracked",
    "results",
    "like",
    "mainly",
    "uncoiling"
]

# Each label maps to a one-element set containing itself: this is the
# `ct_label_strings_map` passed to ManualCandidateTermExtraction.
ct_concept_label = {label: {label} for label in concepts}

# Candidate term extraction from the manual label map.
manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)

# Concept extraction from the candidate terms.
concept_extraction = CTsToConceptExtraction()

# Relation extraction component; created here but not added to the pipeline below.
relation_extraction = CTsToRelationExtraction()

# The list of pipelines starts with a single manual-extraction pipeline.
pipelines = [
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[manuel_concept_extraction, concept_extraction],
        corpus=list(nlp.pipe(corpus)),
    )
]



In [52]:
# Run the most recently added pipeline, then score its concepts against the
# expected ones with the default comparator.
current_pipeline = pipelines[-1]
current_pipeline.run()

get_concept_ratio(current_pipeline, expected_concepts)

(0.8, 0.45714285714285713, 0.5818181818181818)

In [53]:
def debug_get_concept_ratio(pipeline : Pipeline, expected_concepts : List[Concept], comparator = hg_lm_similaritiry, comparator_args:dict=None) -> tuple:
    """
    Score a pipeline's concepts against expected ones and print every match.

    For each expected concept a line ``"<expected> : <found> "`` is printed,
    where ``<found>`` is the first found label accepted by ``comparator``
    (nothing is printed after the colon when there is no match).

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline whose ``kr.concepts`` are evaluated.
    expected_concepts : List[Concept]
        The reference concepts; only their ``label`` is compared.
    comparator : callable
        Called as ``comparator(expected_label, found_label, **comparator_args)``.
    comparator_args : dict, optional
        Extra keyword arguments forwarded to ``comparator``
        (e.g. ``{"threshold": 0.7}``). None means no extra arguments.

    Returns
    -------
    Tuple[float, float, float]
        ``(expected_ratio, found_ratio, f1)`` where
        ``expected_ratio`` is matched / number of expected concepts,
        ``found_ratio`` is matched / number of found concepts and
        ``f1`` is the harmonic mean of the two ratios.
        A ratio whose denominator is zero is reported as 0.0, and ``f1`` is
        0.0 when both ratios are 0.
    """
    comparator_kwargs = comparator_args or {}
    found_labels = [concept.label for concept in pipeline.kr.concepts]
    expected_labels = [concept.label for concept in expected_concepts]

    matched = 0
    for expected_label in expected_labels:
        print()
        print(f"{expected_label} : ", end="")
        for found_label in found_labels:
            if comparator(expected_label, found_label, **comparator_kwargs):
                print(f"{found_label} ", end="")
                matched += 1
                # Only the first match is reported and counted.
                break

    # NOTE(review): these two values used to be named `precision` and `recall`,
    # but matched/expected is the recall-like quantity and matched/found the
    # precision-like one. The tuple order is kept unchanged for existing callers.
    expected_ratio = matched / len(expected_labels) if expected_labels else 0.0
    found_ratio = matched / len(found_labels) if found_labels else 0.0
    ratio_sum = expected_ratio + found_ratio
    # Guard the harmonic mean: with no match at all both ratios are 0.
    f1 = 2 * (expected_ratio * found_ratio) / ratio_sum if ratio_sum else 0.0
    return (expected_ratio, found_ratio, f1)



# Match report for the latest pipeline with the default comparator and threshold.
current_pipeline = pipelines[-1]
debug_get_concept_ratio(current_pipeline, expected_concepts)


Punching : punching 
Welding line : welding line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : waist folding 
metal surface defect : 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : 
pressure : pressure 
work roll damage : 
tension roll damage : tension roll 
local yield : 
low-carbon : low-carbon 

(0.8, 0.45714285714285713, 0.5818181818181818)

In [54]:
# Same report with the comparator's similarity threshold lowered to 0.7.
debug_get_concept_ratio(current_pipeline, expected_concepts, comparator_args={"threshold":.7})


Punching : punching 
Welding line : welding line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : fold 
metal surface defect : metal surface 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : 
pressure : pressure 
work roll damage : work roll 
tension roll damage : tension roll 
local yield : 
low-carbon : low-carbon 

(0.9, 0.5142857142857142, 0.6545454545454545)

In [55]:
# Same report with exact, case-insensitive label matching (`is_equal`).
debug_get_concept_ratio(current_pipeline, expected_concepts, comparator=is_equal)



Punching : punching 
Welding line : welding line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : oil spot 
Silk spot : silk spot 
Inclusion : inclusion 
Rolled pit : rolled pit 
Crease : crease 
Waist folding : waist folding 
metal surface defect : 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : 
pressure : pressure 
work roll damage : 
tension roll damage : 
local yield : 
low-carbon : low-carbon 

(0.75, 0.42857142857142855, 0.5454545454545454)

In [56]:
# Same report with the comparator's similarity threshold lowered to 0.5.
debug_get_concept_ratio(current_pipeline, expected_concepts, comparator_args={"threshold":.5})



Punching : punching 
Welding line : welding line 
Crescent Gap : crescent gap 
Water spot : water spot 
Oil spot : water spot 
Silk spot : water spot 
Inclusion : inclusion 
Rolled pit : pits 
Crease : crease 
Waist folding : fold 
metal surface defect : metal surface 
mechanical failure : mechanical failure 
drying : drying 
mechanical lubricant : mechanical lubricant 
temperature : 
pressure : pressure 
work roll damage : damage 
tension roll damage : tension roll 
local yield : 
low-carbon : low-carbon 

(0.9, 0.5142857142857142, 0.6545454545454545)

## Optimize the similarity threshold


In [57]:
# Print the scores of the current pipeline for several similarity thresholds.
# The empty dict leaves the comparator's default threshold (0.8) in place.
for threshold_args in ({}, {"threshold": 0.7}, {"threshold": 0.6}, {"threshold": 0.5}):
    print(get_concept_ratio(current_pipeline, expected_concepts, comparator_args=threshold_args))

# Comparator arguments reused by the pipeline comparison cells below.
comparator_args={"threshold": 0.7}

(0.8, 0.45714285714285713, 0.5818181818181818)
(0.9, 0.5142857142857142, 0.6545454545454545)
(0.9, 0.5142857142857142, 0.6545454545454545)
(0.9, 0.5142857142857142, 0.6545454545454545)


# A grid search algorithm for pipelines

In [58]:
class GridSearch:
    """Placeholder for the pipeline grid search; it holds no state and no logic yet."""

    def __init__(self) -> None:
        """Create an empty instance (nothing is initialised)."""
        return None

# Useful functions

In [59]:
def display_concept(kr: KnowledgeRepresentation) -> None:
    """Print a header line followed by the label of every concept in ``kr.concepts``."""
    print("Concepts in KR:")
    labels = (concept.label for concept in kr.concepts)
    for label in labels:
        print(label)


def display_relation(kr: KnowledgeRepresentation) -> None:
    """
    Print the complete triples of a knowledge representation.

    A header line is printed first, then one
    ``(source label, relation label, destination label)`` tuple for every
    relation in ``kr.relations`` whose source and destination concepts are
    both set. Relations missing either end are skipped.
    """
    print("Relations in KR:")
    for relation in kr.relations:
        source = relation.source_concept
        destination = relation.destination_concept
        # Both ends are required. The previous `or` test let half-defined
        # relations through and then failed on `None.label`.
        if source is None or destination is None:
            continue
        print((source.label, relation.label, destination.label))

# Creating pipelines

In [67]:
from olaf.pipeline.pipeline_component.term_extraction import (
    ManualCandidateTermExtraction,
    POSTermExtraction,
    TFIDFTermExtraction,
    CvalueTermExtraction
)

from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringConceptExtraction
)


## Manual Concept Extraction

In [74]:
# Start again from an empty list; the cells below append one pipeline each.
pipelines = []

### Manual Concept Extraction and Candidate Terms To Concept Extraction

In [75]:
# Labels of the manually selected candidate terms. Each one becomes a key of
# `ct_concept_label`, which is handed to ManualCandidateTermExtraction.
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

# Each label maps to a one-element set containing itself: this is the
# `ct_label_strings_map` passed to ManualCandidateTermExtraction.
ct_concept_label = {label: {label} for label in concepts}

# Build the pipeline, register it as the latest entry of `pipelines`, run it.
current_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=[
        ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
        CTsToConceptExtraction(),
    ],
    corpus=list(nlp.pipe(corpus)),
)
pipelines.append(current_pipeline)
current_pipeline.run()

# Score the extracted concepts with the comparator arguments chosen earlier.
get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)

(0.9, 0.5142857142857142, 0.6545454545454545)

### Manual Concept Extraction and Synonym Concept Extraction

In [76]:

# Build the pipeline, register it as the latest entry of `pipelines`, run it.
current_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=[
        ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
        SynonymConceptExtraction(),
    ],
    corpus=list(nlp.pipe(corpus)),
)
pipelines.append(current_pipeline)
current_pipeline.run()

# Score the extracted concepts with the comparator arguments chosen earlier.
get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)

(0.9, 0.5142857142857142, 0.6545454545454545)

### Manual Concept Extraction and Agglomerative Clustering Concept Extraction

In [77]:

# Build the pipeline, register it as the latest entry of `pipelines`, run it.
current_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=[
        ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
        AgglomerativeClusteringConceptExtraction(distance_threshold=.3),
    ],
    corpus=list(nlp.pipe(corpus)),
)
pipelines.append(current_pipeline)
current_pipeline.run()

# Score the extracted concepts with the comparator arguments chosen earlier.
get_concept_ratio(current_pipeline, expected_concepts, comparator_args=comparator_args)

Downloading modules.json: 100%|██████████| 349/349 [00:00<00:00, 442kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 165kB/s]
Downloading README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 4.13MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 92.7kB/s]
Downloading config.json: 100%|██████████| 571/571 [00:00<00:00, 798kB/s]
Downloading model.safetensors: 100%|██████████| 438M/438M [00:39<00:00, 11.2MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 365kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.31MB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.75MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 306kB/s]
Downloading 1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 217kB/s]


(0.9, 0.5454545454545454, 0.679245283018868)

## POS tag Concept Extraction

In [65]:

# Register a pipeline made of POSTermExtraction followed by
# CTsToConceptExtraction; it is run in the next cell.
pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            POSTermExtraction(
                # NOTE(review): spaCy's coarse POS tags are upper-case ("NOUN") —
                # confirm that POSTermExtraction accepts the lower-case spelling.
                pos_selection=["noun"]
            ),
            CTsToConceptExtraction()
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

               By default the system will use the entire content of the document.]


In [66]:
# Run the pipeline appended last.
current_pipeline = pipelines[-1]
current_pipeline.run()

