In this notebook we aim to develop a protocole to evaluate OLAF pipelines:


To achieve this task , we will follow this steps:

- Select a corpus.
- Select and create relevent concepts from the corpus.
- Create several pipelines with different components and parameters.
- Run all the pipelines.
- Find concepts involved in complete triples (relation with no null source and destination concepts) for each pipeline.
- Etablish the matching percentage of found concepts compared to selected concepts on step 2.


In [2]:
import spacy
from typing import Set, List

from olaf import Pipeline
from olaf.data_container.relation_schema import Relation, Concept
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    CTsToRelationExtraction,
    SynonymRelationExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringRelationExtraction,
    AgglomerativeClusteringConceptExtraction
)
from olaf.pipeline.pipeline_component.term_extraction.tfidf_term_extraction import (
    TFIDFTermExtraction,
)
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nlp = spacy.load("en_core_web_lg")

# Select Corpus

In [4]:
corpus_path = "GC10-DET_doc.txt"
corpus = TextCorpusLoader(corpus_path)._read_corpus()
corpus = [doc[:-1] for doc in corpus]
corpus

['Each defect type is described in detail, explaining how it appears on the steel strip surface and the reasons behind its occurrence:',
 '    Punching: In the production line of the strip, the steel strip needs to be punched according to the product specifications; mechanical failure may lead to unwanted punching, resulting in punching defects.',
 '    Welding line: When the strip is changed, it is necessary to weld the two coils of the strip, and the weld line is produced. Strictly speaking, this is not a defect, but it needs to be automatically detected and tracked to be circumvented in subsequent cuts.',
 '    Crescent gap: In the production of steel strip, cutting sometimes results in defects, just like half a circle.',
 '    Water spot: A water spot is produced by drying in production. Under different products and processes, the requirements for this defect are different. However, because the water spots are generally with low contrast, and are similar to other defects such as oi

# Select and create relevent concepts from the corpus.


In [30]:
expected_concepts = []
with open("concepts.txt", 'r') as f:
    lines = f.readlines()
    expected_concepts = [concept[:-1] for concept in lines]
    expected_concepts = [Concept(concept) for concept in expected_concepts]
    f.close()

print(expected_concepts)


[Punching, Welding line, Crescent Gap, Water spot, Oil spot, Silk spot, Inclusion, Rolled pit, Crease, Waist folding, metal surface defect, mechanical failure, drying, mechanical lubricant, temperature, pressure, work roll damage, tension roll damage, local yield, low-carbo]


In [28]:
def get_concept_ratio(pipeline : Pipeline, excepted_concepts : List[Concept]) -> tuple:
    """
    Calculate the ratio of expected and unexpected concepts in a given pipeline.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline object containing concepts.
    expected_concepts : List[Concept]
        A list of expected concepts.

    Returns
    -------
    Tuple[float, float]: A tuple containing:
        The percentage of expected concepts found in the pipeline.
        The percentage of unexpected concepts in the pipeline.
    """
    concepts = pipeline.kr.concepts
    expected_concept_occ = 0
    for expected_concept in expected_concepts:
        if any([expected_concept.label == concept.label for concept in concepts]):
            expected_concept_occ += 1

    unexpected_concept_occ = (len(concepts) - expected_concept_occ)

    return (
        expected_concept_occ*100/len(expected_concepts), 
        unexpected_concept_occ*100/len(concepts)
    )

# Create several pipelines with different components and parameters.


In [7]:
pipelines = []

## Manuel cancidat extraction pipeline


In [8]:
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_concepts import CTsToConceptExtraction

from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_relations import CTsToRelationExtraction


# concept extraction component
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

relations = [
    "described",
    "explaining",
    "appears",
    "leads",
    "resulting",
    "changed",
    "produced",
    "drying",
    "caused",
    "affect",
    "appearing",
    "lies",
    "distributed",
    "accompanied",
    "showing",
    "pressed",
    "occurred",
    "circumvented",
    "detected",
    "tracked",
    "results",
    "like",
    "mainly",
    "uncoiling"
]

ct_concept_label = { concept : {concept} for concept in concepts}

manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)

concept_extraction = CTsToConceptExtraction(
)
# concept extraction component

ct_relation_label = { relation : {relation} for relation in relations}

manuel_relation_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_relation_label
)

relation_extraction = CTsToRelationExtraction(
)

pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            manuel_concept_extraction,
            concept_extraction,
            manuel_relation_extraction,
            relation_extraction,
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)



In [26]:
current_pipeline = pipelines[0]
current_pipeline.run()

concepts = current_pipeline.kr.concepts

expected_concept_occ = sum([ 
        any([expected_concept.label == concept.label for concept in concepts])
        for  expected_concept in expected_concepts
    ])

unexpected_concept_occ = len(concepts) - expected_concept_occ

print(expected_concept_occ*100/len(expected_concepts))
print(unexpected_concept_occ*100/len(concepts))

20.0
98.96103896103897


In [31]:
get_concept_ratio(current_pipeline, expected_concepts)

(20.0, 98.96103896103897)

## Manuel concepts extraction ; TFIDF relation extraction

In [10]:
from typing import Set
from spacy.matcher import Matcher
from olaf.data_container.relation_schema import Relation
from olaf.pipeline.pipeline_component.term_extraction.tfidf_term_extraction import (
    TFIDFTermExtraction,
)

pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            manuel_concept_extraction,
            concept_extraction,
            manuel_relation_extraction,
            TFIDFTermExtraction(max_term_token_length=3),
            relation_extraction,
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

                By default the system will use the entire content of the document.]


## Manuel concepts Extraction ; TFIDF relation exttraction , spacy match postprocessing function

In [11]:

def relation_postprocessor(relations : Set[Relation], nlp=nlp) -> Set[Relation]:
    correct_relations = set()
    relation_patterns = [
        [{"POS": "AUX", "DEP": "ROOT"}],
        [{"POS": "AUX", "OP": "?"}, {"POS": "ADV", "OP": "?"},{"POS": "VERB"}, {"POS": "ADP", "OP": "?"}],
        [{"POS": "AUX"}, {"POS": "ADJ", "OP": "+"}, {"POS": "ADP"}],
        [{"POS": "AUX"}, {"POS": "VERB", "OP": "+"}, {"POS": "ADP", "OP": "?"}],
        ]
    matcher = Matcher(nlp.vocab)

    matcher.add("REALTION_PATTERN", relation_patterns)

    for relation in relations:
        relation_doc = nlp(relation.label)
        matches = matcher(relation_doc)
        if any(
            len(relation_doc[start_idx:end_idx]) == len(relation_doc)
            for _, start_idx, end_idx in matches
        ):
            correct_relations.add(relation)
    
    return correct_relations


pipelines.append(
        Pipeline(
        spacy_model=spacy.load("en_core_web_sm"),
        pipeline_components=[
            ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
            concept_extraction,
            TFIDFTermExtraction(max_term_token_length=4, cts_post_processing_functions=[relation_postprocessor]),
            relation_extraction,
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

                By default the system will use the entire content of the document.]


## Manuel concepts Extraction ; TFIDF relation exttraction , spacy match postprocessing function, synonyms relation Enrichment

In [12]:
pipelines.append(
    Pipeline(
        spacy_model=spacy.load("en_core_web_lg"),
        pipeline_components=[
            ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
            CTsToConceptExtraction(),
            TFIDFTermExtraction(
                max_term_token_length=4,
                cts_post_processing_functions=[relation_postprocessor],
            ),
            SynonymRelationExtraction(),
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

                By default the system will use the entire content of the document.]


## Manuel concepts Extraction ; TFIDF relation exttraction , spacy match postprocessing function, Agglomerative clustering relation Enrichment

In [13]:
pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
            CTsToConceptExtraction(),
            TFIDFTermExtraction(
                max_term_token_length=4,
                cts_post_processing_functions=[relation_postprocessor],
            ),
            AgglomerativeClusteringRelationExtraction(
                distance_threshold=1
            ),
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

                By default the system will use the entire content of the document.]


## Manuel concepts Extraction, Agglomerative clustering concept Enrichment ; TFIDF relation exttraction , spacy match postprocessing function, Agglomerative clustering relation Enrichment

In [14]:
pipelines.append(
    Pipeline(
        spacy_model=nlp,
        pipeline_components=[
            ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
            AgglomerativeClusteringConceptExtraction(
                distance_threshold=.1
                
            ),
            TFIDFTermExtraction(
                max_term_token_length=4,
                cts_post_processing_functions=[relation_postprocessor],
            ),
            AgglomerativeClusteringRelationExtraction(
                distance_threshold=.5
            ),
        ],
        corpus=list(nlp.pipe(corpus)),
    )
)

                By default the system will use the entire content of the document.]


# Run all the pipelines.