# OLAF : creating a simple pipeline from a printer technical documentation

In [22]:
import spacy

## Import all necessary items from the olaf package

In [23]:
from olaf import Pipeline
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import CTsToConceptExtraction, CTsToRelationExtraction
from olaf.pipeline.pipeline_component.axiom_extraction.owl_axiom_extraction import OWLAxiomExtraction
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.repository.serialiser import BaseOWLSerialiser
from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

## Load the spacy language model according to the corpus

In [24]:
# installation of french spacy model
!spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [25]:
# installation of french spacy model
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.7.0
    Uninstalling en-core-web-sm-3.7.0:
      Successfully uninstalled en-core-web-sm-3.7.0
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [26]:

# spacy_sm_model = spacy.load("fr_core_news_sm")  # load a small size french model
# spacy_md_model = spacy.load("fr_core_news_md")  # load a meduim size french model
spacy_sm_model = spacy.load("en_core_web_sm")

In [27]:
def display_concept(kr : KnowledgeRepresentation) -> None:
    print("Concepts in KR:")
    for concept in kr.concepts:
        print(concept.label)

def display_relation(kr : KnowledgeRepresentation) -> None:
    print("Relations in KR:")
    for relation in kr.relations:
        if (
            relation.source_concept is not None
            or relation.destination_concept is not None
        ):
            print((relation.source_concept.label, relation.label, relation.destination_concept.label))

## Load the corpus

In [28]:
corpus_path = 'GC10-DET_doc.txt'
corpus = TextCorpusLoader(corpus_path)._read_corpus()
corpus = [ doc[:-1] for doc in corpus]
corpus

['Each defect type is described in detail, explaining how it appears on the steel strip surface and the reasons behind its occurrence:',
 '    Punching: In the production line of the strip, the steel strip needs to be punched according to the product specifications; mechanical failure may lead to unwanted punching, resulting in punching defects.',
 '    Welding line: When the strip is changed, it is necessary to weld the two coils of the strip, and the weld line is produced. Strictly speaking, this is not a defect, but it needs to be automatically detected and tracked to be circumvented in subsequent cuts.',
 '    Crescent gap: In the production of steel strip, cutting sometimes results in defects, just like half a circle.',
 '    Water spot: A water spot is produced by drying in production. Under different products and processes, the requirements for this defect are different. However, because the water spots are generally with low contrast, and are similar to other defects such as oi

### Concept Extraction

In [29]:

# POS tags to select concepts in the corpus
# here used nouns as concept
concept_pos_selection= ["NOUN"]

# Setting the parameters of the component
my_term_extract_concept = POSTermExtraction(pos_selection=concept_pos_selection)

# Concept extraction based on validation of candidate terms found above.
my_concept_extraction = CTsToConceptExtraction()

               By default the system will use the entire content of the document.]


### Relation Extraction

In [30]:
# POS tags to select relation in the corpus
# here used verbs as relation
relation_pos_selection= ["VERB"]

# Setting the parameters of the component
my_term_extract_relation = POSTermExtraction(pos_selection=relation_pos_selection)

# relation extraction based on validation of candidate terms found above.
my_relation_extraction = CTsToRelationExtraction(concept_max_distance=3)

               By default the system will use the entire content of the document.]


## Manuel Candidat Extraction Pipeline

In [31]:
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_concepts import CTsToConceptExtraction

from olaf.pipeline.pipeline_component.concept_relation_extraction.candidate_terms_to_relations import CTsToRelationExtraction


# concept extraction component
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon"
]

relations = [
    "described",
    "explaining",
    "appears",
    "leads",
    "resulting",
    "changed",
    "produced",
    "drying",
    "caused",
    "affect",
    "appearing",
    "lies",
    "distributed",
    "accompanied",
    "showing",
    "pressed",
    "occurred",
    "circumvented",
    "detected",
    "tracked",
    "results",
    "like",
    "mainly",
    "uncoiling"
]

ct_concept_label = { concept : {concept} for concept in concepts}

manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)

concept_extraction = CTsToConceptExtraction(
)
# concept extraction component

ct_relation_label = { relation : {relation} for relation in relations}

manuel_relation_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_relation_label
)

relation_extraction = CTsToRelationExtraction(
)





In [32]:
olaf_pipeline = Pipeline(
    spacy_model=spacy_sm_model,
    pipeline_components=[
        manuel_concept_extraction,
        concept_extraction,
        manuel_relation_extraction,
        relation_extraction,
    ],
    corpus=list(spacy_sm_model.pipe(corpus)),
)

olaf_pipeline.run()

In [33]:
display_concept(olaf_pipeline.kr)

Concepts in KR:
roller
water spot
damage
oil spot
mechanical failure
plaque
uncoiling process
defect type
bulges
crease
steel strip surface
deformation
pressure
welding line
strip surface
crescent gap
block irregular distribution
fish scale shape
steel plate
cutting
silk spot
waist folding
inclusion
fold
drying
low-carbon
work roll
pits
metal surface
mechanical lubricant
weld line
rolled pit
spots
punching
tension roll


In [34]:
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

distributed
like
resulting
showing
pressed
lies
produced
drying
explaining
mainly
circumvented
detected
results
accompanied
uncoiling
changed
tracked
described
appears
affect
caused
produced


## TFIDF

In [35]:
from typing import Set
from spacy.matcher import Matcher
from olaf.data_container.relation_schema import Relation

In [45]:

def relation_postprocessor(relations : Set[Relation], nlp=spacy.load("en_core_web_sm")) -> Set[Relation]:
    correct_relations = set()
    relation_patterns = [
        [{"POS": "AUX", "DEP": "ROOT"}],
        [{"POS": "AUX", "OP": "?"}, {"POS": "ADV", "OP": "?"},{"POS": "VERB"}, {"POS": "ADP", "OP": "?"}],
        [{"POS": "AUX"}, {"POS": "ADJ", "OP": "+"}, {"POS": "ADP"}],
        [{"POS": "AUX"}, {"POS": "VERB", "OP": "+"}, {"POS": "ADP", "OP": "?"}],
        ]
    matcher = Matcher(nlp.vocab)

    matcher.add("REALTION_PATTERN", relation_patterns)

    for relation in relations:
        relation_doc = nlp(relation.label)
        matches = matcher(relation_doc)
        if any(
            len(relation_doc[start_idx:end_idx]) == len(relation_doc)
            for _, start_idx, end_idx in matches
        ):
            correct_relations.add(relation)
    
    return correct_relations

In [46]:
from olaf.pipeline.pipeline_component.term_extraction.tfidf_term_extraction import (
    TFIDFTermExtraction,
)

relation_term_extraction = TFIDFTermExtraction(max_term_token_length=3, cts_post_processing_functions=[relation_postprocessor])

olaf_pipeline = Pipeline(
    spacy_model=spacy_sm_model,
    pipeline_components=[
        manuel_concept_extraction,
        concept_extraction,
        manuel_relation_extraction,
        relation_term_extraction,
        relation_extraction,
    ],
    corpus=list(spacy_sm_model.pipe(corpus)),
)

olaf_pipeline.run()

                By default the system will use the entire content of the document.]


In [38]:
display_concept(olaf_pipeline.kr)

Concepts in KR:
rolled pit
cutting
punching
pits
fish scale shape
deformation
metal surface
mechanical failure
work roll
spots
steel strip surface
bulges
oil spot
strip surface
weld line
block irregular distribution
mechanical lubricant
pressure
roller
low-carbon
waist folding
damage
tension roll
defect type
inclusion
uncoiling process
crescent gap
steel plate
drying
fold
silk spot
plaque
welding line
water spot
crease


In [47]:
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

is
defect
caused by
are pressed
appears on
described in
speaking
according
indicating
explaining
may appear
will
uncoiling
caused
is described
punched
is due to
needs
lead
roll
spacing across
pressed
are distributed throughout
lies in
automatically detected
sometimes results
roll
detected
according to
rolled
like
fall
are
punctate
caused by
is changed
be punched according
described
appears
fall off
folds
distributed throughout
drying in
often accompanied
usually caused by
lead to
be circumvented in
are usually detected
often accompanied by
produced
is usually caused
detected by
appear on
may lead to
changed
strictly speaking
transverse
be
are
usually detected
distributed
mainly
welding
appears
tracked
are
showing
is produced
may
affect
appear
spacing
is
resulting in
accompanied
caused
lies
produced
folds in
block
distributed
resulting
drying
usually showing
pockmarked
may lead
results
will affect
produced
produced by
rolled
weld
fold
tracked to
explaining
moving
produced
sometimes resu

In [48]:
display_relation(olaf_pipeline.kr)

Relations in KR:
('crease', 'is', 'fold')
('inclusion', 'defect', 'metal surface')
('mechanical failure', 'lead', 'punching')
('work roll', 'roll', 'damage')
('work roll', 'roll', 'tension roll')
('rolled pit', 'rolled', 'pits')
('rolled pit', 'are', 'bulges')
('oil spot', 'caused by', 'mechanical lubricant')
('oil spot', 'usually caused by', 'mechanical lubricant')
('mechanical failure', 'lead to', 'punching')
('water spot', 'produced', 'drying')
('mechanical failure', 'may lead to', 'punching')
('crease', 'transverse', 'fold')
('pits', 'are', 'pits')
('metal surface', 'showing', 'spots')
('mechanical failure', 'may', 'punching')
('punching', 'resulting in', 'punching')
('punching', 'resulting', 'punching')
('metal surface', 'usually showing', 'spots')
('mechanical failure', 'may lead', 'punching')
('water spot', 'produced', 'drying')
('water spot', 'produced by', 'drying')
('rolled pit', 'rolled', 'bulges')
('punching', 'resulting', 'punching')
('metal surface', 'showing', 'spots')
(

In [41]:
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

roll
pressed into
sometimes results
is uneven in
drying
is often accompanied
circumvented
tracked
detected
lead
pressed
spacing
produced by
are distributed throughout
needs
appear
according
are
caused by
may
punching
produced
pockmarked
are pressed
circumvented in
is
according to
distributed
explaining
are
be automatically detected
accompanied
caused by
appears
be
caused
block
accompanied
drying
appear on
showing
uncoiling
results
usually detected by
is produced
mainly caused
caused
mainly caused by
often accompanied by
usually caused by
affect
like
transverse
are usually detected
drying in
automatically detected
mistake
is produced
spacing across
may appear on
explaining
are
is described
rolled
will affect
are distributed
usually showing
defect
lead to
often accompanied
is
roll
fall off
folds
welding
distributed throughout
accompanied by
indicating
folds in
produced
tracked
circumvented
are pressed into
distributed
may
is changed
punched
are
usually detected
pressed
defect
appears on


## Let's find how to postprocess relation after TFIDF

In [42]:
doc = spacy_sm_model("")
doc



In [43]:
text = "the faillure is caused by rolled pits"

nlp = spacy.load("en_core_web_lg")

doc = nlp(text)
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head)

the DET det faillure
faillure NOUN nsubjpass caused
is AUX auxpass caused
caused VERB ROOT caused
by ADP agent caused
rolled VERB amod pits
pits NOUN pobj by


In [49]:
text = "Rolled pits are periodic bulges"

nlp = spacy.load("en_core_web_lg")

doc = nlp(" ".join(olaf_pipeline.kr.concepts))
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head)


TypeError: sequence item 0: expected str instance, Concept found

In [50]:
olaf_pipeline.kr.concepts

{<olaf.data_container.concept_schema.Concept at 0x781cffff4130>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff41c0>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4460>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4670>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff46d0>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4790>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4ac0>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4d90>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4ee0>,
 <olaf.data_container.concept_schema.Concept at 0x781cffff4fa0>,
 <olaf.data_container.concept_schema.Concept at 0x781d00067490>,
 <olaf.data_container.concept_schema.Concept at 0x781d0ce0c040>,
 <olaf.data_container.concept_schema.Concept at 0x781d0ce0c280>,
 <olaf.data_container.concept_schema.Concept at 0x781d0ce0dc30>,
 <olaf.data_container.concept_schema.Concept at 0x781d0dac6470>,
 <olaf.data_container.con