In [1]:
import spacy
from typing import Set

from olaf import Pipeline
from olaf.data_container.relation_schema import Relation
from olaf.data_container.knowledge_representation_schema import KnowledgeRepresentation
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    CTsToConceptExtraction,
    CTsToRelationExtraction,
    SynonymRelationExtraction,
    SynonymConceptExtraction,
    AgglomerativeClusteringRelationExtraction,
    AgglomerativeClusteringConceptExtraction
)
from olaf.pipeline.pipeline_component.term_extraction.tfidf_term_extraction import (
    TFIDFTermExtraction,
)
from olaf.pipeline.pipeline_component.term_extraction.manual_candidate_terms import (
    ManualCandidateTermExtraction,
)
from olaf.repository.corpus_loader.text_corpus_loader import TextCorpusLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
corpus_path = "GC10-DET_doc.txt"
# NOTE(review): `_read_corpus` is a private method of TextCorpusLoader; prefer
# the loader's public entry point if it has one.
raw_documents = TextCorpusLoader(corpus_path)._read_corpus()
# Remove the end-of-line marker only when it is actually there. The previous
# unconditional `doc[:-1]` also cut the last real character of any document
# that does not end with a newline (presumably the last line of the file —
# confirm against the loader's output).
corpus = [doc[:-1] if doc.endswith("\n") else doc for doc in raw_documents]
corpus

['Each defect type is described in detail, explaining how it appears on the steel strip surface and the reasons behind its occurrence:',
 '    Punching: In the production line of the strip, the steel strip needs to be punched according to the product specifications; mechanical failure may lead to unwanted punching, resulting in punching defects.',
 '    Welding line: When the strip is changed, it is necessary to weld the two coils of the strip, and the weld line is produced. Strictly speaking, this is not a defect, but it needs to be automatically detected and tracked to be circumvented in subsequent cuts.',
 '    Crescent gap: In the production of steel strip, cutting sometimes results in defects, just like half a circle.',
 '    Water spot: A water spot is produced by drying in production. Under different products and processes, the requirements for this defect are different. However, because the water spots are generally with low contrast, and are similar to other defects such as oi

In [3]:
# Load the `en_core_web_lg` spaCy pipeline; this `nlp` object is reused by the
# later cells to parse the corpus and the probe sentences.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Hand-written concept labels. Each one becomes a key of `ct_concept_label`,
# mapped to a set containing only the label itself.
concepts = [
    "defect type",
    "steel strip surface",
    "punching",
    "mechanical failure",
    "welding line",
    "coil",
    "weld line",
    "crescent gap",
    "cutting",
    "water spot",
    "drying",
    "oil spot",
    "mechanical lubricant",
    "silk spot",
    "plaque",
    "strip surface",
    "roller",
    "pressure",
    "temperature",
    "inclusion",
    "metal surface",
    "spots",
    "fish scale shape",
    "block irregular distribution",
    "rolled pit",
    "bulges",
    "pits",
    "steel plate",
    "work roll",
    "tension roll",
    "damage",
    "reason",
    "crease",
    "fold",
    "uncoiling process",
    "waist folding",
    "deformation",
    "low-carbon",
]

# Hand-written relation labels. Each one becomes a key of `ct_relation_label`,
# mapped to a set containing only the label itself.
# NOTE(review): "drying" is listed both here and in `concepts` — confirm that
# the overlap is intended.
relations = [
    "described",
    "explaining",
    "appears",
    "leads",
    "resulting",
    "changed",
    "produced",
    "drying",
    "caused",
    "affect",
    "appearing",
    "lies",
    "distributed",
    "accompanied",
    "showing",
    "pressed",
    "occurred",
    "circumvented",
    "detected",
    "tracked",
    "results",
    "like",
    "mainly",
    "uncoiling",
]

In [5]:
# Each manual label is its own (and only) matching string: label -> {label}.
ct_concept_label = {label: {label} for label in concepts}
ct_relation_label = {label: {label} for label in relations}

# Candidate-term extraction driven by the hand-written concept labels.
manuel_concept_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_concept_label
)
# concept extraction component
concept_extraction = CTsToConceptExtraction()

# Candidate-term extraction driven by the hand-written relation labels.
manuel_relation_extraction = ManualCandidateTermExtraction(
    ct_label_strings_map=ct_relation_label
)
# NOTE(review): `concept_max_distance=3` presumably bounds how far a relation
# term may sit from a concept — confirm the meaning and unit in OLAF.
relation_extraction = CTsToRelationExtraction(concept_max_distance=3)

In [6]:
def display_concept(kr: KnowledgeRepresentation) -> None:
    """Print the label of every concept held by ``kr``.

    Writes the header ``Concepts in KR:`` followed by one label per line, in
    the iteration order of ``kr.concepts``.

    Args:
        kr: Object exposing ``concepts``, an iterable of items with a
            ``label`` attribute.
    """
    print("Concepts in KR:")
    # Labels are produced lazily, one at a time, exactly as they are printed.
    for concept_label in (item.label for item in kr.concepts):
        print(concept_label)


def display_relation(kr: KnowledgeRepresentation) -> None:
    """Print each fully linked relation of ``kr`` as a triple.

    Writes the header ``Relations in KR:`` and then one
    ``(source label, relation label, destination label)`` tuple per relation
    that has both a source and a destination concept. Relations missing
    either end are skipped.

    Args:
        kr: Object exposing ``relations``, an iterable of items with
            ``label``, ``source_concept`` and ``destination_concept``
            attributes.
    """
    print("Relations in KR:")
    for relation in kr.relations:
        source = relation.source_concept
        destination = relation.destination_concept
        # Both ends are dereferenced below, so both must be present. The
        # former `or` test let half-linked relations through and then failed
        # on `None.label`.
        if source is None or destination is None:
            continue
        print((source.label, relation.label, destination.label))

In [7]:
# Baseline: manual concept terms, manual relation terms plus TF-IDF terms
# (at most 3 tokens), then relation extraction from the candidate terms.
pipeline_steps = [
    manuel_concept_extraction,
    concept_extraction,
    manuel_relation_extraction,
    TFIDFTermExtraction(max_term_token_length=3),
    relation_extraction,
]
# The corpus is parsed once, up front, with the shared `nlp` model.
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

                By default the system will use the entire content of the document.]


In [8]:
# Print the (source, label, destination) triples produced by the baseline run.
display_relation(olaf_pipeline.kr)

Relations in KR:
('crease', 'crease is a', 'fold')
('pits', 'bulges or', 'pits')
('rolled pit', ': rolled pits', 'bulges')
('crease', 'a', 'crease')
('reason', 'lies in the', 'temperature')
('pits', 'periodic', 'bulges')
('oil spot', ':', 'oil spot')
('crease', 'vertical transverse', 'fold')
('rolled pit', 'pit : rolled', 'pits')
('reason', 'in the uneven', 'temperature')
('tension roll', 'roll', 'damage')
('water spot', ': a', 'water spot')
('crease', 'is a vertical', 'fold')
('rolled pit', 'rolled pits are', 'bulges')
('reason', 'the uneven temperature', 'roller')
('plaque', 'a', 'strip surface')
('pits', 'surface of', 'steel plate')
('pits', 'are periodic bulges', 'pits')
('plaque', 'on', 'strip surface')
('crease', 'a', 'fold')
('plaque', 'on a', 'strip surface')
('pits', 'periodic bulges or', 'pits')
('rolled pit', 'rolled', 'pits')
('temperature', 'of the roller', 'pressure')
('reason', 'to', 'low-carbon')
('metal surface', ', usually', 'spots')
('temperature', 'the roller and', 

In [9]:
# Collect the label of every relation in the KR (no filtering on linked
# concepts) and print each one with its position in the list.
relation_found = [relation.label for relation in olaf_pipeline.kr.relations]
for idx, rel in enumerate(relation_found):
    print(idx, rel)

0 local or continuous
1 detail
2 a
3 the weld
4 damage .
5 crease is a
6 needs to be
7 ) ,
8 other
9 bulges or
10 however , because
11 described
12 product specifications
13 usually caused by
14 : rolled pits
15 , or
16 a
17 lubricant
18 lies in the
19 local ) ,
20 periodic
21 :
22 the surface
23 , because the
24 defects
25 as
26 continuous wave -
27 defect type is
28 strip is changed
29 folds
30 weld line
31 local or
32 vertical transverse
33 pit : rolled
34 fall
35 steel strip
36 in the uneven
37 strip needs
38 strip
39 roll
40 is changed ,
41 contamination
42 little
43 due
44 punctate
45 defect , but
46 pits on
47 processes
48 : a
49 is a vertical
50 caused by the
51 and is
52 of a
53 the water spots
54 line is
55 continuous wave
56 a local or
57 rolled pits are
58 it is necessary
59 parts
60 the uneven temperature
61 : a crease
62 may
63 throughout the strip
64 a
65 because the water
66 , cutting
67 waist folding
68 detected
69 type is described
70 , it is
71 (
72 usually
73 uncoil

In [10]:
# Token patterns for spaCy's Matcher; each inner list is one alternative.
relation_patterns = [
    # A single auxiliary that is the dependency root.
    [{"POS": "AUX", "DEP": "ROOT"}],
    # Optional adverb, a verb, optional adposition.
    [{"POS": "ADV", "OP": "?"},{"POS": "VERB"}, {"POS": "ADP", "OP": "?"}],
    # Auxiliary, one or more adjectives, then an adposition.
    [{"POS": "AUX"}, {"POS": "ADJ", "OP": "+"}, {"POS": "ADP"}],
    # Auxiliary, one or more verbs, optional adposition.
    [{"POS": "AUX"}, {"POS": "VERB", "OP": "+"}, {"POS": "ADP", "OP": "?"}],
    ]
# Two short sentences used to inspect the tags that the patterns rely on.
rel_doc = nlp('Inclusion is a typical defect. water spot is produced by drying')

# Show text, coarse part-of-speech and dependency label of every token.
for token in rel_doc:
    print(token.text, token.pos_, token.dep_)

Inclusion NOUN nsubj
is AUX ROOT
a DET det
typical ADJ amod
defect NOUN attr
. PUNCT punct
water NOUN compound
spot NOUN nsubjpass
is AUX auxpass
produced VERB ROOT
by ADP prep
drying VERB pcomp


In [11]:
from spacy.matcher import Matcher

# Single-pattern matcher: an auxiliary token that is the dependency root.
aux_pattern = [
    [{"POS": "AUX", "DEP": "ROOT"}],
    ]
matcher = Matcher(nlp.vocab)

matcher.add("AUX_PATTERN", aux_pattern)

# Run the matcher on the probe sentences and inspect every hit.
matches = matcher(rel_doc)
for (match_id, start_idx, end_idx) in matches:
    # Copy the matched span into its own Doc before reading its first token.
    matched_doc = rel_doc[start_idx:end_idx].as_doc()
    print(matched_doc, matched_doc[0].pos_, matched_doc[0].dep_)



is  AUX ROOT


In [12]:
# Rebuild the matcher, this time with every alternative of `relation_patterns`.
matcher = Matcher(nlp.vocab)

# NOTE(review): the rule key is spelled "REALTION_PATTERN" (sic).
matcher.add("REALTION_PATTERN", relation_patterns)

# TODO: the draft loop meant to run this matcher over every label in
# `relation_found` was left unfinished.

In [13]:
# Try the matcher on one extracted label; every matching span is printed,
# sub-spans of the phrase included.
doc = nlp('usually caused by')
matches = matcher(doc)
for (match_id, start_idx, end_idx) in matches:
    print(doc[start_idx:end_idx])


usually caused
usually caused by
caused
caused by


In [14]:


def _default_relation_nlp():
    """Return the ``en_core_web_sm`` pipeline, loading it on first use only."""
    model = getattr(_default_relation_nlp, "_model", None)
    if model is None:
        model = spacy.load("en_core_web_sm")
        # Cached on the function object so later calls reuse the same model.
        _default_relation_nlp._model = model
    return model


def relation_postprocessor(relations: Set[Relation], nlp=None) -> Set[Relation]:
    """Keep only the items whose whole label matches a relation pattern.

    Each label is parsed with ``nlp`` and kept when at least one match of the
    token patterns below spans the entire parsed label.

    Args:
        relations: Items exposing a ``label`` string.
            NOTE(review): this function is passed to
            ``TFIDFTermExtraction(cts_post_processing_functions=...)``, so it
            presumably receives candidate terms rather than ``Relation``
            objects — confirm and adjust the annotation if so.
        nlp: spaCy pipeline used to parse the labels. When omitted, the
            ``en_core_web_sm`` model is loaded on the first call and reused
            afterwards. It used to be loaded in the signature, i.e. when the
            function was defined.

    Returns:
        A new set holding the accepted items; the input set is not modified.
    """
    if nlp is None:
        nlp = _default_relation_nlp()

    relation_patterns = [
        # A single auxiliary that is the dependency root.
        [{"POS": "AUX", "DEP": "ROOT"}],
        # Optional auxiliary, optional adverb, a verb, optional adposition.
        [{"POS": "AUX", "OP": "?"}, {"POS": "ADV", "OP": "?"}, {"POS": "VERB"}, {"POS": "ADP", "OP": "?"}],
        # Auxiliary, one or more adjectives, then an adposition.
        [{"POS": "AUX"}, {"POS": "ADJ", "OP": "+"}, {"POS": "ADP"}],
        # Auxiliary, one or more verbs, optional adposition.
        [{"POS": "AUX"}, {"POS": "VERB", "OP": "+"}, {"POS": "ADP", "OP": "?"}],
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("REALTION_PATTERN", relation_patterns)

    correct_relations = set()
    for relation in relations:
        relation_doc = nlp(relation.label)
        label_length = len(relation_doc)
        # A match whose length equals the document length covers every token.
        if any(
            end_idx - start_idx == label_length
            for _, start_idx, end_idx in matcher(relation_doc)
        ):
            correct_relations.add(relation)

    return correct_relations

In [15]:

# TF-IDF relation terms (at most 4 tokens) filtered by `relation_postprocessor`.
# NOTE(review): the pipeline is given `en_core_web_sm`, while the corpus below
# is parsed with `nlp` (loaded from `en_core_web_lg`) — confirm that mixing the
# two models is intended.
pipeline_model = spacy.load("en_core_web_sm")
# `concept_extraction` and `relation_extraction` are the instances created
# earlier in the notebook, reused here.
pipeline_steps = [
    ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
    concept_extraction,
    TFIDFTermExtraction(
        max_term_token_length=4,
        cts_post_processing_functions=[relation_postprocessor],
    ),
    relation_extraction,
]
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=pipeline_model,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

                By default the system will use the entire content of the document.]


In [16]:
# List the concept labels present in the KR after the filtered TF-IDF run.
display_concept(olaf_pipeline.kr)

Concepts in KR:
mechanical lubricant
block irregular distribution
plaque
uncoiling process
pits
punching
fold
low-carbon
spots
bulges
pressure
drying
welding line
steel strip surface
silk spot
crease
mechanical failure
defect type
metal surface
steel plate
tension roll
waist folding
temperature
fish scale shape
inclusion
cutting
roller
strip surface
crescent gap
rolled pit
oil spot
damage
work roll
weld line
reason
deformation
water spot


In [17]:
# Print the (source, label, destination) triples of the filtered TF-IDF run.
display_relation(olaf_pipeline.kr)

Relations in KR:
('mechanical failure', 'lead to', 'punching')
('work roll', 'roll', 'tension roll')
('rolled pit', 'rolled', 'pits')
('punching', 'resulting', 'punching')
('water spot', 'is produced by', 'drying')
('reason', 'is due to', 'low-carbon')
('tension roll', 'roll', 'damage')
('water spot', 'produced', 'drying')
('water spot', 'produced by', 'drying')
('pits', 'are', 'bulges')
('mechanical failure', 'may lead to', 'punching')
('mechanical failure', 'lead', 'punching')
('water spot', 'is produced', 'drying')
('mechanical failure', 'may lead', 'punching')
('water spot', 'is', 'drying')
('punching', 'resulting in', 'punching')
('reason', 'lies in', 'temperature')


In [18]:
# Print the label of every relation in the KR, whether or not it is linked to
# concepts.
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

caused by
usually showing
spacing
automatically detected
lead to
may appear
described in
pressed into
is often accompanied by
appear on
may
folds in
mistake
fall off
usually detected
roll
rolled
rolled
be
indicating
are usually detected
resulting
block
pockmarked
are distributed
speaking
punched
according to
defect
is produced by
are pressed
is due to
roll
produced
be circumvented
is produced
usually detected by
detected
are pressed into
often accompanied
fold
detected by
be punched according to
produced by
is uneven in
caused
punctate
are usually detected by
uncoiling
folding
folds
is often accompanied
produced
needs
distributed throughout
explaining
is usually caused
sometimes results
often accompanied by
affect
usually caused by
are distributed throughout
punching
usually caused
appear
appears
drying
distributed
be punched according
are
is described in
be punched
be automatically detected
weld
will
appears on
are
tracked
sometimes results in
circumvented
may lead to
pressed
moving
b

# TFIDF - Enrichment by synonyms

In [19]:
# Same candidate terms as before, but relations are built by
# SynonymRelationExtraction.
# NOTE(review): this loads a second copy of `en_core_web_lg` instead of reusing
# `nlp`, and the recorded run of this cell ended with a KeyboardInterrupt.
pipeline_model = spacy.load("en_core_web_lg")
pipeline_steps = [
    ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
    CTsToConceptExtraction(),
    TFIDFTermExtraction(
        max_term_token_length=4,
        cts_post_processing_functions=[relation_postprocessor],
    ),
    SynonymRelationExtraction(),
]
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=pipeline_model,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

                By default the system will use the entire content of the document.]


KeyboardInterrupt: 

In [None]:
# Print the relation triples of the synonym-enrichment pipeline.
# NOTE(review): the preceding run was interrupted (KeyboardInterrupt), so the
# recorded output may come from a partial or earlier run — re-run to confirm.
display_relation(olaf_pipeline.kr)

Relations in KR:
('reason', 'is', 'low-carbon')
('water spot', 'is produced', 'drying')
('rolled pit', 'are', 'bulges')
('rolled pit', 'rolled', 'pits')
('water spot', 'produced', 'drying')
('tension roll', 'roll', 'damage')
('mechanical failure', 'may lead', 'punching')
('work roll', 'roll', 'tension roll')
('crease', 'is', 'fold')
('pits', 'are', 'bulges')
('mechanical failure', 'lead to', 'punching')
('crease', 'transverse', 'fold')
('punching', 'resulting in', 'punching')
('pits', 'are', 'pits')
('reason', 'is due to', 'low-carbon')
('deformation', 'is', 'reason')
('oil spot', 'caused by', 'mechanical lubricant')
('mechanical failure', 'lead', 'punching')
('rolled pit', 'are', 'pits')
('water spot', 'produced by', 'drying')
('water spot', 'is produced by', 'drying')
('metal surface', 'showing', 'spots')
('mechanical failure', 'may lead to', 'punching')
('inclusion', 'defect', 'metal surface')
('metal surface', 'usually showing', 'spots')
('punching', 'resulting', 'punching')
('reas

In [None]:
# Print the label of every relation in the KR, whether or not it is linked to
# concepts.
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

is
folds in
accompanied by
described
mistake
is often accompanied
be punched
spacing
be punched according to
pressed into
drying in
appears
be circumvented
circumvented in
mainly caused by
often accompanied by
affect
be circumvented in
explaining
folds
tracked to
is produced
weld
is produced
are
punched
are distributed
rolled
detected
be punched according
are usually detected
automatically detected
produced
are pressed
be
roll
welding
may lead
roll
is
are pressed into
appear on
are
fall off
lead to
transverse
tracked
resulting in
punching
folding
sometimes results
are
speaking
described in
pockmarked
cutting
are
is due to
is described
punctate
is
defect
usually detected by
caused by
lead
spacing across
fall
pressed
may
are
block
produced by
is produced by
are similar to
mainly caused
appear
needs
fold
showing
be automatically detected
may lead to
distributed throughout
is uneven in
is often accompanied by
moving
defect
usually caused
usually showing
are distributed throughout
resulting

# relation : TFIDF - Enrichment by agglomerative clustering

In [None]:

# Relations are now built by AgglomerativeClusteringRelationExtraction with
# distance_threshold=1.
# NOTE(review): the pipeline is given `en_core_web_sm`, while the corpus below
# is parsed with `nlp` (loaded from `en_core_web_lg`) — confirm that mixing the
# two models is intended.
pipeline_model = spacy.load("en_core_web_sm")
pipeline_steps = [
    ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
    CTsToConceptExtraction(),
    TFIDFTermExtraction(
        max_term_token_length=4,
        cts_post_processing_functions=[relation_postprocessor],
    ),
    AgglomerativeClusteringRelationExtraction(distance_threshold=1),
]
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=pipeline_model,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

                By default the system will use the entire content of the document.]


In [None]:
# Print the relation triples obtained with clustered relations.
display_relation(olaf_pipeline.kr)

Relations in KR:
('tension roll', 'roll', 'damage')
('work roll', 'roll', 'damage')
('rolled pit', 'rolled', 'bulges')
('oil spot', 'is usually caused by', 'mechanical lubricant')
('reason', 'lies in', 'temperature')
('rolled pit', 'rolled', 'pits')
('mechanical failure', 'may lead', 'punching')
('water spot', 'is produced', 'drying')
('work roll', 'roll', 'tension roll')
('crease', 'transverse', 'fold')
('metal surface', 'showing', 'spots')
('reason', 'is', 'low-carbon')
('deformation', 'is', 'reason')
('pits', 'are', 'pits')
('inclusion', 'defect', 'metal surface')
('pits', 'are', 'bulges')
('punching', 'resulting', 'punching')


# concept : TFIDF - Enrichment by agglomerative clustering

In [None]:

# Concepts now come from AgglomerativeClusteringConceptExtraction
# (distance_threshold=.1) instead of CTsToConceptExtraction; relations are
# clustered with distance_threshold=.5.
pipeline_steps = [
    ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
    AgglomerativeClusteringConceptExtraction(distance_threshold=.1),
    TFIDFTermExtraction(
        max_term_token_length=4,
        cts_post_processing_functions=[relation_postprocessor],
    ),
    AgglomerativeClusteringRelationExtraction(distance_threshold=.5),
]
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

# Number of concepts left after clustering.
print(len(olaf_pipeline.kr.concepts))

                By default the system will use the entire content of the document.]


37


In [None]:
# List the concept labels produced by the clustered-concept pipeline.
display_concept(olaf_pipeline.kr)

Concepts in KR:
welding line
uncoiling process
punching
fish scale shape
mechanical failure
deformation
weld line
steel strip surface
cutting
strip surface
drying
block irregular distribution
pressure
work roll
pits
defect type
rolled pit
spots
steel plate
oil spot
tension roll
damage
water spot
mechanical lubricant
inclusion
waist folding
temperature
crescent gap
bulges
fold
reason
silk spot
metal surface
plaque
crease
roller
low-carbon


In [None]:
# Print the relation triples produced by the clustered-concept pipeline.
display_relation(olaf_pipeline.kr)

Relations in KR:
('inclusion', 'defect', 'metal surface')
('work roll', 'roll', 'tension roll')
('mechanical failure', 'may', 'punching')
('pits', 'are', 'bulges')
('reason', 'is', 'low-carbon')
('rolled pit', 'are', 'pits')
('metal surface', 'showing', 'spots')
('reason', 'is due to', 'low-carbon')
('reason', 'lies in', 'temperature')
('rolled pit', 'are', 'bulges')
('mechanical failure', 'may lead', 'punching')
('mechanical failure', 'lead', 'punching')
('oil spot', 'caused by', 'mechanical lubricant')
('pits', 'are', 'pits')
('work roll', 'roll', 'damage')
('punching', 'resulting', 'punching')
('deformation', 'is', 'reason')
('crease', 'transverse', 'fold')
('water spot', 'is', 'drying')
('crease', 'is', 'fold')
('rolled pit', 'rolled', 'pits')
('rolled pit', 'rolled', 'bulges')
('tension roll', 'roll', 'damage')
('water spot', 'produced by', 'drying')


In [None]:
# Print the label of every relation in the KR, whether or not it is linked to
# concepts.
for relation in olaf_pipeline.kr.relations:
    print(relation.label)

cutting
defect
defect
roll
may
punctate
is changed
speaking
are
is
sometimes results
are
are similar to
distributed
uncoiling
showing
is due to
is produced
strictly speaking
are pressed
mainly caused
circumvented
moving
lies in
are
may lead
described in
lead
caused by
according to
indicating
is
mistake
are
needs
roll
often accompanied
folds
resulting
rolled
is
transverse
block
automatically detected
is
spacing
weld
is
will affect
may
explaining
punched
rolled
rolled
roll
drying
appears on
will
be
fall off
produced by
tracked to
pockmarked


## using different scope

In [None]:

# Same pipeline as the previous clustering experiment, except that the concept
# clustering distance_threshold is .2 instead of .1.
# NOTE(review): the section heading mentions a different "scope", but only the
# threshold differs in this cell — confirm what was meant.
pipeline_steps = [
    ManualCandidateTermExtraction(ct_label_strings_map=ct_concept_label),
    AgglomerativeClusteringConceptExtraction(distance_threshold=.2),
    TFIDFTermExtraction(
        max_term_token_length=4,
        cts_post_processing_functions=[relation_postprocessor],
    ),
    AgglomerativeClusteringRelationExtraction(distance_threshold=.5),
]
parsed_corpus = list(nlp.pipe(corpus))

olaf_pipeline = Pipeline(
    spacy_model=nlp,
    pipeline_components=pipeline_steps,
    corpus=parsed_corpus,
)
olaf_pipeline.run()

# Concept count first, then the linked relation triples.
print(len(olaf_pipeline.kr.concepts))
display_relation(olaf_pipeline.kr)

                By default the system will use the entire content of the document.]


36
Relations in KR:
('rolled pit', 'are', 'pits')
('mechanical failure', 'lead to', 'punching')
('reason', 'lies in', 'temperature')
('inclusion', 'defect', 'metal surface')
('water spot', 'is', 'drying')
('tension roll', 'roll', 'damage')
('crease', 'is', 'fold')
('work roll', 'roll', 'tension roll')
('mechanical failure', 'lead', 'punching')
('metal surface', 'showing', 'spots')
('work roll', 'roll', 'damage')
('mechanical failure', 'may', 'punching')
('reason', 'is due to', 'low-carbon')
('rolled pit', 'are', 'bulges')
('punching', 'resulting', 'punching')
('oil spot', 'caused by', 'mechanical lubricant')
('rolled pit', 'rolled', 'pits')
('pits', 'are', 'bulges')
('rolled pit', 'rolled', 'bulges')
('deformation', 'is', 'reason')
('crease', 'transverse', 'fold')
('water spot', 'is produced', 'drying')
('pits', 'are', 'pits')
('reason', 'is', 'low-carbon')
