# A first pipeline

## Example text

In [None]:
# Read the example document into `text`.
# The encoding is passed explicitly so that accented characters decode the
# same way on every platform instead of depending on the locale default.
# NOTE(review): assumes example.txt is UTF-8 encoded — confirm.
with open('example.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# Show the raw example text held in `text`
print(text)

## Defining the spaCy pipeline

In [None]:
# Importing spaCy
import spacy

In [None]:
# Loading EDS-NLP pipelines
import edsnlp.components

In [None]:
# Start from a blank French spaCy pipeline
nlp = spacy.blank('fr')

# Pre-processing: normalise accents, case and other special characters,
# then detect sentence / line ends
nlp.add_pipe('normalizer')
nlp.add_pipe('sentences')

# Named-entity extraction: literal terms and regular expressions,
# configured with attr='NORM'
matcher_config = {
    'terms': {
        'respiratoire': [
            'difficultes respiratoires',
            'asthmatique',
            'toux',
        ],
    },
    'regex': {
        'covid': r'(?i)(?:infection\sau\s)?(covid[\s\-]?19|corona[\s\-]?virus)',
        'traitement': r'(?i)traitements?|medicaments?',
    },
    'attr': 'NORM',
}
nlp.add_pipe('matcher', config=matcher_config)

# Entity qualification: one component per qualifier
nlp.add_pipe('negation')
nlp.add_pipe('hypothesis')
nlp.add_pipe('family')
nlp.add_pipe('rspeech')

## Using the pipeline

In [None]:
# Apply the pipeline to the example text; the result is kept in `doc`
doc = nlp(text)

In [None]:
# Display the processed document
doc

---

Processing by EDS-NLP (and spaCy in general) is non-destructive:

In [None]:
# Non-destruction: compare the processed document's text with the original input string
doc.text == text

For tasks such as normalization, EDS-NLP adds attributes to tokens, without information loss:

In [None]:
# Normalisation: print each token's text next to its normalised form,
# as a two-column table whose first column is padded to 15 characters
for left, right in (('text', 'normalisation'), ('----', '-------------')):
    print(left.ljust(15), right)
for token in doc[3:15]:
    print(token.text.ljust(15), token.norm_)

The pipeline we defined above extracted named entities using the `matcher` component.

Since EDS-NLP builds on spaCy, we can use spaCy's own utilities, such as `displacy`, to visualise the entities:

In [None]:
from spacy import displacy

In [None]:
# Highlight the extracted entities, with one colour per entity label
entity_colors = {'respiratoire': 'green', 'covid': 'orange'}
displacy.render(doc, style='ent', options={'colors': entity_colors})

Let's focus on the first entity:

In [None]:
# Take the first entity in doc.ents
entity = doc.ents[0]

In [None]:
# Display the selected entity
entity

In [None]:
# Negation flag stored on the entity's custom attributes
# NOTE(review): presumably set by the 'negation' pipeline component — confirm
entity._.negated

We can reformat the entities to an OMOP-like format:

In [None]:
import pandas as pd

In [None]:
def _entity_record(ent):
    """Return one table row (as a dict) describing a single entity and its qualifiers."""
    return {
        'label': ent.label_,
        'start_char': ent.start_char,
        'end_char': ent.end_char,
        'lexical_variant': ent.text,
        'negation': ent._.negated,
        'family': ent._.family,
        'hypothesis': ent._.hypothesis,
        'rspeech': ent._.reported_speech,
    }


# One row per entity found in the document
df = pd.DataFrame.from_records([_entity_record(ent) for ent in doc.ents])

In [None]:
# Display the resulting table of entities
df