In [1]:
import pandas as pd
import sys, os
sys.path.append('..')

from src.utils import load_config_file, load_logger


## 1. Examples: morpho-syntactic annotation and POS tagging

### 1.1 Use it with spacy models

`model_name` correspond au nom d'un modèle spaCy :
- "spacy_core_web_sm", "spacy_core_web_md", "spacy_core_web_lg", "spacy_core_web_trf"



In [5]:
# Build the spaCy-backed pipeline object around the 'fr_dep_news_trf' model.
from src.pipeline_spacy import SpacyNlpPipeline

spacy_pipeline = SpacyNlpPipeline(
    model_name='fr_dep_news_trf',
    # NOTE(review): presumably requires a CUDA-capable device; switch to
    # False on CPU-only machines — confirm against SpacyNlpPipeline.
    use_gpu=True,
)


In [None]:
# Run the spaCy pipeline on a short French sentence. The bare `result`
# on the last line makes the notebook display the value returned by `nlp`.
text = " je suis un exemple de text très simple"
result = spacy_pipeline.nlp(text)
result

### 1.2 Use it with Stanza


In [3]:
# Build the Stanza-backed pipeline for French, running on the GPU.
from src.pipeline_stanza import StanzaNlpPipeline

stanza_pipeline = StanzaNlpPipeline(
    lang="fr",
    # NOTE(review): the meaning of stop_word=['*'] is not visible from this
    # notebook — confirm against StanzaNlpPipeline.
    stop_word=['*'],
    use_gpu=True,
)

2023-05-12 14:39:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 7.71MB/s]                    
2023-05-12 14:39:09 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-05-12 14:39:10 INFO: Using device: cuda
2023-05-12 14:39:10 INFO: Loading: tokenize
2023-05-12 14:39:11 INFO: Loading: mwt
2023-05-12 14:39:11 INFO: Loading: pos
2023-05-12 14:39:12 INFO: Loading: lemma
2023-05-12 14:39:12 INFO: Done loading processors!


In [4]:
# Run the Stanza pipeline on the same short French sentence and display
# the returned value (bare `result` as the last expression of the cell).
# NOTE(review): in the recorded output the 'morph' list holds `False` at the
# positions of 'de' and 'très' — presumably tokens without features; confirm.
text = " je suis un exemple de text très simple"
result = stanza_pipeline.nlp(text)
result

{'token': ['je', 'suis', 'un', 'exemple', 'de', 'text', 'très', 'simple'],
 'lemma': ['il', 'être', 'un', 'exemple', 'de', 'text', 'très', 'simple'],
 'pos': [('je', 'PRON'),
  ('suis', 'AUX'),
  ('un', 'DET'),
  ('exemple', 'NOUN'),
  ('de', 'ADP'),
  ('text', 'NOUN'),
  ('très', 'ADV'),
  ('simple', 'ADJ')],
 'xpos': [('je', None),
  ('suis', None),
  ('un', None),
  ('exemple', None),
  ('de', None),
  ('text', None),
  ('très', None),
  ('simple', None)],
 'morph': [('je', 'PRON', 'Number=Sing|Person=1|PronType=Prs'),
  ('suis', 'AUX', 'Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin'),
  ('un', 'DET', 'Definite=Ind|Gender=Masc|Number=Sing|PronType=Art'),
  ('exemple', 'NOUN', 'Gender=Masc|Number=Sing'),
  False,
  ('text', 'NOUN', 'Gender=Masc|Number=Sing'),
  False,
  ('simple', 'ADJ', 'Gender=Masc|Number=Sing')],
 'doc': [
   [
     {
       "id": 1,
       "text": "je",
       "lemma": "il",
       "upos": "PRON",
       "feats": "Number=Sing|Person=1|PronType=Prs",
     

### 1.3 Use it with stanza CoreNLP

Attention, il faut avoir lancé le serveur CoreNLP avant d'utiliser cette fonction (cf. README.md)

In [3]:
# Import the CoreNLP-backed pipeline class.
from src.pipeline_stanza_corenlp import StanzaCoreNlpPipeline
# The instantiation below is left commented out: the notebook states that the
# CoreNLP server must be started first (cf. README.md). Uncomment once it runs.
# corenlp_pipeline = StanzaCoreNlpPipeline(lang = "fr",stop_word = ['*'])


### 1.4 Use it with transformers

This pipeline makes it easy to use any model from the Hugging Face Hub.



In [5]:
# Wrap a Hugging Face Hub checkpoint in the transformers-backed pipeline.
from src.pipeline_transformers import TransformersNlpPipeline

transformers_pipeline = TransformersNlpPipeline(
    model_name='gilf/french-camembert-postag-model',
    stop_word=['*'],
    use_gpu=False,  # this example runs without the GPU
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# Run the transformers pipeline on the same short French sentence and
# display the returned value (bare `result` as the cell's last expression).
text = " je suis un exemple de text très simple"
result = transformers_pipeline.nlp(text)
result

{'token': ['je', 'suis', 'un', 'exemple', 'de', 'text', 'très', 'simple'],
 'lemma': ['je', 'être', 'un', 'exemple', 'de', 'text', 'très', 'simple'],
 'pos': [('je', 'PRON'),
  ('suis', 'VERB'),
  ('un', 'DET'),
  ('exemple', 'NOUN'),
  ('de', 'ADP'),
  ('text', 'NOUN'),
  ('très', 'ADJ'),
  ('simple', 'ADJ')],
 'xpos': ['CLS', 'V', 'DET', 'NC', 'P', 'NC', 'ADV', 'ADJ'],
 'morph': [('je', 'PRON', False),
  ('suis', 'VERB', False),
  ('un', 'DET', False),
  ('exemple', 'NOUN', False),
  ('de', 'ADP', False),
  ('text', 'NOUN', False),
  ('très', 'ADJ', False),
  ('simple', 'ADJ', False)]}

## 2. Train embeddings on your data

In [2]:
# Import the embeddings builder class.
from src.pipeline_embeddings import BuildEmbeddings

# Read the configuration file with the helper imported in the first cell;
# the relative path points one directory above the current working directory.
config = load_config_file("../config.yaml")
# Hand the loaded configuration to the builder used in the next cell.
builder = BuildEmbeddings(config=config)


In [3]:
# Toy corpus: three tokenised sentences that differ only in their last token.
test = [['exemple', 'de', 'text', last_token] for last_token in ('a', 'b', 'c')]

# Train a word2vec model on the toy corpus and store it under the name 'test'.
# NOTE(review): the recorded run printed "fail to save the config file" for
# notebooks/config.yaml although the config is loaded from ../config.yaml —
# the save path inside BuildEmbeddings looks wrong; confirm.
builder.train_and_save(
    corpus=test,
    method='word2vec',
    model_name='test',
)

# Load the saved model back; load_model returns three values and only the
# first one is needed here.
model, _, _ = builder.load_model(
    model_name='test',
    method='word2vec',
)

# Print the nearest neighbours of 'exemple' in the trained vector space.
print(model.wv.most_similar('exemple'))

fail to save the config file because of [Errno 2] No such file or directory: '/home/robin/Code_repo/nlp_pipeline/notebooks/config.yaml'


[('de', -0.023671666160225868), ('text', -0.05234673619270325)]
