### Considerações sobre o spacy

Pode ocorrer um erro ao executar o treinamento. Se isso acontecer, execute a célula abaixo para atualizar o spaCy e reinicie o ambiente de execução antes de treinar novamente.

In [None]:
!pip install --upgrade spacy

## O código a seguir realiza o treinamento do modelo

In [None]:
from __future__ import unicode_literals, print_function
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from pathlib import Path
import random
import spacy
import json

# Training examples as (text, {"entities": [(start, end, label), ...]}) tuples.
TRAIN_DATA = []

# Swap in the commented path to train the other model.
#mapeamentos = open('/content/gdrive/My Drive/modelo_medipreco/mapeamentos_medipreco.json', 'r')
# The context manager closes the handle as soon as the JSON is parsed; the
# explicit encoding keeps the decoded text (and therefore the character
# offsets of the annotations) independent of the platform default.
with open('/content/gdrive/My Drive/modelo_cr/mapeamentos_cr.json', 'r', encoding='utf-8') as mapeamentos:
  # Only the first line is parsed, so the whole JSON array has to sit on it.
  fileContent = json.loads(mapeamentos.readline())

for line in fileContent:
  # Convert each annotation record into an (ini, fim, classe) offset tuple.
  entities = []
  for ents in line["marcacoes"]:
      entities.append((ents["ini"], ents["fim"], ents["classe"]))
      print('entidade', entities)

  item = (line['fa'], {"entities": entities})
  print('item', item)
  TRAIN_DATA.append(item)
  
# Start from an empty "en" Language object.
# NOTE(review): the sample texts used elsewhere in this notebook are Portuguese;
# confirm that the English language class is the intended choice.
nlp = spacy.blank("en")
print("Created blank 'en' model")

# Reuse the NER component when the pipeline already has one; otherwise create
# the built-in component and append it as the last pipe.
if "ner" in nlp.pipe_names:
  ner = nlp.get_pipe("ner")
else:
  ner = nlp.create_pipe("ner")
  nlp.add_pipe(ner, last=True)

# Register every entity class that occurs in the training annotations.
for _, annotations in TRAIN_DATA:
  for _start, _end, label in annotations.get("entities"):
    ner.add_label(label)

# Every pipeline component other than NER is switched off during the updates.
other_pipes = [name for name in nlp.pipe_names if name != "ner"]

with nlp.disable_pipes(*other_pipes):
  # Initialise the weights; this is a new model, so nothing is being resumed.
  nlp.begin_training()

  for itn in range(100):
    # Present the examples in a new order on every pass.
    random.shuffle(TRAIN_DATA)
    losses = {}
    # Batch sizes are drawn from spaCy's compounding schedule (4.0, 32.0, 1.001).
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(TRAIN_DATA, size=batch_sizes):
      texts, annotations = zip(*batch)
      # drop=0.5 applies dropout so the data is harder to memorise.
      nlp.update(texts, annotations, drop=0.5, losses=losses)
#     print("Losses", losses)

def evaluate(ner_model, examples):
  """Score a NER pipeline against annotated examples.

  Args:
    ner_model: spaCy pipeline; it is called on each text to predict, and its
      ``make_doc`` builds the document that carries the gold annotation.
    examples: Iterable of ``(text, annotation)`` pairs. ``annotation`` is
      either a ``{"entities": [(start, end, label), ...]}`` dict (the shape of
      TRAIN_DATA) or the bare list of entity offsets.

  Returns:
    The ``scores`` attribute of the ``Scorer`` after every example was scored.
  """
  scorer = Scorer()
  for input_, annot in examples:
    # The offsets have to be handed over through the ``entities`` keyword: the
    # second positional parameter of GoldParse is not the entity list, so a
    # positionally passed annotation dict leaves the gold parse without entities.
    if isinstance(annot, dict):
      gold_entities = annot.get("entities", [])
    else:
      gold_entities = annot
    doc_gold_text = ner_model.make_doc(input_)
    gold = GoldParse(doc_gold_text, entities=gold_entities)
    pred_value = ner_model(input_)
    scorer.score(pred_value, gold)
  return scorer.scores

  
#output_dir = Path('apresentacoes_medipreco.md')
output_dir = Path('apresentacoes_cr.md')

if not output_dir.exists():
  output_dir.mkdir()

nlp.to_disk(output_dir)
print("Saved model to", output_dir)

#!cp -r apresentacoes_medipreco.md '/content/gdrive/My Drive/modelo_medipreco/'
!cp -r apresentacoes_cr.md '/content/gdrive/My Drive/modelo_cr/'

ner_model = spacy.load(output_dir)

results = evaluate(ner_model, TRAIN_DATA)

results

In [None]:
import spacy

from spacy import displacy

# Load the trained pipeline from the Drive folder; swap in the commented path
# to inspect the other model.
#nlp = spacy.load('/content/gdrive/My Drive/modelo_medipreco/apresentacoes_medipreco.md')
nlp = spacy.load('/content/gdrive/My Drive/modelo_cr/apresentacoes_cr.md')

# Sample presentation strings used to inspect the predicted entities.
fa_1 = 'Ems Sigma Pharma Ltda Maleato de Enalapril + Hidroclorotiazida 10 MG + 25 MG COM CT BL AL PLAS OPC X 30'
fa_2 = 'Accord Farmacêutica Ltda Bycal 50 MG COM REV CT BL AL PLAS TRANS X 30'
fa_3 = 'Brainfarma Indústria Química e Farmacêutica S.A Quadrilon 0,50 MG/G + 1 MG/G + 10 MG/G + 10 MG/G POM DERM CT BG AL X 15 G'

# Run the pipeline over all samples first, then render them in the same order.
texto_1, texto_2, texto_3 = [nlp(fa) for fa in (fa_1, fa_2, fa_3)]

for texto in (texto_1, texto_2, texto_3):
  displacy.render(texto, style='ent', jupyter=True)

In [None]:
import pandas as pd
import numpy as np

tags = ['FAB', 'MED', 'CC', 'TP', 'QTD']

#apresentacoesDf = pd.read_csv('/content/gdrive/My Drive/modelo_medipreco/apresentacoes_medipreco.csv')
apresentacoesDf = pd.read_csv('/content/gdrive/My Drive/modelo_cr/apresentacoes_cr.csv')

list_apresentacoes = apresentacoesDf.apresentacao.tolist()

list_processadas = []

for apresentacao in list_apresentacoes:
  fa_nlp = nlp(apresentacao)
  
  marks = []
    
  for tag in tags:
    mark_content = [token.text for token in fa_nlp.ents if token.label_ == tag]
    marks.append(mark_content)
    
  list_processadas.append((apresentacao, *marks))


processadasDf = pd.DataFrame(list_processadas, 
                             columns=['apresentacao', 
                                      'fabricante', 
                                      'medicamento', 
                                      'concentracao', 
                                      'tipo', 
                                      'qtd'])

#processadasDf.to_csv('apresentacoes_processadas_medipreco.csv', index=False)
processadasDf.to_csv('apresentacoes_processadas_cr', index=False)

#!cp apresentacoes_processadas_medipreco.csv '/content/gdrive/My Drive/modelo_medipreco/'
!cp apresentacoes_processadas_medipreco.csv '/content/gdrive/My Drive/modelo_cr/'

# for entity in texto_1.ents:
#   print(entity.text, ' :: ' , entity.label_)