<a href="https://colab.research.google.com/github/akobiisr/NLP-TP3/blob/main/TP3_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import itertools
import warnings
import random
import time
import pandas as pd
import numpy as np
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from spacy import displacy


In [2]:
!mkdir '/tmp/science'
!mkdir '/tmp/disease'

In [3]:
S_MODEL_PATH = '/tmp/science/model' # Path where the spaCy model trained on the science data is saved
D_MODEL_PATH = '/tmp/disease/model' # Path where the spaCy model trained on the disease data is saved
DROPOUT = 0.2 # Default dropout rate used by train_spacy

In [4]:
# Mount Google Drive (Colab only) so the dataset files under /content/gdrive/My Drive can be read
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Les fonctions

In [5]:
def load_filename(data_type, base_dir="/content/gdrive/My Drive"):
    '''
    Return the paths of the train, validation and test files of a dataset.

    data_type: name of the dataset, either "disease" or "science"
    base_dir: directory that contains one sub-folder per dataset; defaults to
    the root of the mounted Google Drive

    Returns the tuple (train_filename, valid_filename, test_filename).
    Raises ValueError when data_type is not one of the known datasets, instead
    of silently returning blank paths that only fail later when opened.
    '''
    known_types = ("science", "disease")
    if data_type not in known_types:
        raise ValueError(
            "Unknown data_type {!r}; expected one of {}".format(data_type, known_types))

    dataset_dir = "{}/{}".format(base_dir, data_type)
    train_filename = dataset_dir + "/train.txt"
    valid_filename = dataset_dir + "/dev.txt"
    test_filename = dataset_dir + "/test.txt"

    return train_filename, valid_filename, test_filename

In [6]:
def load_data(filename):
  '''
  Build a dataset in the format expected by spaCy's NER training from a
  tab-separated file holding one "token<TAB>tag" pair per line.

  A line that does not split into several tab-separated fields (typically a
  blank line) marks the end of a sentence.

  filename: path of the data file

  Returns (data, labels):
    data: list of [sentence, {'entities': [(start, end, label), ...]}] where
    sentence is the tokens joined by single spaces and start/end are character
    offsets into that sentence
    labels: set of the entity labels found, with the B-/I- prefix removed
  '''
  data = list()
  labels = set()
  tokens = list()
  entities = list()
  offset = 0  # character position where the next token starts in the joined sentence

  with open(filename, encoding='utf-8') as fn:
    for line in fn:
      fields = line.strip().split('\t')

      if len(fields) != 1:
        token = fields[0]
        token_end = offset + len(token)
        tokens.append(token)
        tag = fields[1].replace('B-', 'I-')
        if tag != 'O':
          label = tag.replace('I-', '')
          labels.add(label)
          # NOTE: every tagged token is emitted as its own entity span;
          # consecutive B-/I- tokens are not merged into one span.
          entities.append((offset, token_end, label))
        offset = token_end + 1  # +1 for the space added by the join

      else:
        # Sentence boundary: store the sentence (possibly empty) and reset.
        data.append([" ".join(tokens), {'entities' : entities}])
        offset = 0
        entities, tokens = list(), list()

  # Keep the last sentence when the file does not end with a blank line.
  if tokens:
    data.append([" ".join(tokens), {'entities' : entities}])

  return data, labels

In [7]:
# Remove the entries whose sentence is empty
def withraw_empty_data(data):
  '''
  Remove from data every entry whose sentence (first element) is empty.

  data: list of [sentence, annotations] entries as produced by load_data

  The list is filtered in place and the same list object is returned. The
  filtering builds a new list first, because removing items from a list while
  iterating over it skips the element that follows each removal and leaves
  consecutive empty entries behind.
  '''
  data[:] = [d for d in data if len(d[0]) != 0]
  return data

In [8]:
# Train a spaCy NER model
def train_spacy(train_data, labels, iterations, dropout = DROPOUT, display_freq = 1):
    '''
    Train a blank English spaCy pipeline containing only an NER component.

    train_data: training examples in the format
    (sentence, {'entities': [(start, end, label)]}) produced by load_data
    labels: unique entity labels found in the training data
    iterations: number of passes over the training data
    dropout: dropout rate applied during training
    display_freq: print the loss every display_freq iterations; a falsy value
    (0 or None) disables the printing

    Returns the trained spaCy Language object.
    '''

    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Reuse the existing component so that ner is always bound below.
        ner = nlp.get_pipe('ner')

    # Register the labels with the NER component
    for label in labels:
        ner.add_label(label)

    # Disable every pipeline component except NER during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a shallow copy so the caller's list keeps its original order.
    examples = list(train_data)

    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy') # Show each spaCy warning only once
        optimizer = nlp.begin_training()

        for itr in range(iterations):
            random.shuffle(examples)
            losses = {}
            batches = minibatch(examples, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop = dropout,
                    sgd = optimizer,
                    losses = losses)
            # Guard against display_freq == 0, which would raise ZeroDivisionError.
            if display_freq and itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

In [9]:
# Load a saved model
def load_model(model_path):
    '''
    Rebuild a blank English pipeline with an NER component and load the
    weights stored at model_path into it.

    model_path: directory the model was saved to

    Returns the value returned by from_disk on the pipeline.
    '''
    pipeline = spacy.blank('en')
    if 'ner' not in pipeline.pipe_names:
        pipeline.add_pipe(pipeline.create_pipe('ner'))
    return pipeline.from_disk(model_path)

In [15]:
# Evaluate a spaCy NER model
def evaluate(ner_model, test_data):
    '''
    Score ner_model on test_data and print precision, recall and F-score
    per entity type, plus an 'avg' row holding the overall entity scores.

    ner_model : loaded spaCy model
    test_data : test examples as (sentence, {'entities': [...]}) pairs
    '''

    scorer = Scorer()
    for text, annotation in test_data:
        # Gold annotations are aligned on the tokenisation of the raw text
        gold = GoldParse(ner_model.make_doc(text), entities=annotation['entities'])
        prediction = ner_model(text)
        scorer.score(prediction, gold)

    scores = scorer.scores
    per_type = scores['ents_per_type']
    # Add the overall entity scores as an extra row next to the per-type ones
    per_type['avg'] = {
        'f': scores['ents_f'],
        'p': scores['ents_p'],
        'r': scores['ents_r'],
    }

    table = pd.DataFrame(per_type).transpose()
    print(table)

# Entraînement avec les données Disease

In [11]:
# Load the disease dataset (train / dev / test files)
train_filename, valid_filename, test_filename = load_filename("disease")
d_train_data, d_train_label = load_data(train_filename)
d_valid_data, _ = load_data(valid_filename)
d_test_data, _ = load_data(test_filename)

# Drop the entries whose sentence is empty
d_train_data = withraw_empty_data(d_train_data)
d_valid_data = withraw_empty_data(d_valid_data)
d_test_data = withraw_empty_data(d_test_data)

# Add the dev examples to the training set; the test set stays separate for evaluation
d_train_data = d_train_data + d_valid_data

In [12]:
# Train the NER model on the disease data (10 iterations) and save it
ner = train_spacy(d_train_data, d_train_label,10) # Training
ner.to_disk(D_MODEL_PATH) # Save the model to disk

Iteration 1 Loss: {'ner': 8850.65990400113}
Iteration 2 Loss: {'ner': 4842.392546282146}
Iteration 3 Loss: {'ner': 3881.64614104738}
Iteration 4 Loss: {'ner': 3280.536102521087}
Iteration 5 Loss: {'ner': 2920.892490024116}
Iteration 6 Loss: {'ner': 2515.381699867579}
Iteration 7 Loss: {'ner': 2170.840337841357}
Iteration 8 Loss: {'ner': 2135.4220256501317}
Iteration 9 Loss: {'ner': 1900.4680607619925}
Iteration 10 Loss: {'ner': 1699.037633218844}


In [13]:
# Reload the disease model from the path it was saved to
ner  = load_model(D_MODEL_PATH)

In [16]:
# Print precision / recall / F-score of the disease model on the disease test set
evaluate(ner, d_test_data)

                 p          r          f
Disease  89.625668  81.875916  85.575696
avg      89.625668  81.875916  85.575696


In [17]:
# Example: display the entities predicted by the disease model on the first 15 test sentences
test_sentences = [x[0] for x in d_test_data[0:15]]
for x in test_sentences:
    # Use `ner`, the disease model loaded above; `s_ner` is never defined in this notebook
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")

  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


# Entraînement avec les données Science

In [18]:
# Load the science dataset (train / dev / test files)
train_filename, valid_filename, test_filename = load_filename("science")
s_train_data, s_train_label = load_data(train_filename)
s_valid_data, _ = load_data(valid_filename)
s_test_data, _ = load_data(test_filename)

# Drop the entries whose sentence is empty
s_train_data = withraw_empty_data(s_train_data)
s_valid_data = withraw_empty_data(s_valid_data)
s_test_data = withraw_empty_data(s_test_data)

s_train_data = s_train_data + s_valid_data # Add the dev examples to the training set

In [19]:
# Train the NER model on the science data (10 iterations) and save it
ner = train_spacy(s_train_data, s_train_label,10) # Training
ner.to_disk(S_MODEL_PATH) # Save the model to disk

Iteration 1 Loss: {'ner': 19174.85461285302}
Iteration 2 Loss: {'ner': 14568.983588911226}
Iteration 3 Loss: {'ner': 12251.01532846503}
Iteration 4 Loss: {'ner': 10688.711106509369}
Iteration 5 Loss: {'ner': 9577.794111757587}
Iteration 6 Loss: {'ner': 8601.11243705476}
Iteration 7 Loss: {'ner': 7691.515825969992}
Iteration 8 Loss: {'ner': 6960.7573218514735}
Iteration 9 Loss: {'ner': 6321.494793040647}
Iteration 10 Loss: {'ner': 6037.368961081078}


In [20]:
# Reload the science model from the path it was saved to
ner  = load_model(S_MODEL_PATH)

In [21]:
# Print precision / recall / F-score of the science model on the science test set
evaluate(ner, s_test_data)

                             p          r          f
Material             62.500000  58.666667  60.522696
OtherScientificTerm  58.081705  53.694581  55.802048
Method               67.744681  67.629567  67.687075
Task                 66.055046  46.936115  54.878049
Generic              67.078189  63.424125  65.200000
Metric               72.164948  50.000000  59.071730
avg                  63.962691  57.524148  60.572805


In [22]:
# Example: display the entities predicted by the science model on the first 15 test sentences
test_sentences = [x[0] for x in s_test_data[0:15]]
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")