# Exploration des annotations de Watson

In [None]:
#import tarfile
#tf = tarfile.open("libex.tar.gz")
#tf.extractall()

In [2]:
import os
import json
import numpy as np
import random
from tqdm import tqdm
from operator import itemgetter
import matplotlib.pyplot as plt

In [3]:
import spacy
from spacy import displacy

## Documents

`Watson` permet de retrouver le corpus de textes utilisés pour un entrainement (le dernier a priori).

In [15]:
# Identifier of the Watson corpus; it is inserted into the 'corpus-<id>' directory name when files are read.
corpus = 'd67b6df0-ea74-11e9-8ff4-f146741a0385'

In [21]:
def watsonsets(corpus):
    """Load the ``sets.json`` description of a Watson corpus.

    Parameters
    ----------
    corpus : str
        Corpus identifier; the file read is
        ``../libex/data/corpus-<corpus>/sets.json``.

    Returns
    -------
    object
        The decoded JSON content of ``sets.json``.
    """
    ws_json = os.path.join('../libex/data', 'corpus-%s' % corpus, 'sets.json')
    print("watsonset file is '%s'" % ws_json)
    # JSON is UTF-8 by specification: do not depend on the platform default encoding.
    with open(ws_json, encoding='utf-8') as f:
        return json.load(f)

In [22]:
def watsonset(ws, wskey):
    """Return the documents of every set in ``ws`` whose ``'name'`` equals ``wskey``.

    The documents of all matching sets are concatenated in the order the
    sets appear in ``ws``; an empty list is returned when no set matches.
    """
    selected = []
    for entry in ws:
        if entry['name'] != wskey:
            continue
        selected.extend(entry['documents'])
    return selected

In [23]:
def watsondocs(corpus, ws, wskey):
    """Load the JSON file of every document listed in the set named ``wskey``.

    Parameters
    ----------
    corpus : str
        Corpus identifier; documents are read from
        ``../libex/data/corpus-<corpus>/gt/<document>.json``.
    ws : list
        Sets description, as returned by ``watsonsets``.
    wskey : str
        Name of the set whose documents are loaded.

    Returns
    -------
    list
        The decoded JSON documents, in the order given by ``watsonset``.
    """
    wdocs = []
    for d in watsonset(ws, wskey):
        json_file = os.path.join('../libex/data', 'corpus-%s' % corpus, 'gt', '%s.json' % d)
        # JSON is UTF-8 by specification: do not depend on the platform default encoding.
        with open(json_file, encoding='utf-8') as f:
            wdocs.append(json.load(f))
    print('%s documents for %s' % (len(wdocs), wskey))
    return wdocs

In [24]:
def train_test_from_watson(corpus):
    """Load the documents of the 'Training' and 'Test' sets of a corpus.

    Returns a pair ``(training documents, test documents)``; the training
    set is loaded first, then the test set.
    """
    sets_description = watsonsets(corpus)
    training_docs, test_docs = (
        watsondocs(corpus, sets_description, split_name)
        for split_name in ('Training', 'Test')
    )
    return training_docs, test_docs

In [25]:
# Load the Watson documents of the 'Training' and 'Test' sets of the corpus.
train_wd, test_wd = train_test_from_watson(corpus)

watsonset file is '../libex/data/corpus-d67b6df0-ea74-11e9-8ff4-f146741a0385/sets.json'
37 documents for Training
12 documents for Test


On affiche les labels : 

In [27]:
# Read the entity type definitions and keep only their labels.
ontology_json = os.path.join('../libex/data', 'types_PV_SA.json')
# JSON is UTF-8 by specification: do not depend on the platform default encoding.
with open(ontology_json, encoding='utf-8') as f:
    ontology = json.load(f)

entities = [e['label'] for e in ontology['entityTypes']]

print(entities)

['Locution', 'PV_unit', 'PV_val_max', 'PV_val_min', 'SA_unit', 'SA_val_max', 'SA_val_min', 'Pore_volume', 'Surface_area', 'Support', 'Catalyst']


## Watson to SpaCy

On convertit du format `Watson` au format `SpaCy`.

In [88]:
def watson2spacy(watson_doc, verbose=False):
    """Convert one Watson document into spaCy-style training tuples.

    The text is split on newlines into sentences, and every Watson mention
    (character offsets in the whole text) is re-expressed as offsets relative
    to the sentence it ends in. Overlapping mentions that share the same type
    are fused into a single span; overlaps between different types are left
    untouched.

    Parameters
    ----------
    watson_doc : dict
        Must provide ``'text'`` (str) and ``'mentions'`` (iterable of dicts
        with ``'begin'``, ``'end'`` and ``'type'``); ``'name'`` is read only
        when ``verbose`` is set.
    verbose : bool, optional
        Print a report for every pair of fused mentions.

    Returns
    -------
    list of tuple
        One ``(sentence, {'entities': [(begin, end, type), ...]})`` pair per
        sentence, with plain ``int`` offsets.
    """
    # nb: could use nltk/spacy but need to match Watson
    sentences = watson_doc['text'].split('\n')
    # Only drop the empty string produced by a trailing newline; a real last
    # sentence (text without final newline) must be kept, otherwise its
    # mentions would be attached to the wrong sentence.
    if sentences[-1] == '':
        sentences = sentences[:-1]
    # cumulative start offset of each sentence (+1 for the newline separator)
    slen = [len(s) + 1 for s in sentences]
    cum_slen = np.append([0], np.cumsum(slen))
    # spacy doc from sentences
    spacy_doc = [(s, {'entities': []}) for s in sentences]
    # loop over annotated entities from watson
    for m in watson_doc['mentions']:
        # sentence whose range contains the end of the mention
        sid = int(np.argmin(cum_slen < m['end'])) - 1
        # offset of that sentence in the whole text (plain int, not numpy)
        offset = int(cum_slen[sid])
        # translate from text positions to sentence positions
        spacy_doc[sid][1]['entities'].append(
            (m['begin'] - offset, m['end'] - offset, m['type'])
        )

    # spacy cant deal with overlapping entities:
    # fix overlapping of same type by fusion
    for s, d in spacy_doc:
        entities = d['entities']
        types = set(t for _, _, t in entities)
        for t in types:
            chunks = sorted((c for c in entities if c[2] == t), key=itemgetter(0))
            for i in range(len(chunks) - 1):
                # read the current chunk from `chunks` itself so that a span
                # fused at the previous step is the one compared (and removed)
                c = chunks[i]
                nc = chunks[i + 1]
                if c[1] > nc[0]:
                    if verbose:
                        print("Overlapping in doc '%s' :" % watson_doc['name'])
                        print("'%s' at %s and '%s' at %s for type '%s'" % (s[c[0]:c[1]], c, s[nc[0]:nc[1]], nc, t))
                        print("sentence is '%s'" % s)
                    fix_c = (min(c[0], nc[0]), max(c[1], nc[1]), nc[2])
                    # the fused span takes the place of the second chunk
                    entities.remove(c)
                    entities[entities.index(nc)] = fix_c
                    chunks[i + 1] = fix_c

    return spacy_doc

In [89]:
# Convert every Watson document and flatten the per-document results
# into a single list of annotated sentences.
train_sd = [sentence for doc in train_wd for sentence in watson2spacy(doc)]

test_sd = [sentence for doc in test_wd for sentence in watson2spacy(doc)]

In [90]:
train_sd[0:4]

[('I claim as my invention:', {'entities': []}),
 ('1. A method of impregnating a support which comprises contaeting a hydrogel of a refractory oxide with an aqueous solution of a heat decomposable tungsten compound in the presence of a ferrous salt.',
  {'entities': [(30, 37, 'Support')]}),
 ('2. The method of claim 1 wherein the solution consists essentially of both ferrous salt and the tungsten compound.',
  {'entities': []}),
 ('3. The method of claim 1 wherein fluoride is incorporated into the hydrogel by adding a water - soluble inorganic fluoride salt or hydrofluoric acid.',
  {'entities': []})]

On crée un dataset de validation.

In [91]:
# The first half of the test sentences is used as validation sentences.
# NOTE(review): test_sd itself is not shortened, so these sentences remain part of the test data.
n_valid = len(test_sd) // 2
valid_sd = test_sd[:n_valid]

In [97]:
# Number of annotated mentions per entity label in the test sentences.
# Labels are matched exactly (not as substrings), so a label that merely
# contains another one is never counted for both.
entity_counts = dict.fromkeys(entities, 0)
for _, annotations in test_sd:
    for _, _, label in annotations['entities']:
        if label in entity_counts:
            entity_counts[label] += 1
# same order as `entities`
n_entities = [entity_counts[label] for label in entities]

In [107]:
entities

['Locution',
 'PV_unit',
 'PV_val_max',
 'PV_val_min',
 'SA_unit',
 'SA_val_max',
 'SA_val_min',
 'Pore_volume',
 'Surface_area',
 'Support',
 'Catalyst']

In [119]:
import pandas as pd
# Table pairing each entity label with its number of occurrences.
d = dict(entities=entities, nb=n_entities)
df_nb_ent = pd.DataFrame(data=d)

In [120]:
df_nb_ent

Unnamed: 0,entities,nb
0,Locution,69
1,PV_unit,11
2,PV_val_max,6
3,PV_val_min,9
4,SA_unit,17
5,SA_val_max,5
6,SA_val_min,14
7,Pore_volume,14
8,Surface_area,18
9,Support,86


## SpaCy to conll

On convertit du format `SpaCy` au format `conll` où chaque ligne correspond à un mot et son entité séparés par une tabulation, et les phrases sont séparées par un saut de ligne.
    
```
I    O
claim    O
as    O
my    O
invention:    O

1.    O
A    O
method    O
of    O
impregnating    O
a    O
support    B-Support
which    O

```

La fonction ci-dessous sera utilisée pour vérifier que le nombre d'entités dans le dataset spacy est le même que dans le dataset conll.

In [32]:
def entites_spacy(d, s):
    """Summarise the entities annotated on one sentence.

    Parameters
    ----------
    d : dict
        Annotation dict whose ``'entities'`` value holds
        ``(begin, end, label)`` items.
    s : str
        The sentence the offsets refer to.

    Returns
    -------
    tuple
        ``(number of entities, list of entity texts, list of entity labels)``.
    """
    spans = d['entities']
    wentities = [s[span[0]:span[1]] for span in spans]
    ent = [span[2] for span in spans]
    return len(spans), wentities, ent

In [33]:
import pandas as pd
import csv
import re 
def spacy2conll(spacy_doc):    
    """Turn spaCy-style annotated sentences into a token-level tag table.

    Each sentence is whitespace-normalised and split on single spaces into
    tokens. Every token starts with the tag ``'O'``; tokens matched to an
    entity receive ``'B-<label>'`` (single-word entity, or first word of a
    multi-word entity) or ``'I-<label>'`` (following words of a multi-word
    entity).

    Parameters
    ----------
    spacy_doc : iterable of (str, dict)
        Pairs ``(sentence, {'entities': [(begin, end, label), ...]})``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Sentence #'`` (1-based sentence counter), ``'text'``
        (token) and ``'tag'``.

    Raises
    ------
    AssertionError
        If a sentence ends up with a different number of tags and tokens.

    Notes
    -----
    NOTE(review): entity offsets are applied to the sentence *after* runs of
    whitespace were collapsed; this presumably assumes the input sentences
    have no such runs before an entity -- confirm against the data.

    NOTE(review): a token is only tagged when the entity start equals
    ``len(' '.join(tab_s[0:i])) + 1``; for the first token (``i == 0``) that
    value is 1, so an entity whose (adjusted) start is 0 does not match --
    confirm whether entities at the very start of a sentence are expected.
    """
    # entity tags, one list per sentence
    ent = [] 
    # tokens, one list per sentence (the sentences are patent claims)
    sent = [] 
    # sentence number repeated for every token, one list per sentence
    phr = []
    # running sentence counter (1-based): incremented once per sentence and used as sentence id
    count = 0
    for s, d in spacy_doc:    
        # collapse runs of whitespace
        s = " ".join(s.split())
        # turn the string into a list of tokens
        tab_s = s.split(" ")
        ls = len(tab_s) 
        # initialise the tag of every token of the sentence to 'O'
        e = ['O'] * (ls)    
        # (original note: "add an empty line at the end of every sentence" -- nothing is added here)

        count = count + 1 
        phrases = [count] * (ls)

        entities = d['entities'] 
        # (label, text) pairs of the entities that were tagged; filled below but not read afterwards
        EE = []
        if entities:  
            # text of every entity of the sentence; collected but not read afterwards
            words = []
            # special case for a token containing "of>"
            OF = False
            # set once an "of>" token has been split; later entity starts are then shifted
            PHRASE = False
            for l in range(len(entities)):
                # start of the entity
                b = entities[l][0]
                # end of the entity, clamped to the length of the normalised sentence
                end = entities[l][1] 
                if end > len(s):
                    end = len(s)
                # if the entity ends with a space or a punctuation mark
                punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''   
                if s[end-1] == ' ' or s[end-1] in punctuations:
                    b = b - 1
                    w = s[b:end-1]
                # if the entity starts with a space or a punctuation mark
                if s[b] == ' ' or s[b] in punctuations:
                    b = b + 1
                    w = s[b:end-1]
                # if the span ends with a space followed by one lone character,
                # shift the whole span two characters to the left
                char = re.compile(" [a-zA-Z-/*.;]$")  
                if char.findall(s[b:end]) != []:                    
                    b = b - 2
                    end = end-2
                    w = s[b:end] 
                # same check applied a second time, on the (possibly shifted) span
                char = re.compile(" [a-zA-Z-/*.;]$")  
                if char.findall(s[b:end]) != []:                    
                    b = b - 2
                    end = end-2
                    w = s[b:end] 
                # entity text without leading/trailing whitespace; this overrides
                # every value given to w above
                w = re.sub(r"^\s+|\s+$", "", s[b:end]) 
                # "of>" exception:
                if "f>" in s[b:end] :
                    OF = True        
                # after an "of>" split, shift the start by the length of the inserted token plus one
                if PHRASE == True:
                    b = b + n + 1  
                # hard-coded adjustment for the truncated entity text "ilica,"
                if w == "ilica,":
                    b = b-1
                    end = end-2
                words.append(w)
                
                # if the entity is made of a single word
                w_tab = [i for i in w.split(' ') if i != '' and i not in punctuations]  
                if len(w_tab) == 1:
                    for i, j in enumerate(tab_s):
                        if OF:
                            if "of>" in tab_s[i]:
                                # tag the "of>" token with the label of the current entity
                                del e[i]
                                e.insert(i,"B-" + entities[l][2]) 
                                # split the token at '>' and insert its second part as a new token
                                tab = [i.replace(">", " ") for i in tab_s if "of>" in i]                           
                                tab_s.insert(i+1, tab[0].split(' ')[1]) 
                                # tag the next position with the label of the NEXT entity
                                del e[i+1]
                                e.insert(i+1,"B-" + entities[l+1][2])    
                                # one extra tag so that e stays as long as the lengthened tab_s
                                e.insert(ls, 'O') 
                            
                                PHRASE = True
                                # length of the inserted token, used to shift later entity starts
                                n = len(tab[0].split(' ')[1])
                                OF = False
                        if w.replace(" ", "") in j and b == len(' '.join(tab_s[0:i])) + 1: # the offset test tells apart several occurrences of the same word
                            del e[i]
                            e.insert(i,"B-" + entities[l][2])
                          
                            # 'silica' is recorded twice in EE
                            if w == 'silica':
                                EE.append((entities[l][2], w))
                            EE.append((entities[l][2], w))             
                else:
                    # if the entity is made of several words
                    ind = -1
                    t = w.split(' ')
                    t = [i for i in t if i != ' ']
                    # first word
                    for i, j in enumerate(tab_s): 
                        if t[0] in j and b == len(' '.join(tab_s[0:i])) + 1:
                            del e[i]
                            e.insert(i, "B-" + entities[l][2])
                            ind = i
                            EE.append((entities[l][2], w))
                            # following words: each one must be contained in the token
                            # located right after the last tagged token
                            for wo in t[1:]:
                                for m, k in enumerate(tab_s):                         
                                    if wo in k and m == ind + 1:
                                        del e[m]
                                        e.insert(m, "I-" + entities[l][2]) 
                                        ind = m  
            # (original note: "check same number of entities" -- no such check is implemented here)
    
        # check that every token has a tag
        assert len(e) == len(tab_s) 
        phr.append(phrases)
        ent.append(e)
        sent.append(tab_s)
        
    # flatten the per-sentence lists into one list per column
    p = [item for sublist in phr for item in sublist]
    tokens = [item for sublist in sent for item in sublist]
    tag = [item for sublist in ent for item in sublist]

    # The "of>" split lengthens tab_s and e but not phrases, so p can be shorter than tag;
    # it is padded at the END with the last sentence number.
    # NOTE(review): after such a split the sentence numbers are therefore shifted
    # relative to the tokens for all following rows -- confirm this is acceptable.
    if len(p) != len(tag):
        n = len(tag) - len(p)
        for i in range(n):
            p.append(count)
      
    data = pd.DataFrame({'Sentence #': p, 'text':tokens, 'tag': tag})
    return data                      

In [34]:
# Token-level tag tables for the training and the test sentences.
data_train, data_test = map(spacy2conll, (train_sd, test_sd))

In [35]:
data_train

Unnamed: 0,Sentence #,text,tag
0,1,I,O
1,1,claim,O
2,1,as,O
3,1,my,O
4,1,invention:,O
...,...,...,...
37160,684,methyl,O
37161,684,-,O
37162,684,2,O
37163,684,-,O


On sauvegarde le tout au format csv.

In [36]:
# Write both tables as comma-separated files, with a header row and without the index column.
data_train.to_csv(path_or_buf=r'data_brevets/data_train.csv', header=True, index=False)
data_test.to_csv(path_or_buf=r'data_brevets/data_test.csv', header=True, index=False)

In [63]:
# NOTE(review): in the visible notebook `p` is only assigned inside spacy2conll / conlltobert,
# so this cell presumably relied on a leftover kernel variable and would raise NameError
# after "Restart & Run All" -- confirm and replace by the length of the intended table.
len(p)

46700

On crée le même format de dataset pour le jeu de données CoNLL-2003.

In [67]:
import csv
import pandas as pd
def conlltobert(filename, data_dir='data_ner'):
    """Read a space-separated CoNLL file into a token-level tag table.

    Every non-empty line contributes one row: its first field is the token
    and its last field is the tag. Every empty line closes the current
    sentence and increments the sentence number (numbering starts at 1).

    Parameters
    ----------
    filename : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory holding the file; defaults to ``'data_ner'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Sentence #'``, ``'text'`` and ``'tag'``.
    """
    ent = []
    w = []
    p = []
    s = 1
    # newline='' is the mode the csv module expects for the files it reads
    with open(os.path.join(data_dir, filename), newline='') as csvfile:
        # QUOTE_NONE: tokens are plain text, so no character (neither '"' nor
        # '|') may be interpreted as a quote that would swallow following lines
        spamreader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_NONE)
        for row in spamreader:
            if row:
                w.append(row[0])
                ent.append(row[-1])
                p.append(s)
            else:
                # blank line: the next token belongs to a new sentence
                s = s + 1

    data = pd.DataFrame({'Sentence #': p, 'text': w, 'tag': ent})
    return data

In [68]:
# Read the three CoNLL-2003 splits (train, then test, then valid).
data_train_c, data_test_c, data_valid_c = (
    conlltobert(split_file) for split_file in ('train.txt', 'test.txt', 'valid.txt')
)

In [73]:
# conlltobert numbers the sentences of every file from 1, so the ids of the
# validation and test tables collide. Shift the test ids past the last
# validation id to keep 'Sentence #' unique after concatenation.
sentence_offset = data_valid_c['Sentence #'].max() if len(data_valid_c) else 0
test = pd.concat([
    data_valid_c,
    data_test_c.assign(**{'Sentence #': data_test_c['Sentence #'] + sentence_offset}),
])

In [74]:
# Write both tables as comma-separated files, with a header row and without the index column.
data_train_c.to_csv(path_or_buf=r'data_ner/data_train.csv', header=True, index=False)
test.to_csv(path_or_buf=r'data_ner/data_test.csv', header=True, index=False)