# Training Custom NER Model for Chemical Compounds

### Importing all the necessary Packages

In [1]:
import spacy
import xml.etree.ElementTree as et
import os
from spacy import displacy
import pandas as pd
from spacy.lang.en import English
from stop_words import get_stop_words
import pickle
from tqdm.notebook import tqdm


### Utility functions for data extraction 

In [2]:
def get_abstract(element):
    s = ""
    for i in element:
        if i.tag == 'p':
            for j in i:
                s += j.tail.strip()
    
    return s
def get_description(element):
    s = ""
    for i in element:
        if i.tag == 'heading':
            continue

        if i.tag == 'p':
            s += i.text.strip()
            for j in i:
                s += j.tail.strip()
    return s

## Forming corpus for training

In [3]:
data = dict()
for filename in os.listdir(os.getcwd()+'/uspat1_201831_back_80001_100000'):
    
    if filename.endswith(".xml"):
        data[filename] = dict()
        tree = et.parse(os.getcwd()+'/uspat1_201831_back_80001_100000/'+filename)
        element = tree.findall('abstract/')
        abstract_data = get_abstract(element)
        element = tree.findall('description/')
        desc_data = get_description(element)
        data[filename] ['abstract'] = abstract_data
        data[filename] ['description'] = desc_data

Output with inbuilt NER model on SPacy

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
doc = nlp(data['US06176565B1.xml']['description'])

In [None]:
ents = [(e.text, e.label_) for e in doc.ents]

### Extracting Chemical compounds from DBPedia data 

In [9]:
df = pd.read_csv('ChemicalSubstance.csv')

In [10]:
chemical_name = df['rdf-schema#label'].values[3:]

#### Getting all the stopwords

In [11]:
stops = get_stop_words("english")

#### Function for normalizing text

In [12]:
def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)

In [15]:
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

#### Normalising Data

In [16]:
new_data = dict()
for doc,val in tqdm(data.items()):
    new_data[doc] = dict()
    new_data[doc]['abstract'] = normalize(val['abstract'],lowercase=True, remove_stopwords=True)
    new_data[doc]['description'] =  normalize(val['description'],lowercase=True, remove_stopwords=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7173.0), HTML(value='')))




In [18]:
pickle.dump(new_data,open('processed.pkl','wb'))

In [19]:
chemical_name_lower = [str(i).lower() for i in chemical_name]

### Utility method for BILOU Tagging

In [22]:
def get_ent(text,chemical_name_lower):
    ent_list = []
    c = 0
    for i in text:
        if i in chemical_name_lower:
            ent_list.append('U-CHEMICAL')
        else:
            ent_list.append('O')
    return ent_list

### Tagging the data for training 

In [23]:
entities = []
ent_data = tuple()
for k,v in tqdm(new_data.items()):
    ent_data = (v['abstract'],get_ent(v['abstract'].split(),chemical_name_lower ))
    entities.append(ent_data)
    ent_data = (v['description'],get_ent(v['description'].split(),chemical_name_lower ))
    entities.append(ent_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7173.0), HTML(value='')))




In [24]:
pickle.dump(entities,open('entities.pkl','wb'))

In [41]:
#entities = pickle.load(open( 'ent.pkl', "rb" ))

### Training Custom NER model on the processed data

#### Create an empty spacy model and set the pipes and label for NER

In [26]:
nlp = spacy.blank("en")

In [27]:
ner = nlp.create_pipe("ner")

In [28]:
nlp.add_pipe(ner,last= True)

In [29]:
ner.add_label("CHEMICAL")

In [31]:
training = []
optimizer = nlp.begin_training()

In [44]:
from spacy.gold import offsets_from_biluo_tags
a = dict()
for itn in tqdm(range(1)):
         losses = {}
         for text, annotations in tqdm(entities):
             #a = {'entities': annotations}
             if offsets_from_biluo_tags(nlp(text), annotations) == []:
                 continue
             a['entities'] = offsets_from_biluo_tags(nlp(text), annotations)
             nlp.update([text], [a], sgd=optimizer, drop=0.35,losses=losses)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9552.0), HTML(value='')))





In [45]:
nlp.to_disk(os.getcwd() + "/chem_model")

In [46]:
nlp = spacy.load('chem_model')

In [47]:
doc = nlp(new_data['US06176565B1.xml']['description'])

In [48]:
displacy.serve(doc, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [49]:
ents = [(e.text, e.label_) for e in doc.ents]

In [50]:
ents

[('lactone', 'CHEMICAL'),
 ('b821', 'CHEMICAL'),
 ('diphenylmethane', 'CHEMICAL'),
 ('tolylene', 'CHEMICAL'),
 ('naphthalene', 'CHEMICAL'),
 ('diphenylmethane', 'CHEMICAL'),
 ('ethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('hydroquinone', 'CHEMICAL'),
 ('trimethylolpropane', 'CHEMICAL'),
 ('trimethylolethane', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('ethylene', 'CHEMICAL'),
 ('trimethylolpropane', 'CHEMICAL'),
 ('trimethylolpropane', 'CHEMICAL'),
 ('trimethylolpropane', 'CHEMICAL'),
 ('compose', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('ethylene', 'CHEMICAL'),
 ('polyethylene', 'CHEMICAL'),
 ('ethylene', 'CHEMICAL'),
 ('ethylene', 'CHEMICAL'),
 ('compose', 'CH

Successfully trained the model for chemical compounds but the accuracy is quiet low. It is because, I have trained the model only once because of computing power. If we train the same on GPU we can achieve some good accuracy.