In [1]:
# Mount Google Drive at /content/gdrive so later cells can read and write files on it.
# Later cells open 'gdrive/MyDrive/...' as a relative path, which only resolves
# to this mount point when the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
# Environment setup: only the spaCy install below is executed. The other
# commands are commented out and kept as a record of earlier experiments.
# ! pip install cupy==7.8.0
# ! pip install Cython==0.28

# Install the pinned spaCy release together with its cuda101 extra.
! pip install -U spacy[cuda101]==2.3.5
# ! pip install preshed

# ! pip uninstall spacy

Requirement already up-to-date: spacy[cuda101]==2.3.5 in /usr/local/lib/python3.6/dist-packages (2.3.5)


In [2]:
import spacy
# Show which spaCy version the kernel actually imported.
print(spacy.__version__)
# Request GPU execution and print the value returned by spacy.require_gpu().
gpu = spacy.require_gpu()
print('GPU:', gpu)

2.3.5
GPU: True


In [3]:
import plac
import logging
import argparse
import sys
import os
import json
import pickle
# Module-level accumulator: every successful call to main() appends its
# converted records here, and the whole list is what gets pickled.
training_data = []

def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation export into pickled spaCy training tuples.

    Each non-blank line of ``input_file`` is parsed as a JSON object holding a
    ``content`` string and an ``annotation`` list. Every annotation contributes
    its first entry in ``points`` (``start`` / ``end`` offsets) and its
    ``label``, which may be a single value or a list of values. One
    ``(start, end + 1, label)`` tuple is produced per label.

    The converted ``(text, {"entities": [...]})`` tuples are appended to the
    module-level ``training_data`` list, and that entire list is pickled to
    ``output_file``. Calling this function repeatedly therefore accumulates
    records across calls.

    Args:
        input_file: Path of the JSON-lines file to read (UTF-8).
        output_file: Path of the pickle file to write.

    Returns:
        None in every case. Any failure is logged with ``logging.exception``
        instead of being raised, and ``training_data`` is left unchanged if the
        failure happened while reading or parsing the input.
    """
    try:
        # Collect into a local list first so a failure part-way through the
        # input does not leave half a file's worth of records in the global.
        converted = []
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank (e.g. trailing) line is not a record; skipping it
                # avoids a JSONDecodeError that would abort the conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                # A missing or null annotation list means "no entities".
                for annotation in data.get('annotation') or []:
                    point = annotation['points'][0]
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # +1 converts the end offset to an exclusive bound.
                        entities.append((point['start'], point['end'] + 1, label))

                converted.append((text, {"entities": entities}))

        training_data.extend(converted)
        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)

    except Exception as e:
        # Lazy %-formatting keeps the handler itself from raising when
        # input_file is None (string concatenation would fail with TypeError).
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
# Convert the annotated corpus and pickle the result next to it on Drive.
# Both paths are relative, so they depend on the current working directory.
# The output is written with pickle even though it carries a .spacy extension.
main("gdrive/MyDrive/Content/ner_corpus_260.json", "gdrive/MyDrive/Content/json_to_spacy.spacy")


In [8]:
%%writefile gpu_usage.sh
#! /bin/bash
# Sample GPU statistics with nvidia-smi and append them to gpu.log as CSV.
#
# Usage: bash gpu_usage.sh [duration_seconds] [interval_seconds]
#   duration_seconds  how long to keep sampling (default: 10)
#   interval_seconds  pause between two samples (default: 1)
duration=${1:-10}
interval=${2:-1}
query=power.draw,utilization.gpu,memory.used,memory.free,fan.speed,temperature.gpu
end=$((SECONDS+duration))

# The first sample carries the CSV header; later samples omit it so gpu.log
# stays a single well-formed table for this run.
format=csv
while [ $SECONDS -lt $end ]; do
    nvidia-smi --format=$format --query-gpu=$query >> gpu.log
    format=csv,noheader
    # Alternative: replace the nvidia-smi line above with the one below.
    #nvidia-smi dmon -i 0 -s mu -d 1 -o TD >> gpu.log
    # Pause between samples instead of spawning nvidia-smi in a busy loop.
    sleep "$interval"
done

Writing gpu_usage.sh


In [11]:
%%bash --bg

# Launch the sampler script written by the %%writefile cell; --bg runs it as a
# background job so the notebook can continue with the following cells.
bash gpu_usage.sh

Starting job # 2 in a separate thread.


In [14]:
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Entity labels that main() below registers on the NER component via
# ner.add_label(). Edit this list to match the labels present in the training data.
# NOTE(review): these look like IOB-style tags (B-/I- prefixes plus 'O') rather
# than bare entity types, and each string is added as its own label — confirm
# that this is intended.
LABEL = ['I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O', 'I-per', 'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org', 'I-gpe']

# Legend for the tag suffixes used in LABEL. It is a bare string expression
# (not assigned to anything) that serves purely as documentation.
"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""
# Load the training examples from the pickle file on Drive (the path is
# relative, so it depends on the current working directory).
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you produced yourself.
with open ('gdrive/MyDrive/Content/json_to_spacy.spacy', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)

# @plac.annotations(
#     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
#     new_model_name=("New model name for model meta.", "option", "nm", str),
#     output_dir=("Optional output directory", "option", "o", Path),
#     n_iter=("Number of training iterations", "option", "n", int))

def main(model=None, new_model_name='new_model', output_dir=None,
         n_iter=25):
    """Set up a spaCy pipeline with an NER component, train it, and save it.

    Args:
        model: Name or path handed to ``spacy.load``. When None, a blank 'en'
            pipeline is created instead.
        new_model_name: Stored in ``nlp.meta['name']`` before the pipeline is
            written to disk.
        output_dir: Directory to save the trained pipeline to (created,
            including missing parents, if necessary). Nothing is saved when None.
        n_iter: Number of passes over ``TRAIN_DATA``.

    Side effects:
        Reads the module-level ``LABEL`` and ``TRAIN_DATA``; ``TRAIN_DATA`` is
        shuffled in place on every iteration. Prints the per-iteration losses
        and the entities found in a fixed test sentence.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Bind reset_weights in BOTH branches. A misspelled assignment in the first
    # branch previously left the name undefined, so loading a model without an
    # 'ner' pipe crashed with NameError at the check below.
    # NOTE(review): with these values a loaded model that already has 'ner'
    # goes through begin_training(), while a loaded model that just received a
    # new 'ner' pipe goes through create_optimizer(). That looks inverted —
    # confirm the intended behaviour before training from a pretrained model.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
        reset_weights = False
    else:
        ner = nlp.get_pipe('ner')
        reset_weights = True

    for i in LABEL:
        ner.add_label(i)   # Add new entity labels to entity recognizer

    if model is None or reset_weights:
        optimizer = nlp.begin_training()
        print("Training begun")
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    print(len(TRAIN_DATA))
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}

            # Batch size is drawn from the compounding(4., 128., 1.001) schedule.
            batches = minibatch(TRAIN_DATA,
                                size=compounding(4., 128., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                # Updating the weights
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=0.35, losses=losses)

            # One summary line per iteration: losses accumulated over all batches.
            print("*"*20)
            print(losses)

    # Test the trained model
    test_text = "Peter Strzok, the F.B.I. agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigations, has been fired."
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_,'-', ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True lets a nested output path work; exist_ok=True replaces
        # the separate exists() check.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_,'-', ent.text)

# if __name__ == '__main__':
#     plac.call(main)

# Train from a blank 'en' pipeline for 25 iterations and save the result to the
# relative directory "ner_out" (resolved against the current working directory,
# not the 'gdrive/...' path used for the data).
main(model=None,
    new_model_name="ner_new",
    output_dir="ner_out",
    n_iter=25)

Created blank 'en' model
Training begun
1048575


  **kwargs
  **kwargs


********************
{'ner': 104680.25584310446}
********************
{'ner': 94191.99758931434}
********************
{'ner': 90920.22853269053}
********************
{'ner': 89082.45212023689}
********************
{'ner': 87856.99479971964}
********************
{'ner': 86931.5795251533}
********************
{'ner': 86058.55712457711}
********************
{'ner': 85787.25580748363}
********************
{'ner': 85184.41469945524}
********************
{'ner': 84880.47813810405}
********************
{'ner': 84467.49875361068}
********************
{'ner': 84127.74693301113}
********************
{'ner': 83935.50906732537}
********************
{'ner': 83543.31740390894}
********************
{'ner': 83303.04972089014}
********************
{'ner': 83099.46528229535}
********************
{'ner': 82990.55022321713}
********************
{'ner': 82942.89307262153}
********************
{'ner': 82506.54251249382}
********************
{'ner': 82453.95765366348}
********************
{'ner': 82308.54327

In [15]:
# Print the spaCy version again (same check as in the GPU setup cell).
print(spacy.__version__)


2.3.5
