In [77]:
from io import open
from conllu import parse_incr


# Parse every sentence of the CoNLL-U file up front, then echo each one.
# The handle stays open on purpose: a later cell passes `data` to parse_incr again.
data = open("text-1.conllu", "r", encoding="utf-8")
sentences = list(parse_incr(data))
for tokenlist in sentences:
    print(tokenlist)


TokenList<Hi, Ramesh, ,, This, is, with, reference, to, our, call, in, the, morning, .>
TokenList<As, discussed, ,, I, would, be, requiring, Melanalin, -, 109B, .>
TokenList<The, quantity, required, will, be, 100, ton, .>
TokenList<Hope, you, will, be, able, to, fulfill, this, order, .>
TokenList<Thanks, and, regards, ,, Suresh>


In [78]:
# Keep the second parsed sentence (index 1) for closer inspection and display it.
sentence = sentences[1]
sentence


TokenList<As, discussed, ,, I, would, be, requiring, Melanalin, -, 109B, .>

In [79]:
# Take the token at index 7 of the selected sentence and display its fields.
token = sentence[7]
token

{'id': 8,
 'form': 'Melanalin',
 'lemma': '_',
 'upos': '_',
 'xpos': None,
 'feats': None,
 'head': None,
 'deprel': '_',
 'deps': None,
 'misc': {'SpaceAfter': 'No'}}

In [80]:
# Show the metadata attached to the selected sentence.
sentence.metadata

{'text': 'As discussed, I would be requiring Melanalin-109B.'}

In [81]:
# `data` was already read to the end by the first parse_incr loop, so rewind it;
# without this the second pass yields no sentences and the loop prints nothing.
data.seek(0)

# Re-parse with a custom metadata parser that splits any "tagset" value on "|".
# Fresh names are used so the `sentences` list and the selected `sentence`
# from the earlier cells are not overwritten.
tagged_sentences = parse_incr(
    data,
    metadata_parsers={"tagset": lambda key, value: (key, value.split("|"))},
)
for tagged_sentence in tagged_sentences:
    print(tagged_sentence.metadata)

### Named Entity Recognition (NER)

In [1]:
import csv, pprint


class convert_conll2spacy(object):
    """Convert a space-separated, two-column CoNLL file into (text, entities) pairs.

    Each non-blank line is expected to hold ``TOKEN TAG``; a blank line ends a
    sentence. Tokens of a sentence are joined with single spaces and every
    token whose tag contains one of ``ENTITY_TAGS`` is recorded as a
    ``(start, end, tag)`` character span into that joined text.
    """

    # Tags looked for (by substring match) in the second column, in this order.
    ENTITY_TAGS = ('B-PROD', 'I-PROD', 'B-QUAN', 'I-QUAN')

    def __init__(self, file):
        """Remember the path of the CoNLL file to convert."""
        self.file = file

    def convert(self):
        """Read ``self.file`` and build the sentence / entity structures.

        Returns:
            tuple: ``(text_as_list, sentences_as_plain_text, tokenized_list)``

            * ``text_as_list`` -- list of ``(sentence, entities)`` where
              ``entities`` is a list of ``(start, end, tag)`` tuples.
            * ``sentences_as_plain_text`` -- all sentences concatenated
              directly, with no separator between them.
            * ``tokenized_list`` -- first column of every non-blank row, with a
              single ``" "`` entry for every blank row.

        Rows that have neither zero nor exactly two columns contribute only to
        ``tokenized_list``.
        """
        text_as_list = []
        sentences_as_plain_text = ""
        tokenized_list = []

        sentence_as_list = []   # tokens of the sentence being built
        entities = []           # spans of the sentence being built
        offset = 0              # character offset of the next token in the joined sentence

        # newline='' is what the csv module expects from its input file; an
        # explicit encoding keeps the computed character offsets independent
        # of the platform's default locale.
        with open(self.file, 'r', encoding='utf-8', newline='') as devset:
            content = csv.reader(devset, delimiter=' ', skipinitialspace=True, quotechar=None)

            for row in content:
                if len(row) == 0:
                    # Blank line: close the current sentence and start a new one.
                    tokenized_list.append(" ")
                    sentence = " ".join(sentence_as_list)
                    sentences_as_plain_text += sentence
                    text_as_list.append((sentence, entities))
                    sentence_as_list = []
                    entities = []
                    offset = 0
                    continue

                tokenized_list.append(row[0])
                if len(row) != 2:
                    continue

                token, tag = row
                for label in self.ENTITY_TAGS:
                    if label in tag:
                        entities.append((offset, offset + len(token), label))

                sentence_as_list.append(token)
                # +1 accounts for the single space inserted by " ".join().
                offset += len(token) + 1

        # A file that does not end with a blank line would otherwise lose its
        # final sentence.
        if sentence_as_list:
            sentence = " ".join(sentence_as_list)
            sentences_as_plain_text += sentence
            text_as_list.append((sentence, entities))

        return text_as_list, sentences_as_plain_text, tokenized_list

In [2]:
# Convert one annotated e-mail and unpack the three values convert() returns.
Cv = convert_conll2spacy("./annotated emails/email_1.conll")
text_as_list, sentences_as_plain_text, tokenized_list = Cv.convert()


In [3]:
# Dump the (sentence, entities) pairs produced by the conversion.
print(text_as_list)

[('Hello , Manager I am looking for differents products from you .', []), ('Their quantity and products Names are 8,621 x Acetylene , 4,752 x Grignard , 4,175 x Ethoxide , 5,247 x Methylene , 4,681 x Benzolide .', [(38, 43, 'B-QUAN'), (46, 55, 'B-PROD'), (58, 63, 'B-QUAN'), (66, 74, 'B-PROD'), (77, 82, 'B-QUAN'), (85, 93, 'B-PROD'), (96, 101, 'B-QUAN'), (104, 113, 'B-PROD'), (116, 121, 'B-QUAN'), (124, 133, 'B-PROD')]), ('Very truly yours , Mayur', [])]


In [4]:
def convert2json(text_as_list):
    """Wrap each (sentence, entities) pair as ``(sentence, {"entities": entities})``.

    Args:
        text_as_list: iterable of indexable items whose element 0 is the
            sentence text and element 1 is its list of entity spans.

    Returns:
        list: one ``(sentence, {"entities": entities})`` tuple per input item,
        in input order. The entity list is referenced, not copied.
    """
    return [
        tuple([item[0], {"entities": item[1]}])
        for item in text_as_list
    ]

In [5]:
# Display the converted pairs wrapped as (sentence, {"entities": [...]}) tuples.
convert2json(text_as_list)

[('Hello , Manager I am looking for differents products from you .',
  {'entities': []}),
 ('Their quantity and products Names are 8,621 x Acetylene , 4,752 x Grignard , 4,175 x Ethoxide , 5,247 x Methylene , 4,681 x Benzolide .',
  {'entities': [(38, 43, 'B-QUAN'),
    (46, 55, 'B-PROD'),
    (58, 63, 'B-QUAN'),
    (66, 74, 'B-PROD'),
    (77, 82, 'B-QUAN'),
    (85, 93, 'B-PROD'),
    (96, 101, 'B-QUAN'),
    (104, 113, 'B-PROD'),
    (116, 121, 'B-QUAN'),
    (124, 133, 'B-PROD')]}),
 ('Very truly yours , Mayur', {'entities': []})]

In [6]:
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [10]:

def training(model=None, output_dir=None, n_iter=100,
             data_dir="./annotated emails", n_files=21):
    """Load the model, set up the pipeline and train the entity recognizer.

    Training examples are read from ``<data_dir>/email_1.conll`` through
    ``<data_dir>/email_<n_files>.conll`` via ``convert_conll2spacy`` and
    ``convert2json``.

    Args:
        model: name or path of an existing spaCy model to continue training;
            ``None`` starts from a blank English pipeline.
        output_dir: directory to save the trained pipeline to; ``None`` skips
            saving and the reload check.
        n_iter: number of passes over the training data.
        data_dir: directory holding the ``email_<i>.conll`` files.
        n_files: number of consecutively numbered files (starting at 1) to load.

    Raises:
        ValueError: if no training example could be loaded.

    NOTE(review): ``create_pipe`` / ``add_pipe(component)`` /
    ``nlp.update(texts, annotations)`` look like the spaCy 2.x calling
    convention -- confirm the installed version before upgrading spaCy.
    """
    TRAIN_DATA = []
    for i in range(1, n_files + 1):
        file_name = "%s/email_%d.conll" % (data_dir, i)
        Cv = convert_conll2spacy(file_name)
        text_as_list, sentences_as_plain_text, tokenized_list = Cv.convert()
        TRAIN_DATA.extend(convert2json(text_as_list))
    print(len(TRAIN_DATA))
    if not TRAIN_DATA:
        raise ValueError("No training examples found in %r" % data_dir)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # register every label that occurs in the training data
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # smoke-test the trained model: every training text must run through it
    for text, _ in TRAIN_DATA:
        nlp(text)
    print(output_dir)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: nested or already existing directories are fine
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # smoke-test the saved model by reloading it from disk
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            nlp2(text)




In [11]:
def testing(s, model_dir=None):
    """Run a saved spaCy pipeline on ``s`` and print the entities it finds.

    Args:
        s: text to analyse.
        model_dir: directory of the saved pipeline. When ``None`` the
            module-level ``output_dir`` variable is used if it exists
            (the previous implicit behaviour), otherwise ``"./nlp"``.
    """
    if model_dir is None:
        # Fall back to the notebook-level variable instead of failing with a
        # NameError when the cell that defines it has not been run.
        model_dir = globals().get("output_dir", "./nlp")
    nlp = spacy.load(model_dir)
    doc = nlp(s)
    print("Entities in '%s'" % s)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    

In [12]:
if __name__ == "__main__":
    # testing() locates the saved model through this module-level name, so
    # train into exactly the same directory instead of repeating the literal.
    output_dir = "./nlp"
    training(output_dir=output_dir)
    s = input("Enter any text: ")
    testing(s)
    

86
Created blank 'en' model


  proc.begin_training(


Losses {'ner': 468.7806473376113}
Losses {'ner': 191.97906185360625}
Losses {'ner': 144.03741665799149}
Losses {'ner': 84.48628679424885}
Losses {'ner': 64.98362858186886}
Losses {'ner': 40.8949210172205}
Losses {'ner': 24.088203574147425}
Losses {'ner': 23.80790276051456}
Losses {'ner': 20.470228893705997}
Losses {'ner': 9.36006137107237}
Losses {'ner': 11.405717174735658}
Losses {'ner': 6.510289813501757}
Losses {'ner': 4.417241689669931}
Losses {'ner': 8.744494100379768}
Losses {'ner': 10.032167074484224}
Losses {'ner': 5.0593042704165505}
Losses {'ner': 11.229817473524681}
Losses {'ner': 6.57498925865661}
Losses {'ner': 4.211313066415103}
Losses {'ner': 2.909562610733797}
Losses {'ner': 4.740391844110788}
Losses {'ner': 5.9740649112128885}
Losses {'ner': 9.410325130419821}
Losses {'ner': 4.091918190700192}
Losses {'ner': 7.555363493755213}
Losses {'ner': 4.316653373438908}
Losses {'ner': 2.408570138004756}
Losses {'ner': 9.421942165590224}
Losses {'ner': 1.2977967971710962}
Losses 

In [19]:
# Run the saved model on a fixed sample mail and print the detected entities.
testing(" Hi Tarun, need  100, 400 and 300 tons of Epoxy, Boxofine and Ramflyn respectively. Hope you can deliver on time. Thanks and regards, Anisha")

Entities in ' Hi Tarun, need  100, 400 and 300 tons of Epoxy, Boxofine and Ramflyn respectively. Hope you can deliver on time. Thanks and regards, Anisha'
B-QUAN 100
B-QUAN 400
B-QUAN 300
B-PROD Epoxy
B-PROD Boxofine
B-PROD Ramflyn


In [18]:
# Read a mail text from the user and print the entities the saved model finds.
s = input("Enter mail: ")
testing(s)

Enter mail: Hi Tarun, need  100, 400 and 300 tons of Epoxy, Boxofine and Ramflyn respectively. Hope you can deliver on time. Thanks and regards, Anisha
Entities in 'Hi Tarun, need  100, 400 and 300 tons of Epoxy, Boxofine and Ramflyn respectively. Hope you can deliver on time. Thanks and regards, Anisha'
B-QUAN 100
B-QUAN 400
B-QUAN 300
B-PROD Epoxy
B-PROD Boxofine
B-PROD Ramflyn
