In [None]:
import spacy
nlp = spacy.load('en_core_web_sm') # language model
import pandas as pd
import numpy as np
import random

In [None]:
text = 'Tesla, Inc. is an American electric vehicle and clean energy company based in Palo Alto, California, United States. The company announced plans to move its headquarters to Austin, Texas.'

In [None]:
doc = nlp(text)

In [None]:
print(doc)

Tesla, Inc. is an American electric vehicle and clean energy company based in Palo Alto, California, United States. The company announced plans to move its headquarters to Austin, Texas.


In [None]:
temp_df = pd.DataFrame([(x.text, x.pos_, x.tag_, x.dep_, x.lemma_, x.is_stop) for x in doc], 
                       columns = ['Text', 'Postag', 'Tag', 'Relationship', 'Lemmatize', 'Stopword'])

In [None]:
temp_df

Unnamed: 0,Text,Postag,Tag,Relationship,Lemmatize,Stopword
0,Tesla,PROPN,NNP,npadvmod,Tesla,False
1,",",PUNCT,",",punct,",",False
2,Inc.,PROPN,NNP,nsubj,Inc.,False
3,is,AUX,VBZ,ROOT,be,True
4,an,DET,DT,det,an,True
5,American,ADJ,JJ,amod,american,False
6,electric,ADJ,JJ,amod,electric,False
7,vehicle,NOUN,NN,attr,vehicle,False
8,and,CCONJ,CC,cc,and,True
9,clean,ADJ,JJ,amod,clean,False


In [None]:
entity_details = pd.DataFrame([(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
                              columns= ['Text', 'START', 'END', 'LABELS'])

In [None]:
entity_details

Unnamed: 0,Text,START,END,LABELS
0,"Tesla, Inc.",0,11,ORG
1,American,18,26,NORP
2,Palo Alto,78,87,GPE
3,California,89,99,GPE
4,United States,101,114,GPE
5,Austin,172,178,GPE
6,Texas,180,185,GPE


In [None]:
import spacy.displacy as displacy
displacy.render(doc, style='dep', jupyter=True)

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
ner = nlp.get_pipe('ner')

In [None]:
train_dataset = [
      ('Money transfer from my savings account is not working', {'entities':[(0, 14, 'ACTIVITY'), (23, 38, 'PRODUCT')]}),
      ('I want to check balance in my savings account', {'entities':[(10, 15, 'ACTIVITY'), (30, 45, 'PRODUCT')]}),
      ('I suspect a fraud in my credit card account', {'entities':[(12, 17, 'ACTIVITY'), (24, 35, 'PRODUCT')]}),
      ('I am here for opening a new savings account', {'entities':[(14, 21, 'ACTIVITY'), (28, 43, 'PRODUCT')]}),
      ('Your mortgage is in delinquent status', {'entities':[(20, 30, 'ACTIVITY'), (5, 13, 'PRODUCT')]}),
      ('Your credit card is in past due status', {'entities':[(30, 41, 'ACTIVITY'), (5, 16, 'PRODUCT')]}),
      ('My loan account is still not approved and funded', {'entities':[(25, 37, 'ACTIVITY'), (3, 15, 'PRODUCT')]}),
      ('How do I open a new loan account', {'entities':[(9, 13, 'ACTIVITY'), (20, 32, 'PRODUCT')]}),
      ('What are the charges on Investment account', {'entities':[(13, 20, 'ACTIVITY'), (24, 42, 'PRODUCT')]}),
      ('Can you explain late charges on my credit card', {'entities':[(16, 28, 'ACTIVITY'), (35, 49, 'PRODUCT')]}),
      ('I want to open a new loan amount', {'entities':[(10, 14, 'ACTIVITY'), (21, 33, 'PRODUCT')]}),
      ('Can you help updating payment on my credit card', {'entities':[(22, 29, 'ACTIVITY'), (36, 47, 'PRODUCT')]}),
      ('When is the payment due date on my card', {'entities':[(12, 19, 'ACTIVITY'), (35, 39, 'PRODUCT')]})
]

In [None]:
train_dataset[1][1].get('entities')[0]

(10, 15, 'ACTIVITY')

In [None]:
ner = nlp.get_pipe('ner')
for _, annotations in train_dataset:
  print(annotations)
  for ent in annotations.get('entities'):
    ner.add_label(ent[2])

{'entities': [(0, 14, 'ACTIVITY'), (23, 38, 'PRODUCT')]}
{'entities': [(10, 15, 'ACTIVITY'), (30, 45, 'PRODUCT')]}
{'entities': [(12, 17, 'ACTIVITY'), (24, 35, 'PRODUCT')]}
{'entities': [(14, 21, 'ACTIVITY'), (28, 43, 'PRODUCT')]}
{'entities': [(20, 30, 'ACTIVITY'), (5, 13, 'PRODUCT')]}
{'entities': [(30, 41, 'ACTIVITY'), (5, 16, 'PRODUCT')]}
{'entities': [(25, 37, 'ACTIVITY'), (3, 15, 'PRODUCT')]}
{'entities': [(9, 13, 'ACTIVITY'), (20, 32, 'PRODUCT')]}
{'entities': [(13, 20, 'ACTIVITY'), (24, 42, 'PRODUCT')]}
{'entities': [(16, 28, 'ACTIVITY'), (35, 49, 'PRODUCT')]}
{'entities': [(10, 14, 'ACTIVITY'), (21, 33, 'PRODUCT')]}
{'entities': [(22, 29, 'ACTIVITY'), (36, 47, 'PRODUCT')]}
{'entities': [(12, 19, 'ACTIVITY'), (35, 39, 'PRODUCT')]}


In [None]:
from tqdm import tqdm
disable_pipe = [pipe for pipe in nlp.pipe_names if pipe!='ner']
with nlp.disable_pipes(*disable_pipe):
  optimizer =   nlp.begin_training()
  for itn in range(100):
    random.shuffle(train_dataset)
    losses = {}
    for text, annotations in tqdm(train_dataset):
      nlp.update(
          [text],
          [annotations],
          drop=0.5,
          sgd = optimizer,
          losses = losses)   
    print(losses)

100%|██████████| 13/13 [00:00<00:00, 26.43it/s]


{'ner': 113.07936515467536}


100%|██████████| 13/13 [00:00<00:00, 26.42it/s]


{'ner': 107.9795924148516}


100%|██████████| 13/13 [00:00<00:00, 26.45it/s]


{'ner': 112.20412764742832}


100%|██████████| 13/13 [00:00<00:00, 25.03it/s]


{'ner': 88.87376726185903}


100%|██████████| 13/13 [00:00<00:00, 25.36it/s]


{'ner': 75.1546391629272}


100%|██████████| 13/13 [00:00<00:00, 25.53it/s]


{'ner': 93.34196401263034}


100%|██████████| 13/13 [00:00<00:00, 25.11it/s]


{'ner': 97.85690017593151}


100%|██████████| 13/13 [00:00<00:00, 24.75it/s]


{'ner': 89.59996363613755}


100%|██████████| 13/13 [00:00<00:00, 25.46it/s]


{'ner': 92.08074621949345}


100%|██████████| 13/13 [00:00<00:00, 25.29it/s]


{'ner': 94.44815315585583}


100%|██████████| 13/13 [00:00<00:00, 25.66it/s]


{'ner': 90.54312379185285}


100%|██████████| 13/13 [00:00<00:00, 24.85it/s]


{'ner': 87.62882887382574}


100%|██████████| 13/13 [00:00<00:00, 25.31it/s]


{'ner': 92.28621137235314}


100%|██████████| 13/13 [00:00<00:00, 24.43it/s]


{'ner': 87.79832853097469}


100%|██████████| 13/13 [00:00<00:00, 25.66it/s]


{'ner': 87.82775439650835}


100%|██████████| 13/13 [00:00<00:00, 25.31it/s]


{'ner': 89.51243474241346}


100%|██████████| 13/13 [00:00<00:00, 24.85it/s]


{'ner': 78.00391511234719}


100%|██████████| 13/13 [00:00<00:00, 24.25it/s]


{'ner': 85.91019174922258}


100%|██████████| 13/13 [00:00<00:00, 24.72it/s]


{'ner': 89.89278247718634}


100%|██████████| 13/13 [00:00<00:00, 24.33it/s]


{'ner': 83.61783270954402}


100%|██████████| 13/13 [00:00<00:00, 24.91it/s]


{'ner': 90.92726388387382}


100%|██████████| 13/13 [00:00<00:00, 24.53it/s]


{'ner': 79.09206296391386}


100%|██████████| 13/13 [00:00<00:00, 25.31it/s]


{'ner': 89.02877461584285}


100%|██████████| 13/13 [00:00<00:00, 24.57it/s]


{'ner': 91.66204603856022}


100%|██████████| 13/13 [00:00<00:00, 24.08it/s]


{'ner': 81.42475690506399}


100%|██████████| 13/13 [00:00<00:00, 25.58it/s]


{'ner': 85.24718673503253}


100%|██████████| 13/13 [00:00<00:00, 25.20it/s]


{'ner': 89.14212850209219}


100%|██████████| 13/13 [00:00<00:00, 25.83it/s]


{'ner': 91.23183580921614}


100%|██████████| 13/13 [00:00<00:00, 24.88it/s]


{'ner': 80.33292561583221}


100%|██████████| 13/13 [00:00<00:00, 24.69it/s]


{'ner': 86.85670787453346}


100%|██████████| 13/13 [00:00<00:00, 24.06it/s]


{'ner': 89.53556828146975}


100%|██████████| 13/13 [00:00<00:00, 24.23it/s]


{'ner': 96.62020789086819}


100%|██████████| 13/13 [00:00<00:00, 24.92it/s]


{'ner': 82.85467815462692}


100%|██████████| 13/13 [00:00<00:00, 24.55it/s]


{'ner': 84.29312124479111}


100%|██████████| 13/13 [00:00<00:00, 25.21it/s]


{'ner': 90.07672809856012}


100%|██████████| 13/13 [00:00<00:00, 25.36it/s]


{'ner': 79.10201393766329}


100%|██████████| 13/13 [00:00<00:00, 24.18it/s]


{'ner': 79.93899281055201}


100%|██████████| 13/13 [00:00<00:00, 25.49it/s]


{'ner': 89.14487367531137}


100%|██████████| 13/13 [00:00<00:00, 25.45it/s]


{'ner': 88.22622527513886}


100%|██████████| 13/13 [00:00<00:00, 25.46it/s]


{'ner': 83.46602181101214}


100%|██████████| 13/13 [00:00<00:00, 25.24it/s]


{'ner': 77.5016157538048}


100%|██████████| 13/13 [00:00<00:00, 24.93it/s]


{'ner': 89.4848497970961}


100%|██████████| 13/13 [00:00<00:00, 24.97it/s]


{'ner': 78.39540030857916}


100%|██████████| 13/13 [00:00<00:00, 25.75it/s]


{'ner': 81.55603521619923}


100%|██████████| 13/13 [00:00<00:00, 25.46it/s]


{'ner': 82.38800556044828}


100%|██████████| 13/13 [00:00<00:00, 24.72it/s]


{'ner': 79.12060554225172}


100%|██████████| 13/13 [00:00<00:00, 24.91it/s]


{'ner': 86.93211889966267}


100%|██████████| 13/13 [00:00<00:00, 25.44it/s]


{'ner': 74.82677642149974}


100%|██████████| 13/13 [00:00<00:00, 26.04it/s]


{'ner': 84.40302540754783}


100%|██████████| 13/13 [00:00<00:00, 25.27it/s]


{'ner': 96.18418283779101}


100%|██████████| 13/13 [00:00<00:00, 25.12it/s]


{'ner': 82.54161280044264}


100%|██████████| 13/13 [00:00<00:00, 25.49it/s]


{'ner': 78.65260927973634}


100%|██████████| 13/13 [00:00<00:00, 26.21it/s]


{'ner': 88.22132503299508}


100%|██████████| 13/13 [00:00<00:00, 25.34it/s]


{'ner': 79.91116084106761}


100%|██████████| 13/13 [00:00<00:00, 26.44it/s]


{'ner': 83.70415910498104}


100%|██████████| 13/13 [00:00<00:00, 25.51it/s]


{'ner': 81.25746639694262}


100%|██████████| 13/13 [00:00<00:00, 25.46it/s]


{'ner': 84.87893119522768}


100%|██████████| 13/13 [00:00<00:00, 25.46it/s]


{'ner': 82.42396296390507}


100%|██████████| 13/13 [00:00<00:00, 26.21it/s]


{'ner': 84.9958556465208}


100%|██████████| 13/13 [00:00<00:00, 26.02it/s]


{'ner': 86.70391004437624}


100%|██████████| 13/13 [00:00<00:00, 26.93it/s]


{'ner': 77.75873772279101}


100%|██████████| 13/13 [00:00<00:00, 26.06it/s]


{'ner': 85.10526457518384}


100%|██████████| 13/13 [00:00<00:00, 25.44it/s]


{'ner': 80.10487183212172}


100%|██████████| 13/13 [00:00<00:00, 26.82it/s]


{'ner': 84.37947011608048}


100%|██████████| 13/13 [00:00<00:00, 26.55it/s]


{'ner': 87.38537083682604}


100%|██████████| 13/13 [00:00<00:00, 26.16it/s]


{'ner': 83.42743369477284}


100%|██████████| 13/13 [00:00<00:00, 27.12it/s]


{'ner': 88.26081335909839}


100%|██████████| 13/13 [00:00<00:00, 25.32it/s]


{'ner': 82.9187949019929}


100%|██████████| 13/13 [00:00<00:00, 26.63it/s]


{'ner': 75.78770194643084}


100%|██████████| 13/13 [00:00<00:00, 25.37it/s]


{'ner': 78.47457245911937}


100%|██████████| 13/13 [00:00<00:00, 26.51it/s]


{'ner': 87.33257378148846}


100%|██████████| 13/13 [00:00<00:00, 26.49it/s]


{'ner': 83.3229137705639}


100%|██████████| 13/13 [00:00<00:00, 26.26it/s]


{'ner': 73.13574529564357}


100%|██████████| 13/13 [00:00<00:00, 26.52it/s]


{'ner': 91.67987016725965}


100%|██████████| 13/13 [00:00<00:00, 26.65it/s]


{'ner': 88.38329426879727}


100%|██████████| 13/13 [00:00<00:00, 26.11it/s]


{'ner': 86.45346040621962}


100%|██████████| 13/13 [00:00<00:00, 27.08it/s]


{'ner': 83.52556407903467}


100%|██████████| 13/13 [00:00<00:00, 26.22it/s]


{'ner': 92.27830717146935}


100%|██████████| 13/13 [00:00<00:00, 26.84it/s]


{'ner': 82.65839994921636}


100%|██████████| 13/13 [00:00<00:00, 25.62it/s]


{'ner': 84.25613163422669}


100%|██████████| 13/13 [00:00<00:00, 27.48it/s]


{'ner': 84.26827491877339}


100%|██████████| 13/13 [00:00<00:00, 25.60it/s]


{'ner': 77.81018926134857}


100%|██████████| 13/13 [00:00<00:00, 27.02it/s]


{'ner': 78.38676336212879}


100%|██████████| 13/13 [00:00<00:00, 26.00it/s]


{'ner': 73.71670542343209}


100%|██████████| 13/13 [00:00<00:00, 26.34it/s]


{'ner': 83.32793729843968}


100%|██████████| 13/13 [00:00<00:00, 25.44it/s]


{'ner': 77.8582295155195}


100%|██████████| 13/13 [00:00<00:00, 26.49it/s]


{'ner': 86.88300600743241}


100%|██████████| 13/13 [00:00<00:00, 26.14it/s]


{'ner': 72.10113226413435}


100%|██████████| 13/13 [00:00<00:00, 26.26it/s]


{'ner': 93.52666945142232}


100%|██████████| 13/13 [00:00<00:00, 25.50it/s]


{'ner': 78.56986949464772}


100%|██████████| 13/13 [00:00<00:00, 26.18it/s]


{'ner': 71.34272157576447}


100%|██████████| 13/13 [00:00<00:00, 25.61it/s]


{'ner': 83.05926676223135}


100%|██████████| 13/13 [00:00<00:00, 26.13it/s]


{'ner': 77.01972896450752}


100%|██████████| 13/13 [00:00<00:00, 25.53it/s]


{'ner': 80.74799029320911}


100%|██████████| 13/13 [00:00<00:00, 26.96it/s]


{'ner': 78.93659094387249}


100%|██████████| 13/13 [00:00<00:00, 25.72it/s]


{'ner': 83.71933137069391}


100%|██████████| 13/13 [00:00<00:00, 26.45it/s]


{'ner': 76.5035617675021}


100%|██████████| 13/13 [00:00<00:00, 25.51it/s]


{'ner': 69.57218544800708}


100%|██████████| 13/13 [00:00<00:00, 26.03it/s]


{'ner': 90.26517775812408}


100%|██████████| 13/13 [00:00<00:00, 25.70it/s]

{'ner': 69.26930466550402}





In [None]:
for text, _ in train_dataset:
  doc = nlp(text)
  print('entities', [(i.text, i.label_) for i in doc.ents])

entities [('check', 'ACTIVITY'), ('savings account', 'PRODUCT')]
entities [('payment', 'ACTIVITY'), ('card', 'PRODUCT')]
entities [('open', 'ACTIVITY'), ('loan account', 'PRODUCT')]
entities [('savings account', 'PRODUCT')]
entities [('credit card', 'PRODUCT')]
entities [('loan account', 'PRODUCT'), ('not approved', 'ACTIVITY')]
entities [('open', 'ACTIVITY'), ('loan amount', 'PRODUCT')]
entities [('Money transfer', 'ACTIVITY'), ('savings account', 'PRODUCT')]
entities [('fraud', 'ACTIVITY')]
entities [('credit card', 'PRODUCT')]
entities [('payment', 'ACTIVITY'), ('credit card', 'PRODUCT')]
entities [('Investment account', 'PRODUCT')]
entities [('mortgage', 'PRODUCT'), ('delinquent', 'ACTIVITY')]


In [None]:
doc = nlp('I want to open a demat account with DCB bank in Kolkata')
displacy.render(doc, style='ent', jupyter=True)