In [None]:
## In this notebook i try to build POS tagging and NER (Name entity recognision)

In [1]:
import spacy

In [2]:
nlp_tagg = spacy.load("en_core_web_sm")

In [3]:
text = """Background and context
In 1636, the Adil Shahi sultanate of Bijapur invaded the kingdoms to its south.[6] The sultanate had recently become a tributary state of the Mughal empire.[6][32] It was being helped by Shahaji, who at the time was a chieftain in the Maratha uplands of western India. Shahaji was looking for opportunities of rewards of jagir land in the conquered territories, the taxes on which he could collect as an annuity.[6]

Shahaji was a rebel from brief Mughal service. Shahaji's campaigns against the Mughals, supported by the Bijapur government, were generally unsuccessful. He was constantly pursued by the Mughal army, and Shivaji and his mother Jijabai had to move from fort to fort.[33]


Young Shivaji (right) meets his father Shahaji. (left)
In 1636, Shahaji joined in the service of Bijapur and obtained Poona as a grant. Shahaji, being deployed in Bangalore by the Bijapuri ruler Adilshah, appointed Dadoji Kondadeo as Poona's administrator. Shivaji and Jijabai settled in Poona.[34] Kondadeo died in 1647 and Shivaji took over its administration. One of his first acts directly challenged the Bijapuri government."""

In [4]:
doc = nlp_tagg(text)

In [None]:
for word in doc:
  print(word , "==>",word.pos_,"==>",spacy.explain(word.pos_))

Background ==> NOUN ==> noun
and ==> CCONJ ==> coordinating conjunction
context ==> NOUN ==> noun

 ==> SPACE ==> space
In ==> ADP ==> adposition
1636 ==> NUM ==> numeral
, ==> PUNCT ==> punctuation
the ==> DET ==> determiner
Adil ==> PROPN ==> proper noun
Shahi ==> PROPN ==> proper noun
sultanate ==> NOUN ==> noun
of ==> ADP ==> adposition
Bijapur ==> PROPN ==> proper noun
invaded ==> VERB ==> verb
the ==> DET ==> determiner
kingdoms ==> NOUN ==> noun
to ==> ADP ==> adposition
its ==> PRON ==> pronoun
south.[6 ==> NUM ==> numeral
] ==> PUNCT ==> punctuation
The ==> DET ==> determiner
sultanate ==> NOUN ==> noun
had ==> AUX ==> auxiliary
recently ==> ADV ==> adverb
become ==> VERB ==> verb
a ==> DET ==> determiner
tributary ==> NOUN ==> noun
state ==> NOUN ==> noun
of ==> ADP ==> adposition
the ==> DET ==> determiner
Mughal ==> PROPN ==> proper noun
empire.[6][32 ==> PROPN ==> proper noun
] ==> PUNCT ==> punctuation
It ==> PRON ==> pronoun
was ==> AUX ==> auxiliary
being ==> AUX ==> au

In [None]:
for tagg in nlp_tagg.get_pipe("tagger").labels:
  print(tagg ,"==>",spacy.explain(tagg))

$ ==> symbol, currency
'' ==> closing quotation mark
, ==> punctuation mark, comma
-LRB- ==> left round bracket
-RRB- ==> right round bracket
. ==> punctuation mark, sentence closer
: ==> punctuation mark, colon or ellipsis
ADD ==> email
AFX ==> affix
CC ==> conjunction, coordinating
CD ==> cardinal number
DT ==> determiner
EX ==> existential there
FW ==> foreign word
HYPH ==> punctuation mark, hyphen
IN ==> conjunction, subordinating or preposition
JJ ==> adjective (English), other noun-modifier (Chinese)
JJR ==> adjective, comparative
JJS ==> adjective, superlative
LS ==> list item marker
MD ==> verb, modal auxiliary
NFP ==> superfluous punctuation
NN ==> noun, singular or mass
NNP ==> noun, proper singular
NNPS ==> noun, proper plural
NNS ==> noun, plural
PDT ==> predeterminer
POS ==> possessive ending
PRP ==> pronoun, personal
PRP$ ==> pronoun, possessive
RB ==> adverb
RBR ==> adverb, comparative
RBS ==> adverb, superlative
RP ==> adverb, particle
SYM ==> symbol
TO ==> infinitival 

## NER

In [None]:
ner = spacy.load("en_core_web_sm")

In [None]:
doc_ner = ner(text)

In [None]:
doc_ner.ents[5].label_

'LOC'

In [None]:
for word in doc_ner.ents:
  print(word.text , "==>",word.label_)

1636 ==> DATE
Adil Shahi ==> ORG
Bijapur ==> GPE
the Mughal empire.[6][32] ==> ORG
Shahaji ==> ORG
Maratha ==> LOC
India ==> GPE
Mughal ==> PERSON
Shahaji ==> ORG
Mughals ==> ORG
Mughal ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
fort ==> GPE
Young Shivaji ==> PERSON
Shahaji ==> PERSON
1636 ==> DATE
Shahaji ==> ORG
Bijapur ==> GPE
Poona ==> PERSON
Bijapuri ==> LOC
Adilshah ==> PERSON
Dadoji Kondadeo ==> PERSON
Poona ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
1647 ==> DATE
Shivaji ==> PERSON
One ==> CARDINAL
first ==> ORDINAL
Bijapuri ==> LOC


In [None]:
from spacy import displacy


In [None]:
displacy.render(doc_ner.ents,style="ent",jupyter=True)

# Added custom word (Dictionary based)

In [None]:
from spacy.tokens import Span

In [None]:
doc_ner[8:9]

Adil

In [None]:
Span_= Span(doc_ner, 8, 9, label = "Name")

In [None]:
doc_ner.set_ents([Span_],default='unmodified')

In [None]:
for ents in doc_ner.ents:
  print(ents.text ,"==>" ,ents.label_)

1636 ==> DATE
Adil ==> Name
Shahi ==> ORG
Bijapur ==> GPE
the Mughal empire.[6][32] ==> ORG
Shahaji ==> ORG
Maratha ==> LOC
India ==> GPE
Mughal ==> PERSON
Shahaji ==> ORG
Mughals ==> ORG
Mughal ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
fort ==> GPE
Young Shivaji ==> PERSON
Shahaji ==> PERSON
1636 ==> DATE
Shahaji ==> ORG
Bijapur ==> GPE
Poona ==> PERSON
Bijapuri ==> LOC
Adilshah ==> PERSON
Dadoji Kondadeo ==> PERSON
Poona ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
1647 ==> DATE
Shivaji ==> PERSON
One ==> CARDINAL
first ==> ORDINAL
Bijapuri ==> LOC


# Custom word added in NER (Rule Based)

In [None]:
from spacy.pipeline import EntityRuler

In [None]:
pattern = [{"label":"PHONE","pattern":"1636"}]

In [None]:
ruler = ner.add_pipe("entity_ruler",before="ner")

In [None]:
ruler.add_patterns(pattern)

In [None]:
for ents in doc_ner.ents:
  print(ents.text ,"==>" ,ents.label_)

1636 ==> DATE
Adil ==> Name
Shahi ==> ORG
Bijapur ==> GPE
the Mughal empire.[6][32] ==> ORG
Shahaji ==> ORG
Maratha ==> LOC
India ==> GPE
Mughal ==> PERSON
Shahaji ==> ORG
Mughals ==> ORG
Mughal ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
fort ==> GPE
Young Shivaji ==> PERSON
Shahaji ==> PERSON
1636 ==> DATE
Shahaji ==> ORG
Bijapur ==> GPE
Poona ==> PERSON
Bijapuri ==> LOC
Adilshah ==> PERSON
Dadoji Kondadeo ==> PERSON
Poona ==> PERSON
Shivaji ==> PERSON
Jijabai ==> GPE
1647 ==> DATE
Shivaji ==> PERSON
One ==> CARDINAL
first ==> ORDINAL
Bijapuri ==> LOC


#Fine tunning

In [1]:
## Labeling

##prodi.g

In [5]:
import spacy
from spacy.tokens import DocBin

In [6]:
nlp = spacy.blank("en")
db = DocBin()



In [7]:
import json
file = open("/content/annotations.json")
train_data = json.load(file)

In [8]:
train_data

{'classes': ['MAHARAJ', 'VARSH', 'DUSHMAN', 'RAJMATA'],
 'annotations': [['Shivaji was the founder of the Maratha Empire. He was born in the Shivneri Fort in Maharashtra probably on 19 February 1630. He is named after a local goddess, Shivai Devi.\r\n\r\nShivaji is one of the revered historical figures of Maharashtra. He created an independent and sovereign state in the Maharashtra region. His mother, Raajmaata Jijabai was the daughter of Shri.Lakhuji Jadhavrao of Sindkhed. His father Shri.Shahajiraje Bhosale was a Maratha general in the Deccan.\r\n\r\nMost of the territory in Maharashtra was then under the possession of the Nizamshah of Ahmednagar and the Adilshah of Bijapur who were known as the Deccan sultanates. The Mughals launched a campaign to conquer the Nizamshahi Kingdom. The Adilshah of Bijapur allied with the Mughals in this campaign. Shri. Shahajiraje Bhosale tried to rebel, but he could not withstand the combined might of the Mughals and the Adilshahi. The Nizamshahi king

In [9]:
for text ,annotation in train_data["annotations"]:
  doc = nlp.make_doc(text)
  entity = []
  for start , end , label in annotation["entities"]:
    span = doc.char_span(start , end ,label,alignment_mode='contract')
    if span is None:
      print("none")
    else:
      entity.append(span)
    doc.ents = entity
    db.add(doc)

  db.to_disk("training_data.spacy")

In [10]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    392.11    4.49    4.65    4.34    0.04
  4     200        322.29  11440.21   70.10   76.56   64.65    0.70
  9     400        289.47   4005.92   75.53   65.18   89.80    0.76
 13     600        270.42   3802.82   67.27   84.09   56.06    0.67
 18     800        242.65   3666.12   76.21   65.76   90.61    0.76
 22    1000        326.75   3822.18   71.38   80.30   64.24    0.71
 27    1200        223.87   3626.16   75.82   75.00   76.67    0.76
 31    1400        233.03   3544.33   73.58   78.18   69.49    0.74
 36    1600        231.31   3472.21   76.56   72.73   80.

In [12]:
# Make predictios from above

In [15]:
train_ner = spacy.load("/content/model-best")



In [19]:
text ="""Shivaji was the founder of the Maratha Empire. He was born in the Shivneri Fort in Maharashtra probably on 19 February 1630. He is named after a local goddess, Shivai Devi.

Shivaji is one of the revered historical figures of Maharashtra. He created an independent and sovereign state in the Maharashtra region. His mother, Raajmaata Jijabai was the daughter of Shri.Lakhuji Jadhavrao of Sindkhed. His father Shri.Shahajiraje Bhosale was a Maratha general in the Deccan.

Most of the territory in Maharashtra was then under the possession of the Nizamshah of Ahmednagar and the Adilshah of Bijapur who were known as the Deccan sultanates. The Mughals launched a campaign to conquer the Nizamshahi Kingdom. The Adilshah of Bijapur allied with the Mughals in this campaign. Shri. Shahajiraje Bhosale tried to rebel, but he could not withstand the combined might of the Mughals and the Adilshahi. The Nizamshahi kingdom came to an end in 1636. Thereafter Shri.Shahajiraje became a Sardar of the Adilshah of Bijapur and was posted in Karnataka. The region comprising Pune, Supe, Indapur, and Chakan Parganas located between the Bhima and Nira rivers which were vested in Shri. Shahdara as a jagir was continued by the Adilshah. Shri. Shahdara was also given a jagir of Bangalore. Rajmata Jijabai and Chh. Shivaji , stayed for a few years with Shahaji Raje at Bangalore until Shivaji I was twelve years old. Shri. Shahaji entrusted the administration of the Lakshmi jagir to Shivaji and Mata Jijabai.
Shivaji grew up amidst the hills and valleys of the Pune region under the guidance of his mother Raajmata Jijabai. His mother taught him the stories of the Hindu epics and scriptures such as Mahabharata and also stories of old mighty Hindu kingdoms such as the Vijayanagara. He was also trained in guerrilla warfare. He defeated the Mughal forces of Rai Bagan.
Foundation of the Maratha kingdom

Several small spurs run eastward from the sahyadri in the Pune region. The extremely rugged valleys enclosed by these are usually known as the Mavals or Khores each named after the stream running through it, or after the principal village. Collectively they are known as the Mavals. The inhabitants of this region who are called the Mavalas were extremely hardy people. Shivaji started the work of founding the Swaraj - self-rule - in this region. He used the geographical features of the Maval region. He created a feeling of trust and affection in minds of the people. Many associates, companions, and Mavalas joined him in his work of founding Swaraj. The objective of Shivaji in founding the Swaraj is clearly expressed in his official seal or Mudra which is in Sanskrit. Through this Mudra, Shivaji Maharaj assured his people that ‘ever-increasing like the crescent moon, the kingdom of Shivaji, son of Shahaji, will always seek the welfare of the people.
The Capture of Javali.

The region of Javali in the Satara district was important. Many routes to the Konkan were through Javali. For the expansion of the Swaraj in the Konkan, it was essential to control that region. The region of Javali was ruled by Chandrarao More, a powerful Sardar in the Adilshahi. Shivaji attacked Javali and captured it in A.D. 1656. Then he also captured Rairi. This strong fort, later under the name of Raigad, was to become his capital. He built the Pratapgad fort in the Javali valley to protect the newly conquered territory and to control the Paar pass. The victory at Javali led to the expansion of the Swaraj in Konkan. Shivaji then crossed the Ghats and descended into Konkan. He captured Kalyan and Bhivandi on the Konkan coast which were under the control of Adilshahi. Shivaji also captured forts like Mahuli, Lohagad, Tunga, Tikona, Visapur, Songad, Karnala, Tala and Ghosala, in the Konkan. Shivaji was able to command the coast line because of his acquisition of this territory in the Konkan. He came in contact with the Portuguese, the British and the Siddi powers on the western coast. The Siddi controlled the fort of Janjira and the areas around, including Danda-Rajpuri. Wherever in future these powers created obstacles in the work of expanding the Swaraj, Shivaji tried to curb their activities.
The Establishment of Maratha Navy"""

In [20]:
doc = train_ner(text)

In [21]:
for ents in doc.ents:
  print(ents.text ,"==>" ,ents.label_)

Shivaji ==> MAHARAJ
Shivaji ==> MAHARAJ
Raajmaata Jijabai ==> RAJMATA
Shri.Shahajiraje Bhosale ==> MAHARAJ
Nizamshah ==> DUSHMAN
Adilshah ==> DUSHMAN
Mughals ==> DUSHMAN
Nizamshahi ==> DUSHMAN
Adilshah ==> DUSHMAN
Mughals ==> DUSHMAN
Shri. Shahajiraje ==> MAHARAJ
Mughals ==> DUSHMAN
Adilshahi. ==> DUSHMAN
Nizamshahi kingdom ==> DUSHMAN
1636. ==> VARSH
Shri.Shahajiraje ==> MAHARAJ
Adilshah ==> DUSHMAN
Adilshah. ==> DUSHMAN
Rajmata Jijabai ==> RAJMATA
Chh. Shivaji ==> MAHARAJ
Shahaji Raje ==> MAHARAJ
Shivaji ==> MAHARAJ
Shri. Shahaji ==> MAHARAJ
Shivaji ==> MAHARAJ
Mata Jijabai. ==> RAJMATA
Raajmata Jijabai. ==> RAJMATA
Mughal ==> DUSHMAN


In [24]:
from transformers import pipeline

In [25]:
ner_model = pipeline(model = "dslim/bert-base-NER-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [26]:
ner_model(text)

[{'entity': 'B-PER',
  'score': 0.9824008,
  'index': 1,
  'word': 'shiva',
  'start': 0,
  'end': 5},
 {'entity': 'I-PER',
  'score': 0.78100306,
  'index': 2,
  'word': '##ji',
  'start': 5,
  'end': 7},
 {'entity': 'B-MISC',
  'score': 0.8752444,
  'index': 8,
  'word': 'mara',
  'start': 31,
  'end': 35},
 {'entity': 'I-MISC',
  'score': 0.9069552,
  'index': 9,
  'word': '##tha',
  'start': 35,
  'end': 38},
 {'entity': 'B-LOC',
  'score': 0.98736435,
  'index': 17,
  'word': 'shi',
  'start': 66,
  'end': 69},
 {'entity': 'I-LOC',
  'score': 0.9269225,
  'index': 18,
  'word': '##vn',
  'start': 69,
  'end': 71},
 {'entity': 'I-LOC',
  'score': 0.98827666,
  'index': 19,
  'word': '##eri',
  'start': 71,
  'end': 74},
 {'entity': 'B-LOC',
  'score': 0.99621946,
  'index': 22,
  'word': 'maharashtra',
  'start': 83,
  'end': 94},
 {'entity': 'B-PER',
  'score': 0.903849,
  'index': 37,
  'word': 'shiva',
  'start': 160,
  'end': 165},
 {'entity': 'I-PER',
  'score': 0.45400006,
  