<a href="https://colab.research.google.com/github/ashuxldr/ECI_NER_PROJECT/blob/main/NERmodel_prodigy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

MOUNTING GOOGLE DRIVE WITH COLAB

In [None]:
# Mount the user's Google Drive into the Colab VM at /content/drive.
# Later cells read their inputs through relative paths of the form
# "drive/MyDrive/ECI/..." — those only resolve when the working directory
# is /content (NOTE(review): presumably the Colab default; confirm).
from google.colab import drive
drive.mount('/content/drive')

INSTALLING AND IMPORTING PACKAGES

In [None]:
!pip install spacy

In [None]:
!pip install pandas

In [None]:
!pip install nlpaug

In [None]:
import spacy
from collections import Counter
from pandas import *


TRAIN MODEL USING ANNOTATED DATA FROM PRODIGY

In [None]:
# Train the spaCy pipeline described by config.cfg on the train/dev corpora
# stored in Drive. `--output ./` writes the checkpoints into the current
# working directory (the `model-best/` and `model-last/` folders listed by the
# next cell); `model-best` is what the rule-based section loads afterwards.
!python -m spacy train drive/MyDrive/ECI/config.cfg --output ./ --paths.train drive/MyDrive/ECI/train.spacy --paths.dev drive/MyDrive/ECI/dev.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-05-19 20:59:37,306] [INFO] Set up nlp object from config
[2023-05-19 20:59:37,327] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-05-19 20:59:37,335] [INFO] Created vocabulary
[2023-05-19 20:59:37,337] [INFO] Finished initializing nlp object
[2023-05-19 20:59:43,112] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     48.09    0.00    0.00    0.00    0.00
  0     200        283.14   1437.03   58.88   61.78   56.23    0.59
  1     400       1270.33    671.54   64.80   70.03   60.29    0.65
  2     600         76.21    455.11   73.68   79.07   68.99    0.74
  3     800        309.71    529.08   73.55   73.76   73.33    

In [None]:
ls

[0m[01;34mdrive[0m/  [01;34mmodel-best[0m/  [01;34mmodel-last[0m/  [01;34msample_data[0m/


LOADING THE TRAINED MODEL AND ADDING RULE BASED MATCHING

In [None]:
# Load the best checkpoint written by the training run above.
nlp1 = spacy.load("./model-best")

# Attach an EntityRuler so gazetteer patterns can complement the statistical NER.
# No position is given, so add_pipe appends the ruler to the end of the
# pipeline, i.e. after the trained 'ner' component.
# NOTE(review): with spaCy's default EntityRuler settings, entities already set
# by 'ner' are kept and the ruler only adds non-overlapping matches — confirm
# this precedence is what is wanted.
ruler = nlp1.add_pipe("entity_ruler")

ADDING THE PHRASE AND TOKEN PATTERNS FOR RULE BASED MATCHING FOR STATE, ASSEMBLY AND PARLIAMENTARY CONSTITUENCIES

In [None]:
# READING PC DATA AND ADDING PATTERNS TO MODEL
# Every parliamentary-constituency name becomes one EntityRuler pattern:
#   * one-word names stay exact phrase patterns (same as before);
#   * multi-word names become token patterns matched on the LOWER attribute.
data1 = read_csv("drive/MyDrive/ECI/parliamentary_constituency.csv")
PC = data1['Parliamentary Constituency'].tolist()
pc_patterns = []
for name in PC:
  # Blank CSV cells arrive as NaN (a float); skip them instead of crashing.
  if not isinstance(name, str):
    continue
  # split() without an argument ignores leading, trailing and repeated
  # whitespace, so no empty-string tokens end up in the pattern.
  words = name.split()
  if not words:
    continue
  if len(words) == 1:
    pattern = words[0]
  else:
    # LOWER is compared against the lowercased token text, so the pattern value
    # must itself be lowercase; the CSV casing (e.g. "Uttar") would never match.
    pattern = [{'LOWER': word.lower()} for word in words]
  pc_patterns.append({"label": "PARLIAMENTARY", "pattern": pattern})
ruler.add_patterns(pc_patterns)

In [None]:
# READING STATE DATA AND ADDING PATTERNS TO MODEL
# Every state name becomes one EntityRuler pattern labelled STATE:
#   * one-word names stay exact phrase patterns (same as before);
#   * multi-word names become token patterns matched on the LOWER attribute.
data3 = read_csv("drive/MyDrive/ECI/State.csv")
states = data3['State'].tolist()
state_patterns = []
for name in states:
  # Blank CSV cells arrive as NaN (a float); skip them instead of crashing.
  if not isinstance(name, str):
    continue
  # split() without an argument ignores leading, trailing and repeated
  # whitespace, so no empty-string tokens end up in the pattern.
  words = name.split()
  if not words:
    continue
  if len(words) == 1:
    pattern = words[0]
  else:
    # LOWER is compared against the lowercased token text, so the pattern value
    # must itself be lowercase; "Uttar Pradesh" as written would never match.
    pattern = [{'LOWER': word.lower()} for word in words]
  state_patterns.append({"label": "STATE", "pattern": pattern})
ruler.add_patterns(state_patterns)

In [None]:
# READING ASSEMBLY CONSTITUENCIES AND ADDING THEIR PATTERNS
data = read_csv("drive/MyDrive/ECI/assembly.csv")
AC = data['AC_NAME'].tolist()

# Step 1: clean the names. A trailing "(ST)" / "(SC)" marker is removed by
# cutting the name at its first '('; the result is stripped so that
# "Foo (SC)" becomes "Foo" rather than "Foo " (the stray space used to produce
# an unmatchable empty-token pattern further down).
AC1 = []
for raw_name in AC:
  # Blank CSV cells arrive as NaN (a float); skip them instead of crashing.
  if not isinstance(raw_name, str):
    continue
  name = raw_name.strip()
  if name.endswith(("(ST)", "(SC)")):
    name = name[:name.index('(')].strip()
  AC1.append(name)

# Step 2: build one EntityRuler pattern per cleaned name.
ac_patterns = []
for name in AC1:
  # split() without an argument ignores repeated whitespace, so no
  # empty-string tokens end up in the pattern.
  words = name.split()
  if not words:
    continue
  if len(words) == 1:
    # One-word names stay exact phrase patterns (same as before).
    pattern = words[0]
  else:
    # LOWER is compared against the lowercased token text, so the pattern value
    # must itself be lowercase or the pattern can never match.
    pattern = [{'LOWER': word.lower()} for word in words]
  ac_patterns.append({"label": "ASSEMBLY", "pattern": pattern})

ruler.add_patterns(ac_patterns)

READING INPUT TEXT AND PRINTING ANALYZED ENTITIES

In [None]:
# Sample passage used to sanity-check the combined NER + rule-based pipeline.
sample_text = """
Amritsar, Punjab: Amritsar holds immense religious importance as the home of the Golden Temple, making it a significant parliamentary constituency in Punjab.
Pune, Maharashtra: Pune, known for its educational institutions and IT industry, represents the modern, cosmopolitan character of Maharashtra.
"""

# Run the pipeline and show the entity spans it found.
doc = nlp1(sample_text)
print(doc.ents)

(Amritsar, Punjab, Amritsar, Punjab, Pune, Maharashtra, Pune, Maharashtra)


OUTPUT - TAGGED ENTITIES AS STATE, DISTRICT, ASSEMBLY, PARLIAMENTARY

In [None]:
from spacy import displacy

# One highlight colour per entity label; only these labels are rendered.
colors = {
    'STATE': "#fd7e14",
    'DISTRICT': "#007bff",
    'ASSEMBLY': "#dc3545",
    'PARLIAMENTARY': "#28a745",
}
# The label filter is taken from the colour map's keys (same order as before).
options = {"ents": list(colors), "colors": colors}

# Render the analysed document inline with its entities highlighted.
displacy.render(doc, style="ent", jupyter=True, options=options)

CODE FOR DATA AUGMENTATION

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac

# Seed sentences that the augmenters below will perturb.
text_data = [
    "Uttar Pradesh, a politically significant state in India, encompasses parliamentary constituencies such as Gorakhpur, Lucknow, Varanasi, Firozabad, and Meerut.",
    "Gorakhpur, known for its historical and religious importance, has been a stronghold of the Bharatiya Janata Party (BJP) and was represented by Yogi Adityanath, the current Chief Minister of the state.",
    "Lucknow, the capital city, holds prestige as a parliamentary constituency and has seen notable politicians like former Prime Minister Atal Bihari Vajpayee and the current Defense Minister, Rajnath Singh, representing it.",
    " Varanasi, a city of immense cultural significance, stands out as the constituency of the Prime Minister, Narendra Modi, who has initiated various developmental projects in the area. Firozabad and Meerut constituencies, with their unique socio-economic landscapes, contribute to the diverse political fabric of Uttar Pradesh.",
    " These parliamentary constituencies represent the aspirations, concerns, and diverse demographics of the people, making Uttar Pradesh a crucial state in shaping India's political landscape.",
]

# Pass 1 - word level: replace a share of the words (aug_p=0.3) with WordNet synonyms.
aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
augmented_data = aug.augment(text_data)

print("Augmented Data (Synonym Replacement):")
for idx, augmented in enumerate(augmented_data):
    print(f"{text_data[idx]} => {augmented}")

# Pass 2 - character level: keyboard-typo augmenter with its default settings.
# `aug` and `augmented_data` are deliberately rebound, as in the original cell.
aug = nac.KeyboardAug()
augmented_data = aug.augment(text_data)

print("\nAugmented Data (Keyboard Typos):")
for idx, augmented in enumerate(augmented_data):
    print(f"{text_data[idx]} => {augmented}")

Augmented Data (Synonym Replacement):
Uttar Pradesh, a politically significant state in India, encompasses parliamentary constituencies such as Gorakhpur, Lucknow, Varanasi, Firozabad, and Meerut. => Uttar Pradesh, a politically substantial state of matter in Republic of india, encompasses parliamentary constituency such as Gorakhpur, Lucknow, Varanasi, Firozabad, and Meerut.
Gorakhpur, known for its historical and religious importance, has been a stronghold of the Bharatiya Janata Party (BJP) and was represented by Yogi Adityanath, the current Chief Minister of the state. => Gorakhpur, known for its historical and spiritual importance, suffer been a fastness of the Bharatiya Janata Party (BJP) and was represented by Lawrence peter berra Adityanath, the current Tribal chief Minister of the state.
Lucknow, the capital city, holds prestige as a parliamentary constituency and has seen notable politicians like former Prime Minister Atal Bihari Vajpayee and the current Defense Minister, Raj