In [None]:
from google.colab import userdata
key = userdata.get('GITHUB_TOKEN_KEY')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Internship Tasks

/content/drive/MyDrive/Internship Tasks


In [5]:
!git clone https://github.com/Zayaad-Wajid/Named-Entity-Recognition-NER-from-News-Articles.git

Cloning into 'Named-Entity-Recognition-NER-from-News-Articles'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [6]:
!git config --global user.email "zayaadw@example.com"
!git config --global user.name "Zayaad-Wajid"

In [9]:
%cd /content/drive/MyDrive/Internship Tasks/Named-Entity-Recognition-NER-from-News-Articles

/content/drive/MyDrive/Internship Tasks/Named-Entity-Recognition-NER-from-News-Articles


In [10]:
!git add .

In [12]:
!git commit -m "Initial Commits"

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [13]:
!pwd

/content/drive/MyDrive/Internship Tasks/Named-Entity-Recognition-NER-from-News-Articles


In [None]:
from google.colab import files
uploaded = files.upload()

Saving test.txt to test (2).txt
Saving train.txt to train (2).txt
Saving valid.txt to valid (2).txt


In [None]:
def load_conll_data(file_path):
  sentences = []
  sentence = []

  with open(file_path, "r", encoding = "utf-8") as f:
    for line in f:
      if line.strip() == "":
        sentences.append(sentence)
        sentence = []

      else:
        parts = line.strip().split()
        if len(parts) == 4:
          word, pos, chunk, ner = parts
          sentence.append((word, ner))

  if sentence:
    sentences.append(sentence)

  return sentences


In [None]:
train_data = load_conll_data("train.txt")
test_data = load_conll_data("test.txt")
valid_data = load_conll_data("valid.txt")

In [None]:
print("Sample Sentence: ")
print(train_data[0])
print(train_data[3])

Sample Sentence: 
[('-DOCSTART-', 'O')]
[('BRUSSELS', 'B-LOC'), ('1996-08-22', 'O')]


In [None]:
def identify_entities(sentence):
  entities = []
  current_entity = []
  current_label = None

  for word, tag in sentence:
    if tag.startswith("B-"):
      if current_entity:
        entities.append((" ".join(current_entity), current_label))

      current_entity = [word]
      current_label = tag[2:]

    elif tag.startswith("I-"):
      current_entity.append(word)

    else:
      if current_entity:
        entities.append((" ".join(current_entity), current_label))
        current_entity = []
        current_label = None

  if current_entity:
    entities.append((" ".join(current_entity), current_label))


  return entities

In [None]:
for i in range (5):
  print(f"Sentence {i+1} Entities: ")
  print(identify_entities(train_data[i]))

Sentence 1 Entities: 
[]
Sentence 2 Entities: 
[('EU', 'ORG'), ('German', 'MISC'), ('British', 'MISC')]
Sentence 3 Entities: 
[('Peter Blackburn', 'PER')]
Sentence 4 Entities: 
[('BRUSSELS', 'LOC')]
Sentence 5 Entities: 
[('European Commission', 'ORG'), ('German', 'MISC'), ('British', 'MISC')]


**Rule-Based NER using spaCy**

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = " ".join([word for word, tag in train_data[20]])
doc = nlp(text)

print("Rule-Based Ner using spacy: ")
for ent in doc.ents:
  print(ent.text, ent.label_)

Rule-Based Ner using spacy: 
4,275 tonnes QUANTITY
British NORP
some 10 percent PERCENT


**Model-Based NER using BERT**

In [None]:
!pip install transformers



In [None]:
from transformers import pipeline

ner_pipe = pipeline("ner", model = "dslim/bert-base-NER", aggregation_strategy = "simple")

text = " ".join([word for word, tag in test_data[1]])
entities = ner_pipe(text)

print("Model_Based NER using BERT: ")
for ent in entities:
  print(ent['word'], "→", ent['entity_group'], f"({ent['score']:.2f})")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Model_Based NER using BERT: 
J → MISC (0.48)
##AP → LOC (0.29)
L → PER (0.43)
##UC → LOC (0.42)
CH → ORG (0.64)
##IN → LOC (0.51)
##A → ORG (0.59)


In [None]:
from termcolor import colored

def highlight_entities(text, ner_results):
  for ent in ner_results:
    word = ent['word']
    label = ent['entity_group']
    color = "cyan" if label == "PER" else "green" if label == "ORG" else "magenta"
    text = text.replace(word, colored(word, color) + f"[{label}]")
  return text

print(highlight_entities(text, entities))

SOCCER - J[MISC]APAN GET L[PER]UCKY WIN , CH[ORG]INA IN SURPRISE DEFEAT .


# Bonus Tasks

In [None]:
from spacy import displacy

displacy.render(doc, style = "ent", jupyter = True)

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_trf  = spacy.load("en_core_web_trf")

text = " ".join([word for word, tag in test_data[15]])

sm = nlp_sm(text)
trf = nlp_trf(text)

print("Spacy sm: ")
displacy.render(sm, style = "ent", jupyter = True)

print("Spacy trf: ")
displacy.render(trf, style = "ent", jupyter = True)

Spacy sm: 


Spacy trf: 


#Comparing both models with multiple random sentences

In [None]:
import random

rdm_txt = random.sample(train_data, 3)

texts = [" ".join([word for word, tag in sentence]) for sentence in rdm_txt]

print(texts)

['5. Mika Hakkinen ( Finland ) 23', 'IHC also forecast post tax earnings rising 21 percent for the full year .', 'Gente said Ducruet , a keen racing driver , met Houteman during a race in Belgium and photographers had been on their trail ever since .']


In [None]:
for i, text in enumerate(texts):
  print(f"Sentence {i + 1}:")

  sm = nlp_sm(text)
  trf  = nlp_trf(text)

  print("Spacy sm: ")
  displacy.render(sm, style = "ent", jupyter = True)

  print("Spacy trf: ")
  displacy.render(trf, style = "ent", jupyter = True)

Sentence 1:
Spacy sm: 


Spacy trf: 


Sentence 2:
Spacy sm: 


Spacy trf: 


Sentence 3:
Spacy sm: 


Spacy trf: 
