# Imports

In [122]:
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [179]:
import spacy
from spacy import displacy
from spacy import tokenizer
import tqdm

# Data Cleaning

In [134]:
def read_file(path):
    """Return the full contents of the text file at ``path`` as one string.

    Args:
        path: Location of the file to read (opened in text mode, "r").

    Returns:
        Everything in the file, as returned by a single ``read()`` call.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which the
    # manual open()/close() pair did not guarantee.
    with open(path, "r") as text_file:
        return text_file.read()

def write_file(path, data):
    """Write ``data`` to the text file at ``path``, replacing existing content.

    Args:
        path: Destination file (opened in text mode "w", so it is truncated
            or created).
        data: String to write.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager flushes and closes the handle even if write() fails.
    with open(path, "w") as text_file:
        text_file.write(data)

def clean_data(data):
    """Strip markup from ``data`` and return its text as one line.

    The input is parsed with BeautifulSoup's "html.parser"; every <style> and
    <script> element is removed from the tree, and the remaining
    whitespace-stripped strings are joined with single spaces.
    """
    parsed = BeautifulSoup(data, "html.parser")

    # Drop elements whose content is styling or code rather than text.
    non_text_tags = parsed(['style', 'script'])
    for tag in non_text_tags:
        tag.decompose()

    text_pieces = list(parsed.stripped_strings)
    return ' '.join(text_pieces)

def get_sentences(data):
  """Split the text of every <p> element in ``data`` into sentences.

  ``data`` is parsed with BeautifulSoup's 'html.parser'. Each paragraph's text
  is passed to ``nltk.sent_tokenize``, and newlines inside a sentence are
  replaced by spaces. Sentences are returned in one flat list, in document
  order.
  """
  parsed = BeautifulSoup(data, 'html.parser')

  return [
      sentence.replace('\n', ' ')
      for paragraph in parsed.find_all("p")
      for sentence in nltk.sent_tokenize(paragraph.get_text())
  ]

def get_labels(data):
  """Collect the annotated entity strings from ``data`` and print their counts.

  ``data`` is parsed with BeautifulSoup's 'html.parser'. Every element whose
  ``type`` attribute equals 'LOCATION', 'ORGANIZATION' or 'PERSON' contributes
  its text (newlines replaced by spaces) to the matching list.

  Args:
      data: Markup string containing the annotated entities.

  Returns:
      A dict with keys 'locations', 'organizations' and 'persons', each
      mapping to the list of entity strings found, duplicates included.

  Side effects:
      Prints the total and the unique count of each entity list.
  """
  htmlParse = BeautifulSoup(data, 'html.parser')

  # (value of the `type` attribute, key in the returned dict)
  type_to_key = (
      ('LOCATION', 'locations'),
      ('ORGANIZATION', 'organizations'),
      ('PERSON', 'persons'),
  )

  # find_all is the current spelling of the legacy findAll alias, and it is
  # what get_sentences already uses.
  labels = {
      key: [
          tag.text.replace('\n', ' ')
          for tag in htmlParse.find_all(attrs={'type': entity_type})
      ]
      for entity_type, key in type_to_key
  }

  print("Entities distribution:")

  print("Locations: {}\nOrganizations: {}\nPersons: {}".format(len(labels['locations']), len(labels['organizations']), len(labels['persons'])))

  print("\nUnique words entities distribution:")

  print("Locations: {}\nOrganizations: {}\nPersons: {}".format(len(set(labels['locations'])), len(set(labels['organizations'])), len(set(labels['persons']))))

  return labels

In [65]:
# Scratch check of the NLTK pipeline on one sentence: tokenize, POS-tag, then
# chunk named entities (binary=False keeps the per-type labels).
# NOTE(review): `sentences` is only assigned further down
# (sentences = get_sentences(data)), so this cell fails on a fresh
# Restart & Run All.
words = nltk.word_tokenize(sentences[1])
tagged = nltk.pos_tag(words)
namedEnt = nltk.ne_chunk(tagged, binary=False)

In [None]:
# Display the tokens produced for the sample sentence.
words

In [None]:
# Print "<label> <text>" for each labelled chunk; elements of namedEnt without
# a `label` attribute are skipped.
for chunk in namedEnt:
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

In [69]:
# Input file with the annotated sample and output path for the cleaned text.
# NOTE(review): hard-coded absolute paths under /content (presumably a Colab
# session); adjust them when running elsewhere.
path_read = r'/content/news_sample_ner.txt'
path_write = r'/content/news_sample_ner_cleaned.txt'

In [135]:
# Load the raw annotated file as one string.
data = read_file(path_read)

In [None]:
# Strip the markup, print the resulting text, and save it to path_write.
cleaned_data = clean_data(data)
print(cleaned_data)
write_file(path_write, cleaned_data)

In [137]:
# Sentence list taken from the <p> elements of the raw data; this is the input
# to both NER predictors below.
sentences = get_sentences(data)

In [262]:
# Gold-standard entity strings read from the annotations in the raw data.
labels = get_labels(data)

Entities distribution:
Locations: 167
Organizations: 173
Persons: 93

Unique words entities distribution:
Locations: 81
Organizations: 70
Persons: 64


In [146]:
# Load the spaCy pipeline named 'en_core_web_sm' for the exploratory cells below.
nlp = spacy.load('en_core_web_sm')

In [164]:
# Empty containers for the spaCy predictions, using the same keys as `labels`.
preds_spacy = {'locations':[], 'organizations':[], 'persons':[]} 

In [None]:
# Exploratory pass: run spaCy over every sentence and sort the entity texts
# into locations / organizations / persons.
# The containers are reset here so that re-running the cell does not append a
# second copy of every prediction.
preds_spacy = {'locations':[], 'organizations':[], 'persons':[]}

for sentence in sentences:
  doc = nlp(sentence)
  ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
  for ent in ents:
    # ent is (text, start_char, end_char, label); other labels are ignored.
    if ent[-1] in ('LOC', 'GPE'):
      preds_spacy['locations'].append(ent[0])
    elif ent[-1] == 'ORG':
      preds_spacy['organizations'].append(ent[0])
    elif ent[-1] == 'PERSON':
      preds_spacy['persons'].append(ent[0])

In [None]:
# Inspect the person names predicted by spaCy (displays the whole list).
preds_spacy['persons']

In [171]:
# Run the spaCy pipeline on a single sentence for closer inspection.
doc = nlp(sentences[2])

In [172]:
# One (text, start_char, end_char, label) tuple per entity found in `doc`.
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]

In [173]:
# Display the entities extracted from the sample sentence.
ents

[('Branson', 0, 7, 'GPE'),
 ('Lindstrand', 12, 22, 'GPE'),
 ('first', 74, 79, 'ORDINAL'),
 ('the Atlantic Ocean', 121, 139, 'LOC'),
 ('1987', 144, 148, 'DATE'),
 ('Pacific', 158, 165, 'LOC'),
 ('1991', 170, 174, 'DATE')]

In [163]:
# Label of the first entity in `ents`.
# NOTE(review): the saved output comes from an earlier execution (count 163,
# before `ents` was rebuilt in cell 172), so it may not match the list shown above.
ents[0][-1]

'NORP'

In [None]:
# Inspect the gold-standard location strings (displays the whole list).
labels['locations']

# Evaluation

In [268]:
def calc_precision(preds, labels, entity):
  """Share of predicted strings for ``entity`` that also occur in the labels.

  Matching is by exact string membership: each item of ``preds[entity]`` counts
  as a hit if an equal string exists anywhere in ``labels[entity]``. Repeated
  predictions are counted once per occurrence.

  Args:
      preds: Dict mapping an entity key to the list of predicted strings.
      labels: Dict mapping an entity key to the list of gold strings.
      entity: Key to evaluate (e.g. 'persons').

  Returns:
      hits / len(preds[entity]) as a float, or 0.0 when there are no
      predictions (previously this raised ZeroDivisionError).

  Raises:
      KeyError: If ``entity`` is missing from ``preds`` or ``labels``.
  """
  predicted = preds[entity]
  gold = labels[entity]
  if not predicted:
    return 0.0

  # A set makes each membership test O(1) instead of scanning the label list.
  gold_lookup = set(gold)
  hits = sum(1 for pred in predicted if pred in gold_lookup)
  return hits / len(predicted)

In [269]:
def calc_recall(preds, labels, entity):
  """Share of gold strings for ``entity`` that also occur in the predictions.

  Matching is by exact string membership: each item of ``labels[entity]``
  counts as a hit if an equal string exists anywhere in ``preds[entity]``.
  Repeated labels are counted once per occurrence.

  Args:
      preds: Dict mapping an entity key to the list of predicted strings.
      labels: Dict mapping an entity key to the list of gold strings.
      entity: Key to evaluate (e.g. 'persons').

  Returns:
      hits / len(labels[entity]) as a float, or 0.0 when there are no labels
      (previously this raised ZeroDivisionError).

  Raises:
      KeyError: If ``entity`` is missing from ``preds`` or ``labels``.
  """
  gold = labels[entity]
  predicted = preds[entity]
  if not gold:
    return 0.0

  # A set makes each membership test O(1) instead of scanning the prediction list.
  predicted_lookup = set(predicted)
  hits = sum(1 for label in gold if label in predicted_lookup)
  return hits / len(gold)

In [231]:
def calc_f1_score(precision, recall):
  """Harmonic mean of ``precision`` and ``recall``.

  Args:
      precision: Precision value.
      recall: Recall value.

  Returns:
      2 * precision * recall / (precision + recall), or 0.0 when both inputs
      sum to zero (previously this raised ZeroDivisionError).
  """
  total = precision + recall
  if total == 0:
    return 0.0
  return (2 * precision * recall) / total

In [232]:
def calc_metrics(preds, labels):
  """Print precision, recall and F1-score for persons, locations and organizations.

  All nine scores are computed before anything is printed. The report has one
  heading plus one score line per entity type, with the sections separated by
  a printed '\\n'. Returns None.
  """
  # (dictionary key, heading printed above the scores), in report order.
  report_order = (
      ('persons', 'Persons:'),
      ('locations', 'Locations:'),
      ('organizations', 'Organizations:'),
  )
  score_line = 'Precision: {} \t Recall: {} \t F1-score: {}'

  precision_by_key = {key: calc_precision(preds, labels, key) for key, _ in report_order}
  recall_by_key = {key: calc_recall(preds, labels, key) for key, _ in report_order}
  f1_by_key = {
      key: calc_f1_score(precision_by_key[key], recall_by_key[key])
      for key, _ in report_order
  }

  for position, (key, heading) in enumerate(report_order):
    if position > 0:
      # Separator between consecutive sections.
      print('\n')
    print(heading)
    print(score_line.format(precision_by_key[key], recall_by_key[key], f1_by_key[key]))

# NER with statistical models

In [233]:
def predict_spacy(sentences):
  """Run spaCy NER over ``sentences`` and group the entity texts by type.

  The 'en_core_web_sm' pipeline is loaded on every call. Entities labelled
  'LOC' or 'GPE' go to 'locations', 'ORG' to 'organizations' and 'PERSON' to
  'persons'; every other label is ignored. Total and unique counts per group
  are printed before the dict of lists is returned.
  """
  nlp = spacy.load('en_core_web_sm')

  group_for_label = {
      'LOC': 'locations',
      'GPE': 'locations',
      'ORG': 'organizations',
      'PERSON': 'persons',
  }
  preds_spacy = {'locations':[], 'organizations':[], 'persons':[]}

  for sentence in sentences:
    for span in nlp(sentence).ents:
      group = group_for_label.get(span.label_)
      if group is not None:
        preds_spacy[group].append(span.text)

  count_line = "Locations: {}\nOrganizations: {}\nPersons: {}"
  found_locations = preds_spacy['locations']
  found_organizations = preds_spacy['organizations']
  found_persons = preds_spacy['persons']

  print("Entities distribution:")

  print(count_line.format(len(found_locations), len(found_organizations), len(found_persons)))

  print("\nUnique words entities distribution:")

  print(count_line.format(len(set(found_locations)), len(set(found_organizations)), len(set(found_persons))))

  return preds_spacy

In [263]:
def predict_nltk(sentences):
  """Run NLTK's NE chunker over ``sentences`` and group the entity texts by type.

  Each sentence is word-tokenized, POS-tagged and chunked with
  ``nltk.ne_chunk(..., binary=False)``. Chunks labelled 'LOCATION' or 'GPE' go
  to 'locations', 'ORGANIZATION' to 'organizations' and 'PERSON' to 'persons';
  the entity text is the chunk's tokens joined by spaces. Total and unique
  counts per group are printed before the dict of lists is returned.
  """
  group_for_label = {
      'LOCATION': 'locations',
      'GPE': 'locations',
      'ORGANIZATION': 'organizations',
      'PERSON': 'persons',
  }
  preds_nltk = {'locations':[], 'organizations':[], 'persons':[]}

  for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    tree = nltk.ne_chunk(nltk.pos_tag(tokens), binary=False)

    for node in tree:
      # Elements without a `label` attribute are not entity chunks.
      if not hasattr(node, 'label'):
        continue
      group = group_for_label.get(node.label())
      if group is None:
        continue
      preds_nltk[group].append(' '.join(leaf[0] for leaf in node))

  count_line = "Locations: {}\nOrganizations: {}\nPersons: {}"
  found_locations = preds_nltk['locations']
  found_organizations = preds_nltk['organizations']
  found_persons = preds_nltk['persons']

  print("Entities distribution:")

  print(count_line.format(len(found_locations), len(found_organizations), len(found_persons)))

  print("\nUnique words entities distribution:")

  print(count_line.format(len(set(found_locations)), len(set(found_organizations)), len(set(found_persons))))

  return preds_nltk

In [234]:
preds_spacy = predict_spacy(sentences)

Entities distribution:
Locations: 1181
Organizations: 1466
Persons: 580

Unique words entities distribution:
Locations: 69
Organizations: 91
Persons: 52


In [235]:
calc_metrics(preds_spacy, labels)

Persons:
Precision: 0.7724137931034483 	 Recall: 0.5698924731182796 	 F1-score: 0.6558753660018783


Locations:
Precision: 0.8196443691786621 	 Recall: 0.8023952095808383 	 F1-score: 0.8109280735206776


Organizations:
Precision: 0.5381991814461119 	 Recall: 0.6763005780346821 	 F1-score: 0.5993980890789206


In [271]:
preds_nltk = predict_nltk(sentences)

Entities distribution:
Locations: 1726
Organizations: 1079
Persons: 1015

Unique words entities distribution:
Locations: 93
Organizations: 73
Persons: 79


In [272]:
calc_metrics(preds_nltk, labels)

Persons:
Precision: 0.6216748768472906 	 Recall: 0.7096774193548387 	 F1-score: 0.6627676589243513


Locations:
Precision: 0.492468134414832 	 Recall: 0.7844311377245509 	 F1-score: 0.6050709674614733


Organizations:
Precision: 0.46617238183503246 	 Recall: 0.5606936416184971 	 F1-score: 0.5090827516407382
