Named Entity Recognition with NLTK and SpaCy

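Text classification is one task within Natural Language Processing (NLP). NLP can be described simply as teaching an algorithm to read and analyze human (natural) language the way a person would, only much faster, more accurately, and on very large amounts of data. Named entity recognition (NER), the subject of this notebook, is another NLP task: it locates the names of people, organizations, places, dates, amounts and similar items in text and labels each one with its type.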

NLTK

In [1]:
#apply word tokenization and part-of-speech tagging to the sentence
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# Example sentence used throughout the NLTK section.
ex = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)

In [3]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Args:
        sent: The raw sentence text.

    Returns:
        Whatever ``nltk.pos_tag`` produces for the tokenized sentence
        (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)


# The helper is defined before it is used: calling it from an earlier cell
# raised "NameError: name 'preprocess' is not defined".
sent = preprocess(ex)
sent

In [None]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT),
# then any number of adjectives (JJ), then a noun (NN).
pattern = "NP: {<DT>?<JJ>*<NN>}"

In [None]:
# Build a chunk parser from the grammar and apply it to the POS-tagged
# sentence. The parser needs the tagged tokens held in `sent`, not the
# string literal 'sent'.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

In [None]:
from pprint import pprint

from nltk.chunk import conlltags2tree, tree2conlltags

# Flatten the chunk tree into CoNLL-style rows, one per token, and show them.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

In [None]:
# Run NLTK's named-entity chunker on the POS-tagged tokens of the example.
# `ne_chunk` was never imported on its own, so it is reached through the
# already-imported `nltk` package.
# NOTE(review): the chunker presumably needs its NLTK data packages
# downloaded beforehand - confirm in the target environment.
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

SpaCy

In [None]:
from collections import Counter

import spacy
from spacy import displacy
import en_core_web_sm

# Load the packaged English model once; `nlp` is reused by the cells below.
nlp = en_core_web_sm.load()

In [None]:
# Run the spaCy pipeline on the example sentence and list each entity span
# with its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)
pprint([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Per-token view: the token, its IOB entity tag, and its entity type.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

In [None]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    The response body is parsed with the ``html5lib`` parser, every
    ``script``, ``style`` and ``aside`` element is removed, and each run of
    newlines/tabs in the remaining text is replaced by a single space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The extracted page text.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently analysing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article text first: with this assignment commented out, `ny_bb`
# was undefined and the nlp() call below raised NameError.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

In [None]:
# How many entities of each label the article contains.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

In [None]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

In [None]:
# Materialize the article's sentences so one can be picked by index.
sentences = list(article.sents)
print(sentences[20])

In [None]:
# Re-parse the selected sentence and render its entities inline in the notebook.
displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)

In [None]:
# Surface form, part of speech and lemma of every token in the selected
# sentence, skipping stop words and punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

In [None]:
# Map each entity's text in the selected sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

In [None]:
# Token-level IOB tag and entity type for the selected sentence.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])