## Named Entity Recognition (NER)
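Named Entity Recognition (NER) is a core task within the broader discipline of Information Extraction (IE). To obtain structured information from unstructured text, we first need to identify the named entities it mentions. A named entity is anything that can be referred to by a proper name: people, places, organizations, vehicles, facilities, and so on. As the outputs below show, the model used here also labels expressions that are not names in the strict sense, such as dates (`DATE`) and monetary amounts (`MONEY`).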

### Installation Steps:
* pip install -U pip setuptools wheel
* pip install -U spacy==2.3.0
* pip install -U beautifulsoup4 requests html5lib
* python -m spacy download en_core_web_sm
* python -m spacy validate

In [15]:
from collections import Counter
from pprint import pprint

import en_core_web_sm
import spacy
from spacy import displacy
# Load the `en_core_web_sm` pipeline package that the installation steps download.
nlp = en_core_web_sm.load()
# Alternative: load the same package by name through spaCy.
#nlp = spacy.load('en_core_web_sm')

In [16]:
# Run the pipeline on one sample sentence and pretty-print every detected
# entity span as a (text, label) pair.
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [17]:
# Token-level view of `doc`: each token with its IOB tag and entity type.
# In the output, 'B' marks the first token of an entity, 'I' a following token
# of the same entity, and 'O' a token outside any entity (empty entity type).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [18]:
from bs4 import BeautifulSoup
import requests
import re

In [19]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    The page is parsed with BeautifulSoup's ``html5lib`` parser, every
    ``<script>``, ``<style>`` and ``<aside>`` element is removed, and each run
    of newlines/tabs in the remaining text is replaced by one space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled server.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status code.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of analysing the text of an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose contents should not end up in the extracted text.
    for element in soup(["script", "style", "aside"]):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [20]:

# Earlier example URL, kept for reference (disabled):
#ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

# Fetch the page text, run the spaCy pipeline over it, and display how many
# entity spans were found.
# NOTE(review): `ny_bb` now holds a kstp.com page, not a New York Times one.
ny_bb = url_to_string('https://kstp.com/news/after-chauvins-conviction-for-floyd-murder-doj-weighs-charging-him-for-2017-incident-involving-black-teen/6084716/')
article = nlp(ny_bb)
len(article.ents)

197

In [21]:
# Tally how many entity spans in the article carry each label.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'PERSON': 61,
         'DATE': 27,
         'ORG': 52,
         'GPE': 19,
         'PRODUCT': 8,
         'CARDINAL': 16,
         'TIME': 7,
         'NORP': 3,
         'MONEY': 1,
         'QUANTITY': 1,
         'LOC': 1,
         'WORK_OF_ART': 1})

In [22]:
# The three entity strings that occur most often in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Chauvin', 37), ('Floyd', 11), ('2017', 10)]

In [24]:
# Materialise the sentence spans so they can be indexed, then show one of them.
sentences = list(article.sents)
print(sentences[5])

Safe MN WEATHER Interactive Radar Forecast Severe Weather Guide Temperatures Outdoors School Alert TRAFFIC Traffic Map Top Stories INVESTIGATIVE Top Stories Send Us Tips SPORTS Top Stories Vikings Wild Twins


In [37]:
# Re-parse the text of sentence 20 as its own document and render it in the
# notebook with displaCy's 'ent' style.
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')



In [38]:

# Same sentence text, rendered with displaCy's 'dep' style; the 'distance'
# option is set to 120.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [39]:
# Surface form, part-of-speech tag and lemma of every token in sentence 20
# (re-parsed on its own) that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]

[('Minnesota', 'PROPN', 'Minnesota'),
 ('Coronavirus', 'PROPN', 'Coronavirus'),
 ('Stories', 'PROPN', 'Stories'),
 ('School', 'PROPN', 'School'),
 ('Planning', 'PROPN', 'Planning'),
 ('Data', 'PROPN', 'Data'),
 ('School', 'PROPN', 'School'),
 ('Alert', 'PROPN', 'Alert'),
 ('Stay', 'VERB', 'stay'),
 ('Safe', 'ADJ', 'safe'),
 ('MN', 'PROPN', 'MN'),
 ('Weather', 'PROPN', 'Weather'),
 ('Interactive', 'PROPN', 'Interactive'),
 ('Radar', 'PROPN', 'Radar'),
 ('Forecast', 'PROPN', 'Forecast'),
 ('Severe', 'PROPN', 'Severe'),
 ('Weather', 'PROPN', 'Weather'),
 ('Guide', 'PROPN', 'Guide'),
 ('Temperatures', 'PROPN', 'Temperatures'),
 ('Outdoors', 'PROPN', 'Outdoors'),
 ('School', 'PROPN', 'School'),
 ('Alert', 'PROPN', 'Alert'),
 ('Traffic', 'PROPN', 'Traffic'),
 ('Traffic', 'PROPN', 'Traffic'),
 ('Map', 'PROPN', 'Map'),
 ('Stories', 'PROPN', 'Stories'),
 ('Investigative', 'PROPN', 'Investigative'),
 ('Stories', 'NOUN', 'story'),
 ('Send', 'VERB', 'send'),
 ('Tips', 'PROPN', 'Tips'),
 ('Sports',

In [40]:
# Map entity text -> label for sentence 20 after re-parsing its text as a
# standalone document (so the result need not match the entities in `article`).
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{}

In [41]:
# IOB tag and entity type of each token in sentence 20, taken from the span
# inside `article` (no re-parsing).
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(So, 'O', ''), (Minnesota, 'O', ''), (Coronavirus, 'O', ''), (Top, 'B', 'ORG'), (Stories, 'I', 'ORG'), (School, 'I', 'ORG'), (Planning, 'I', 'ORG'), (and, 'I', 'ORG'), (Data, 'I', 'ORG'), (School, 'I', 'ORG'), (Alert, 'I', 'ORG'), (Stay, 'I', 'ORG'), (Safe, 'I', 'ORG'), (MN, 'I', 'ORG'), (Weather, 'I', 'ORG'), (Interactive, 'I', 'ORG'), (Radar, 'I', 'ORG'), (Forecast, 'I', 'ORG'), (Severe, 'I', 'ORG'), (Weather, 'I', 'ORG'), (Guide, 'I', 'ORG'), (Temperatures, 'I', 'ORG'), (Outdoors, 'I', 'ORG'), (School, 'I', 'ORG'), (Alert, 'O', ''), (Traffic, 'O', ''), (Traffic, 'O', ''), (Map, 'O', ''), (Top, 'O', ''), (Stories, 'O', ''), (Investigative, 'O', ''), (Top, 'O', ''), (Stories, 'O', ''), (Send, 'O', ''), (Us, 'O', ''), (Tips, 'O', ''), (Sports, 'O', ''), (Top, 'O', ''), (Stories, 'O', ''), (Vikings, 'O', ''), (Wild, 'O', ''), (Twins, 'O', '')]


In [42]:
# Render the whole parsed article in the notebook with displaCy's 'ent' style.
displacy.render(article, jupyter=True, style='ent')