# spaCy Exploration
This notebook is derived from the spaCy part of [Named Entity Recognition with NLTK and SpaCy](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da).

In [21]:
import spacy
from spacy import displacy
from collections import Counter

In [22]:
# Pretty printer used to display the token tables in the cells below
import pprint
pp = pprint.PrettyPrinter()

# Load the default (small) English model as the shared `nlp` pipeline
import en_core_web_sm
nlp = en_core_web_sm.load()

## 1. Analyze short sentences
In these samples, I'm trying to understand how spaCy's `en_core_web_sm` model treats the word "Kentucky" differently in the following two sentences.

In [39]:
# First sample: "Kentucky" appears as part of the longer name "Kentucky Fried Chicken"
simple_doc_1 = nlp("I go to a place called Kentucky Fried Chicken")

In [43]:
# One row per token: token, IOB tag, entity type, dependency label, lemma, POS tag
pp.pprint([
    (tok, tok.ent_iob_, tok.ent_type_, tok.dep_, tok.lemma_, tok.pos_)
    for tok in simple_doc_1
])
# Notes: "Kentucky Fried Chicken" is detected as a single chunk (B-I-I tags) and labelled ORG (companies, agencies, institutions)

[(I, 'O', '', 'nsubj', '-PRON-', 'PRON'),
 (go, 'O', '', 'ROOT', 'go', 'VERB'),
 (to, 'O', '', 'prep', 'to', 'ADP'),
 (a, 'O', '', 'det', 'a', 'DET'),
 (place, 'O', '', 'pobj', 'place', 'NOUN'),
 (called, 'O', '', 'acl', 'call', 'VERB'),
 (Kentucky, 'B', 'ORG', 'compound', 'kentucky', 'PROPN'),
 (Fried, 'I', 'ORG', 'compound', 'fried', 'PROPN'),
 (Chicken, 'I', 'ORG', 'oprd', 'chicken', 'PROPN')]


In [41]:
# Second sample: "Kentucky" on its own, printed with the same per-token columns
simple_doc_2 = nlp("I go to a place called Kentucky")
pp.pprint([
    (tok, tok.ent_iob_, tok.ent_type_, tok.dep_, tok.lemma_, tok.pos_)
    for tok in simple_doc_2
])
# Notes: "Kentucky" is detected as a one-token chunk (B tag) and labelled GPE (geopolitical entity, i.e. countries, cities, states)

[(I, 'O', '', 'nsubj', '-PRON-', 'PRON'),
 (go, 'O', '', 'ROOT', 'go', 'VERB'),
 (to, 'O', '', 'prep', 'to', 'ADP'),
 (a, 'O', '', 'det', 'a', 'DET'),
 (place, 'O', '', 'pobj', 'place', 'NOUN'),
 (called, 'O', '', 'acl', 'call', 'VERB'),
 (Kentucky, 'B', 'GPE', 'oprd', 'kentucky', 'PROPN')]


## 2. Analyze a long sentence.

In [42]:
# Run the pipeline over one longer news sentence; the cells below inspect this `doc`
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

In [24]:
# NER result: one (text, label) pair per entity span found in the sentence
pp.pprint([
    (ent.text, ent.label_)
    for ent in doc.ents
])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [25]:
# Chunking result: IOB tag and entity type for every token
pp.pprint([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in doc
])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [26]:
# Full per-token table: chunking (IOB), NER type, dependency label, lemma and POS tag
pp.pprint([
    (tok, tok.ent_iob_, tok.ent_type_, tok.dep_, tok.lemma_, tok.pos_)
    for tok in doc
])

[(European, 'B', 'NORP', 'amod', 'european', 'ADJ'),
 (authorities, 'O', '', 'nsubj', 'authority', 'NOUN'),
 (fined, 'O', '', 'ROOT', 'fin', 'VERB'),
 (Google, 'B', 'ORG', 'dative', 'google', 'PROPN'),
 (a, 'O', '', 'det', 'a', 'DET'),
 (record, 'O', '', 'dobj', 'record', 'NOUN'),
 ($, 'B', 'MONEY', 'quantmod', '$', 'SYM'),
 (5.1, 'I', 'MONEY', 'compound', '5.1', 'NUM'),
 (billion, 'I', 'MONEY', 'nummod', 'billion', 'NUM'),
 (on, 'O', '', 'prep', 'on', 'ADP'),
 (Wednesday, 'B', 'DATE', 'pobj', 'wednesday', 'PROPN'),
 (for, 'O', '', 'prep', 'for', 'ADP'),
 (abusing, 'O', '', 'pcomp', 'abuse', 'VERB'),
 (its, 'O', '', 'poss', '-PRON-', 'ADJ'),
 (power, 'O', '', 'dobj', 'power', 'NOUN'),
 (in, 'O', '', 'prep', 'in', 'ADP'),
 (the, 'O', '', 'det', 'the', 'DET'),
 (mobile, 'O', '', 'amod', 'mobile', 'ADJ'),
 (phone, 'O', '', 'compound', 'phone', 'NOUN'),
 (market, 'O', '', 'pobj', 'market', 'NOUN'),
 (and, 'O', '', 'cc', 'and', 'CCONJ'),
 (ordered, 'O', '', 'conj', 'order', 'VERB'),
 (the, 

## 3. Analyze an article from the NYT website (and visualize the results)

In [27]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled
            connection would hang the notebook indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently running NER over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their text does not end up in the result.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the NYT article and keep its text (as returned by url_to_string) for the analysis below
nyt_strzok = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

In [28]:
# Run the pipeline over the whole article text
nyt_doc = nlp(nyt_strzok)

In [29]:
# Number of entity spans found in the article
len(nyt_doc.ents)

175

In [30]:
# Count how many entities were assigned each label
labels = [ent.label_ for ent in nyt_doc.ents]
Counter(labels)

Counter({'PERSON': 81,
         'GPE': 30,
         'ORG': 22,
         'DATE': 31,
         'CARDINAL': 4,
         'EVENT': 1,
         'NORP': 5,
         'ORDINAL': 1})

In [31]:
# The three entity strings that occur most often in the article
items = [ent.text for ent in nyt_doc.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

In [32]:
# Materialise the sentence spans so that one can be picked by position
sentences = list(nyt_doc.sents)
a_sentence = sentences[20]
print(a_sentence)

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [33]:
# Re-parse the chosen sentence on its own and render it with displaCy's 'ent' style
displacy.render(
    nlp(str(a_sentence)),
    style='ent',
    jupyter=True,
)

In [34]:
# Render the same sentence with displaCy's 'dep' style, passing a 'distance' option of 120
displacy.render(
    nlp(str(a_sentence)),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [35]:
# Token table for the selected sentence: IOB tag, entity type, dependency label, lemma
pp.pprint([
    (tok, tok.ent_iob_, tok.ent_type_, tok.dep_, tok.lemma_)
    for tok in a_sentence
])


[(Firing, 'O', '', 'csubj', 'fire'),
 (Mr., 'O', '', 'compound', 'mr.'),
 (Strzok, 'B', 'PERSON', 'dobj', 'strzok'),
 (,, 'O', '', 'punct', ','),
 (however, 'O', '', 'advmod', 'however'),
 (,, 'O', '', 'punct', ','),
 (removes, 'O', '', 'ROOT', 'remove'),
 (a, 'O', '', 'det', 'a'),
 (favorite, 'O', '', 'amod', 'favorite'),
 (target, 'O', '', 'dobj', 'target'),
 (of, 'O', '', 'prep', 'of'),
 (Mr., 'O', '', 'compound', 'mr.'),
 (Trump, 'B', 'PERSON', 'pobj', 'trump'),
 (from, 'O', '', 'prep', 'from'),
 (the, 'O', '', 'det', 'the'),
 (ranks, 'O', '', 'pobj', 'rank'),
 (of, 'O', '', 'prep', 'of'),
 (the, 'O', '', 'det', 'the'),
 (F.B.I., 'B', 'GPE', 'pobj', 'f.b.i.'),
 (and, 'O', '', 'cc', 'and'),
 (gives, 'O', '', 'conj', 'give'),
 (Mr., 'O', '', 'compound', 'mr.'),
 (Bowdich, 'B', 'PERSON', 'dative', 'bowdich'),
 (and, 'O', '', 'cc', 'and'),
 (the, 'O', '', 'det', 'the'),
 (F.B.I., 'B', 'GPE', 'amod', 'f.b.i.'),
 (director, 'O', '', 'conj', 'director'),
 (,, 'O', '', 'punct', ','),
 (Chr