In [1]:
# RUN THIS CELL to perform standard imports.
# Standard library first, then third-party packages, one import per line.
import os

import spacy
from spacy import displacy

# Load the 'en_core_web_sm' pipeline once; later cells reuse `nlp` to parse text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Move into the folder that holds the course text files and list its contents.
# The location can be overridden with the NLP_TEXT_DIR environment variable so
# the notebook is not tied to one machine; the original path stays the default.
TEXT_DIR = os.environ.get(
    "NLP_TEXT_DIR",
    "/Users/Bryan/Documents/Programming/PythonNLP/UPDATED_NLP_COURSE/TextFiles",
)
os.chdir(TEXT_DIR)
os.listdir()


['peterrabbit.txt',
 'elon_tweets.txt',
 'amazonreviews.tsv',
 'reuters.csv',
 'owlcreek1.txt',
 'reaganomics1.txt',
 'elon_tweets2.txt',
 'owlcreek.txt',
 'sms_readme.txt',
 'reaganomics.txt',
 'moviereviews.tsv',
 'smsspamcollection.tsv',
 'huckfinn.txt',
 'moviereviews2.tsv']

In [3]:
# Read peterrabbit.txt (UTF-8) from the current directory and parse it into `doc`.

with open('peterrabbit.txt', encoding='utf8') as story_file:
    story_text = story_file.read()

# Parsing happens after the file is closed; only the text is needed from here on.
doc = nlp(story_text)

In [4]:
# Walk the third sentence token by token and print four columns: the token
# text, its coarse POS tag, its fine-grained tag, and spaCy's explanation
# of the fine-grained tag.

for token in list(doc.sents)[2]:
    # spacy.explain may return None for a tag it has no description for;
    # that value is printed as-is.
    tag_meaning = spacy.explain(token.tag_)
    print(f'{token.text:10} {token.pos_:8} {token.tag_:6} {tag_meaning}')

They       PRON     PRP    pronoun, personal
lived      VERB     VBD    verb, past tense
with       ADP      IN     conjunction, subordinating or preposition
their      ADJ      PRP$   pronoun, possessive
Mother     PROPN    NNP    noun, proper singular
in         ADP      IN     conjunction, subordinating or preposition
a          DET      DT     determiner
sand       NOUN     NN     noun, singular or mass
-          PUNCT    HYPH   punctuation mark, hyphen
bank       NOUN     NN     noun, singular or mass
,          PUNCT    ,      punctuation mark, comma
underneath ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
root       NOUN     NN     noun, singular or mass
of         ADP      IN     conjunction, subordinating or preposition
a          DET      DT     determiner

          SPACE           None
very       ADV      RB     adverb
big        ADJ      JJ     adjective
fir        NOUN     NN     noun, singular or mass
-          PUNCT   

In [33]:
# Build a frequency table of coarse POS tags (POS id -> number of tokens)
# and print it ordered by id, resolving each id back to its name.

POS_counts = doc.count_by(spacy.attrs.POS)

for pos_id in sorted(POS_counts):
    pos_name = doc.vocab[pos_id].text
    print(f"ID {pos_id}. POS {pos_name:5} {POS_counts[pos_id]}")

ID 83. POS ADJ   83
ID 84. POS ADP   127
ID 85. POS ADV   75
ID 88. POS CCONJ 61
ID 89. POS DET   90
ID 91. POS NOUN  176
ID 92. POS NUM   8
ID 93. POS PART  36
ID 94. POS PRON  72
ID 95. POS PROPN 75
ID 96. POS PUNCT 174
ID 99. POS VERB  182
ID 102. POS SPACE 99


In [31]:
# Total number of tokens in the parsed document.
len(doc)

1258

In [17]:
# What percentage of the tokens are nouns?
# The NOUN id is looked up by name instead of hard-coding 91, because the
# numeric symbol ids may differ between spaCy versions.

noun_id = doc.vocab.strings['NOUN']
noun_count = POS_counts.get(noun_id, 0)  # 0 when the text contains no nouns

# Guard against an empty document so the division cannot fail.
percent_nouns = 100 * (noun_count / len(doc)) if len(doc) else 0.0

# '.4' keeps four significant digits (e.g. 13.99).
print(f"There are {percent_nouns:.4}% nouns in the text")

There are 13.99% nouns in the text


In [20]:
# Render the dependency parse of the third sentence inline in the notebook.

third_sent = list(doc.sents)[2]

displacy.render(
    third_sent,
    style='dep',
    jupyter=True,
    options={'distance': 75},
)

In [34]:
# Show every named-entity span found in the document.
doc.ents

(The Tale of Peter Rabbit, Beatrix Potter, 1902, Rabbits, , 
 
           Flopsy, 
        , 
    , Cotton, , Peter, , Rabbit, one morning, , McGregor, , 
 McGregor, Now, 
 , Rabbit, , five, , Flopsy, Mopsy, Cottontail, , Peter, McGregor, , First, French, , 
 McGregor, McGregor, , Peter, , Peter, , one, , four, , , , Peter, , , McGregor, , Peter, Peter, , , McGregor, Peter, , , Peter, McGregor, , Peter, , three, McGregor, , Peter, Peter, , , , , , Peter, , , Peter, , 
 McGregor, , , Peter, , , little Benjamin Bunny, , , Peter, , , first, McGregor, , Peter, Peter, , , McGregor, Peter, , , McGregor, , Peter, , , , , second, Peter, , Peter, the evening, , Peter!
 
 ', One, Flopsy, Mopsy, Cotton, )

In [24]:
# Show the first two named entities from Beatrix Potter's "The Tale of Peter Rabbit":
# entity text, its label, and spaCy's description of that label.

for ent in doc.ents[:2]:
    label_meaning = str(spacy.explain(ent.label_))
    print(ent.text, " --", ent.label_, "--", label_meaning)

The Tale of Peter Rabbit  -- WORK_OF_ART -- Titles of books, songs, etc.
Beatrix Potter  -- PERSON -- People, including fictional


In [26]:
# How many sentences are contained in The Tale of Peter Rabbit?
# Count the sentence spans directly instead of materialising them in a list.

sum(1 for _ in doc.sents)

56

In [28]:
# CHALLENGE: How many sentences contain named entities?
# Step 1: parse the text of every sentence into its own Doc.

all_sents = [nlp(sentence.text) for sentence in doc.sents]

# Step 2: keep only the sentence Docs that have at least one entity.
# (The loop variable is named sent_doc so it does not shadow the notebook-level `doc`.)
list_of_named = [sent_doc for sent_doc in all_sents if sent_doc.ents]

len(list_of_named)
    

49

In [35]:
# CHALLENGE: Display the named entity visualization for all_sents[0] from the previous problem

# get the sentence first
to_display = all_sents[0]

# display the viz
# NOTE(review): 'distance', 'compact', 'color', 'bg' and 'font' are settings of
# displaCy's dependency ('dep') visualizer; the entity ('ent') visualizer
# documents only 'ents', 'colors' and 'template', so these keys are not expected
# to change the rendering below — confirm against the installed spaCy version.
options = {'distance': 90,
           'compact': True,  # real boolean instead of the string 'True'
           'color': 'yellow',
           'bg': '#09a3d5',
           'font': 'Times'}
displacy.render(to_display, style='ent', jupyter=True, options=options)