# Alice in the Wonderland of NLP
We'll be using the story "Alice's Adventures in Wonderland" by Lewis Carroll (1865).
The story is in the public domain; the text file was obtained from NLTK's Gutenberg corpus.

In [11]:
import nltk
import spacy
from spacy import displacy

In [12]:
# Load the raw text of "Alice's Adventures in Wonderland" by Lewis Carroll
# from the locally installed NLTK Gutenberg corpus.
# nltk.data.find raises LookupError when the corpus is not installed
# (it can be fetched once with nltk.download('gutenberg')).
path = nltk.data.find('corpora/gutenberg/carroll-alice.txt')
# Read inside a context manager so the file handle is closed as soon as
# the text is in memory instead of lingering until garbage collection.
with open(path, 'r') as alice_file:
    alice = alice_file.read()

In [13]:
# Load the small English spaCy pipeline and run it over the full text of the book
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)
doc = nlp(alice)

In [14]:
# For every token in the 51st sentence, print the token text, the coarse POS tag,
# the fine-grained tag, and the description of the fine-grained tag.
# spacy.explain returns None for a tag it has no description for, and None cannot be
# formatted with a width spec; fall back to an empty string so the row is still
# printed rather than being replaced by an error message.
for token in list(doc.sents)[50]:
    tag_description = spacy.explain(token.tag_) or ''
    print(f"{token.text:10} {token.pos_:10} {token.tag_:10} {tag_description:10}")

But        CCONJ      CC         conjunction, coordinating
do         VERB       VB         verb, base form
cats       NOUN       NNS        noun, plural
eat        VERB       VB         verb, base form
bats       NOUN       NNS        noun, plural
,          PUNCT      ,          punctuation mark, comma
I          PRON       PRP        pronoun, personal
wonder     VERB       VBP        verb, non-3rd person singular present
?          PUNCT      .          punctuation mark, sentence closer
'          PUNCT      ''         closing quotation mark


In [15]:
# Frequency table of coarse-grained POS tags over the entire document.
# count_by maps each POS id to its number of occurrences; the id is turned back
# into its readable name through the vocabulary.
POS_counts = doc.count_by(spacy.attrs.POS)
for pos_id in sorted(POS_counts):
    pos_name = doc.vocab[pos_id].text
    print(f"{pos_id} {pos_name} {POS_counts[pos_id]} counts")

83 ADJ 2140 counts
84 ADP 2939 counts
85 ADV 2737 counts
88 CCONJ 1132 counts
89 DET 2898 counts
90 INTJ 159 counts
91 NOUN 3719 counts
92 NUM 205 counts
93 PART 859 counts
94 PRON 2833 counts
95 PROPN 1438 counts
96 PUNCT 7235 counts
99 VERB 6233 counts
100 X 1 counts
102 SPACE 2530 counts


In [16]:
# Percentage of all tokens in the document that are adjectives.
# The ADJ id is looked up by name instead of hard-coding 83: the integer ids behind
# POS symbols are internal to spaCy and not guaranteed to stay the same across versions.
# .get(..., 0) yields 0% instead of a KeyError when the text has no adjectives.
adj_id = doc.vocab.strings['ADJ']
100 * POS_counts.get(adj_id, 0) / len(doc)

5.774731501969885

In [28]:
# Draw the dependency parse of the 6th sentence inline in the notebook
sixth_sentence = list(doc.sents)[5]
displacy.render(sixth_sentence, style='dep', jupyter=True)

# Alternative outside Jupyter: serve a dependency parse to a web browser
# displacy.serve(list(doc.sents)[50],style='dep')

In [31]:
# Print the first two named entities of the document as: text, label, label description
for entity in doc.ents[:2]:
    label_description = spacy.explain(entity.label_)
    print(f"{entity.text}   {entity.label_}   {label_description}")

Alice's Adventures   ORG   Companies, agencies, institutions, etc.
Wonderland   GPE   Countries, cities, states


In [32]:
# Count the sentences detected in the document
sum(1 for sentence in doc.sents)

1705

In [33]:
# Number of sentences that contain at least one named entity.
# Every sentence is re-parsed as its own standalone Doc; nlp.pipe processes the
# texts in batches, which is faster than calling nlp() once per sentence.
list_of_sents = list(nlp.pipe(sent.text for sent in doc.sents))
# A distinct loop name keeps the comprehension from shadowing the full-document `doc`.
list_of_ners = [sent_doc for sent_doc in list_of_sents if sent_doc.ents]
len(list_of_ners)

1212

In [47]:
# Highlight the named entities of the first sentence inline in the notebook
first_sentence = list_of_sents[0]
displacy.render(first_sentence, style='ent', jupyter=True)