In [38]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy

from spacy.tokens import Token
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import gutenberg, stopwords
from collections import Counter

%matplotlib inline

In [2]:
import nltk

# Fetch only the corpora this notebook reads, by name. A bare nltk.download()
# opens the interactive installer, which stalls "Restart & Run All".
nltk.download('gutenberg')   # novels read through nltk.corpus.gutenberg
nltk.download('stopwords')   # list printed by stopwords.words('english')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [60]:
def stop_words_getter(token):
    """Return a truthy value when spaCy flags the token as a stop word or
    when its lowercase form is listed in STOP_WORDS."""
    return token.is_stop or token.lower_ in STOP_WORDS


# Register the getter as the custom attribute `token._.is_stop`;
# force=True lets this cell be re-run without a "already exists" error.
Token.set_extension('is_stop', getter=stop_words_getter, force=True)


In [61]:
# List the texts available in NLTK's Gutenberg corpus
print(gutenberg.fileids())

# Load the raw text of the two novels we will compare
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# Print the first 100 characters of Alice in Wonderland
print('\nRaw:\n', alice[:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [62]:
# This pattern matches all text between square brackets (non-greedy; '.' does
# not cross line breaks). It is written as a raw string so the backslashes
# reach the regex engine untouched: in a plain string '\[' and '\]' are
# invalid escape sequences that newer Python versions warn about.
pattern = r'\[.*?\]'
persuasion = re.sub(pattern, '', persuasion)
alice = re.sub(pattern, '', alice)

# Print first 100 characters of alice again
print('Title removed:\n', alice[0:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [63]:
# Now we'll match and remove chapter headings.
# Persuasion: the word "Chapter" followed by a number is removed.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
# Alice: everything from "CHAPTER " to the end of that line is removed.
alice = re.sub(r'CHAPTER .*', '', alice)

# Ok, what's it look like now?
print('Chapter headings removed:\n', alice[0:100])

Chapter headings remvoed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [64]:
# Remove newlines and other extra whitespace by splitting and rejoining
persuasion = ' '.join(persuasion.split())
alice = ' '.join(alice.split())

# All done with cleanup? Let's see how it looks.
# The text is not lowercased anywhere in this cell, so the label only
# mentions the whitespace cleanup that actually happened.
print('Extra whitespace removed:\n', alice[0:100])

Extra whitespace removed and file lowercased:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [65]:
# Show the English stop words that ship with NLTK
english_stop_words = stopwords.words('english')
print(english_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

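# spaCy
Use spaCy to parse the novels. Calling the loaded pipeline on a string tokenizes it and annotates each token (lemma, part of speech, dependencies, named entities), which the following cells explore.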

In [66]:
# Load spaCy's English pipeline. The 'en' shortcut is tried first so existing
# setups behave exactly as before; spaCy versions that no longer accept
# shortcut names raise OSError, and the model is then loaded by package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# All the processing work is done here, so it may take a while.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [67]:
# Let's explore the objects: each template is paired with the value it reports
for template, value in (
    ('The alice_doc object is a {} object.', type(alice_doc)),
    ('It is {} tokens long', len(alice_doc)),
    ('The first three tokens are "{}"', alice_doc[:3]),
    ('The type of each token is {}', type(alice_doc[0])),
):
    print(template.format(value))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34430 tokens long
The first three tokens are "Alice was beginning"
The type of each token is <class 'spacy.tokens.token.Token'>


In [68]:
# We can count tokens

# Utility function to calculate how frequently words appear in the text
def word_frequencies(text, include_stop=True):
    """Count how often each word occurs in a parsed text.

    Punctuation tokens are always skipped. Tokens whose custom
    `token._.is_stop` attribute is truthy are skipped only when
    `include_stop` is False.

    Args:
        text: iterable of tokens (e.g. a spaCy Doc).
        include_stop: whether stop words are counted.

    Returns:
        A Counter mapping each kept token's `text` to its number of
        occurrences.
    """
    def is_counted(token):
        # Punctuation never counts; stop words count only on request.
        if token.is_punct:
            return False
        return not token._.is_stop or include_stop

    return Counter(token.text for token in text if is_counted(token))

# The ten most frequent words in each novel (stop words included)
alice_freq, persuasion_freq = (
    word_frequencies(doc).most_common(10) for doc in (alice_doc, persuasion_doc)
)
for label, freq in (('Alice:', alice_freq), ('Persuasion:', persuasion_freq)):
    print(label, freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 534), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1121)]


In [69]:
# Same ranking again, this time passing include_stop=False so that
# stop words are left out of the counts.
alice_freq, persuasion_freq = (
    word_frequencies(doc, include_stop=False).most_common(10)
    for doc in (alice_doc, persuasion_doc)
)
for label, freq in (('Alice:', alice_freq), ('Persuasion:', persuasion_freq)):
    print(label, freq)

Alice: [('said', 453), ('Alice', 394), ("n't", 215), ("'s", 190), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73)]
Persuasion: [('Anne', 497), ("'s", 485), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 255), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175)]


In [70]:
# Keep only the words (dropping the counts) from each frequency list
alice_common = [word for word, _count in alice_freq]
persuasion_common = [word for word, _count in persuasion_freq]

# Set difference gives the words found in one top ten but not in the other
print('Unique to Alice:', set(alice_common).difference(set(persuasion_common)))
print('Unique to Persuasion:', set(persuasion_common).difference(set(alice_common)))

Unique to Alice: {'Queen', 'thought', "n't", 'like', 'Alice', 'went', 'know', 'said'}
Unique to Persuasion: {'Mrs', 'Wentworth', 'Mr', 'Captain', 'good', 'Lady', 'Elliot', 'Anne'}


# Lemmas - root of each word

In [71]:
# Utility function to calculate how frequently lemmas appear in the text
def lemma_frequencies(text, include_stop=True):
    """Count how often each lemma occurs in a parsed text.

    Punctuation tokens are always skipped. Tokens whose custom
    `token._.is_stop` attribute is truthy are skipped only when
    `include_stop` is False.

    Args:
        text: iterable of tokens (e.g. a spaCy Doc).
        include_stop: whether stop words are counted.

    Returns:
        A Counter mapping each kept token's `lemma_` to its number of
        occurrences.
    """
    def is_counted(token):
        # Punctuation never counts; stop words count only on request.
        if token.is_punct:
            return False
        return not token._.is_stop or include_stop

    return Counter(token.lemma_ for token in text if is_counted(token))

# Ten most common lemmas per novel, with stop words excluded
alice_lemma_freq, persuasion_lemma_freq = (
    lemma_frequencies(doc, include_stop=False).most_common(10)
    for doc in (alice_doc, persuasion_doc)
)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)

# Again, identify the lemmas that are in one top ten but not the other
alice_lemma_common = [lemma for lemma, _count in alice_lemma_freq]
persuasion_lemma_common = [lemma for lemma, _count in persuasion_lemma_freq]
print('Unique to Alice:', set(alice_lemma_common).difference(set(persuasion_lemma_common)))
print('Unique to Persuasion:', set(persuasion_lemma_common).difference(set(alice_lemma_common)))


Alice: [('say', 476), ('alice', 396), ('be', 232), ('not', 215), ('think', 131), ('go', 130), ('little', 126), ('look', 105), ('know', 103), ('come', 97)]
Persuasion: [('anne', 497), ("'s", 466), ('captain', 303), ('elliot', 295), ('mrs', 291), ('good', 289), ('know', 258), ('think', 256), ('mr', 255), ('lady', 242)]
Unique to Alice: {'look', 'not', 'be', 'come', 'say', 'alice', 'go', 'little'}
Unique to Persuasion: {'anne', 'captain', 'mrs', "'s", 'good', 'elliot', 'mr', 'lady'}


# Sentences

In [72]:
# Initial exploration of sentences
sentences = list(alice_doc.sents)
print('Alice in Wonderland has {} sentences.'.format(len(sentences)))

# Keep the third sentence around as a worked example
example_sentence = sentences[2]
print('Here is an example: \n{}\n'.format(example_sentence))

Alice in Wonderland as 1678 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [73]:
# Look at some metrics around this sentence:
# non-punctuation tokens versus distinct token texts.
example_words = [word for word in example_sentence if not word.is_punct]
unique_words = {word.text for word in example_words}

print(("There are {} words in this sentence, and {} of them are"
       " unique.").format(len(example_words), len(unique_words)))

There are 29 words in this sentence, and 25 of them are unique.


# Parts of speech, dependencies, entities

In [74]:
# Print the part-of-speech tag of the fourth token ("break") in each phrase
for phrase in ('I need a break', 'I need to break the glass'):
    print(nlp(phrase)[3].pos_)

NOUN
VERB


In [75]:
# Show each of the first nine tokens of the example sentence next to its
# part-of-speech tag.
print('\nParts of speech:')
for token in example_sentence[:9]:
    print('{} {}'.format(token.orth_, token.pos_))


Parts of speech:
There ADV
was VERB
nothing NOUN
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


In [76]:
# For the first nine tokens of the example sentence, show the token,
# its dependency label, and the head token it attaches to.
print('\nDependencies:')
for token in example_sentence[:9]:
    print('{} {} {}'.format(token.orth_, token.dep_, token.head.orth_))


Dependencies:
There expl was
was ROOT was
nothing attr was
so advmod remarkable
VERY advmod remarkable
remarkable amod nothing
in prep nothing
that pobj in
; punct was


In [77]:
# Take the first ten entities and print each label with its tokens,
# joined by single spaces.
entities = list(alice_doc.ents)[:10]
for entity in entities:
    entity_text = ' '.join(tok.orth_ for tok in entity)
    print(entity.label_, entity_text)

PERSON Alice
DATE the hot day
PERSON Alice
PRODUCT Rabbit
PRODUCT Rabbit
PRODUCT WAISTCOAT - POCKET
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First


In [78]:
# Collect the text of every entity spaCy labels as PERSON,
# then print the unique values.
people = [ent.text for ent in alice_doc.ents if ent.label_ == "PERSON"]
print(set(people))

{'Brandy', 'William', 'Soles', 'Prizes', 'The Queen', 'Jack', 'Longitude', 'Sha', 'Gryphon', 'FUL SOUP', 'Majesty', 'Mabel', 'Panther', 'YOURS', 'Soup', 'Shakespeare', 'Turtle Soup', "the King: '", 'Pinch', 'Latin Grammar', 'Latitude', 'the White Rabbit', 'HAD', 'Mary Ann', 'a Lobster Quadrille', 'M--', 'Begin', 'the Lobster Quadrille', 'Footman', 'Curiouser', 'the Queen of Hearts', "Dinah'll", 'Lacie', 'Sentence', 'Beautiful Soup', 'the Duchess', 'Soo', 'Tillie', 'Frog-Footman', 'Edwin', 'Kings', 'WILLIAM', "Don't", 'Fish-Footman', 'this:--', 'Ou', 'm--', 'Soup of the evening', 'Ma', 'Edgar Atheling', 'Adventures', 'the Lobster Quadrille?', 'Duck', 'Shy', 'Duchess', 'Serpent', 'Repeat', 'Pat', 'William the Conqueror', 'Turn', 'The White Rabbit', 'Canary', 'Sixteenth', 'Stupid', 'Treacle', 'Queen', 'The Fish-Footman', 'Shall', 'Down', 'Fifteenth', 'began:--', 'Idiot', 'Bill', 'Tut', 'Run', 'Mock Turtle', 'Fury', 'Hush', 'Cheshire Puss', 'the King', 'Morcar', 'Elsie', 'Seaography', 'Sta

In [1]:
# Store the "ruler_column" setting in the notebook frontend configuration.
# ConfigManager is imported from the `notebook` package when available;
# `IPython.html` is the legacy location and is kept only as a fallback.
try:
    from notebook.services.config import ConfigManager
except ImportError:
    from IPython.html.services.config import ConfigManager

ip = get_ipython()
cm = ConfigManager(parent=ip)
# Left as the last expression so the cell displays the value update() returns.
cm.update('notebook', {"ruler_column": [80]})



{'Cell': {'cm_config': {'lineNumbers': False}}, 'ruler_column': [80]}