In [5]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

## Intro to NLP
- So far data has been at least minimally processed, i.e. translated into numerical values and organized into variables
- Most available information is verbal; words are full of data

## Processing and analysis
- NLP as a two-part problem:
    1. Process data from its original form (text or speech) into one a computer can understand
    2. Conduct analysis on the processed data
- Step 1 involves cleaning and/or feature extraction
    - **Language parsing:** dealing with verbal information
    - Domain knowledge: word frequency, meaning, grammar, used to extract features of interest
    - We already did some light language parsing when building the Naive Bayes spam filter
    
### NLP Packages
- **NLTK (Natural Language ToolKit):**
    - Customizable and transparent (good for learning)
    - Contains older models/methods that may not be optimal for production code
- **spaCy:** 
    - Processes text using latest & greatest algorithms/methods
    - Leaner & faster than NLTK
    - Less choice: spaCy picks the algorithms for you, so if spaCy's algorithms change, results may change
    - Written in Cython (Python-like code that is translated into C and compiled before it runs)
- **re:** regular expressions library to pull out specific elements from strings (then passed onto spaCy)

In [6]:
import nltk

#fetch only the corpora this notebook uses; a bare nltk.download() opens the
#interactive downloader, which blocks an unattended Restart & Run All
for resource in ('gutenberg', 'stopwords'):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Cleaning

In [7]:
from nltk.corpus import gutenberg, stopwords

#list the texts available in the NLTK gutenberg corpus
print(gutenberg.fileids())

#load the two novels as raw strings
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

#peek at the first 100 characters of alice in wonderland
print('\nRaw:\n', alice[:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [8]:
#remove the bracketed title by replacing it with an empty string
#the match is non-greedy and stays on one line, and every bracketed span in
#the text is removed, not only the first
#raw string: '\[' and '\]' are invalid escapes in a plain string literal and
#trigger a DeprecationWarning (SyntaxWarning on newer Pythons)
pattern = r'[\[].*?[\]]'
persuasion = re.sub(pattern, '', persuasion)
alice = re.sub(pattern, '', alice)

print('title removed:\n', alice[0:100])

title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [9]:
#strip chapter headings
#persuasion pattern: the word "Chapter" followed by digits
#alice pattern: "CHAPTER" plus the rest of that line (the chapter title)
persuasion_heading = re.compile(r'Chapter \d+')
alice_heading = re.compile(r'CHAPTER .*')
persuasion = persuasion_heading.sub('', persuasion)
alice = alice_heading.sub('', alice)

print('chapter headings removed:\n', alice[:100])

chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [10]:
#collapse every run of whitespace (newlines included) into a single space
#split() with no arguments splits on any whitespace and drops empty pieces
persuasion, alice = (' '.join(text.split()) for text in (persuasion, alice))

print('extra whitespace removed:\n', alice[:100])

extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


## What information can we extract from text?

### Tokens
- **Token:** individual meaningful piece from a text, generally words & punctuation
- **Tokenization:** process of breaking up text into tokens
- May discard some tokens that don't add informational value (such as punctuation)
- Stop words:
    - Class of potentially uninformative tokens
    - Includes frequently used words without much informational value ('the', 'of', etc)
    - May or may not be discarded based on NLP approach    

In [11]:
#english stop words as defined by NLTK
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
import spacy

#load the english pipeline by its full package name; the 'en' shortcut link
#is deprecated and no longer resolves in spaCy v3+
#NOTE(review): assumes 'en' was linked to en_core_web_sm (the default for
#`python -m spacy download en`) - confirm which model is installed
nlp = spacy.load('en_core_web_sm')

#parse novels into tokens using spacy
#calling the pipeline on a string parses it immediately
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [13]:
#inspect the parsed document: its type, length, first tokens and token type
doc_summary = [
    "The alice_doc object is a {} object.".format(type(alice_doc)),
    "It is {} tokens long".format(len(alice_doc)),
    "The first three tokens are '{}'".format(alice_doc[:3]),
    "The type of each token is {}".format(type(alice_doc[0])),
]
print(*doc_summary, sep='\n')

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34430 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [14]:
from collections import Counter
#extract information from tokenized text data
#count how often various tokens occur

#utility function to calculate how frequently words appear in text
def word_frequencies(text, include_stop=True):
    """Count how often each word appears in a parsed text.

    Args:
        text: iterable of tokens exposing ``text``, ``is_punct`` and ``is_stop``.
        include_stop: when False, tokens flagged as stop words are skipped.

    Returns:
        Counter mapping each kept token's text to its number of occurrences.
    """
    counts = Counter()
    for token in text:
        #punctuation is never counted
        if token.is_punct:
            continue
        #stop words are dropped only when the caller asks for it
        if token.is_stop and not include_stop:
            continue
        counts[token.text] += 1
    return counts

#ten most frequent words in each book, stop words included
alice_freq, persuasion_freq = (
    word_frequencies(doc).most_common(10) for doc in (alice_doc, persuasion_doc)
)
print('alice:', alice_freq)
print('persuasion:', persuasion_freq)

alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 534), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1121)]


In [15]:
#same ranking, this time filtering stop words through the optional keyword argument
alice_counts = word_frequencies(alice_doc, include_stop=False)
persuasion_counts = word_frequencies(persuasion_doc, include_stop=False)

alice_freq = alice_counts.most_common(10)
persuasion_freq = persuasion_counts.most_common(10)
print('alice:', alice_freq)
print('persuasion:', persuasion_freq)

alice: [('I', 534), ('said', 453), ('Alice', 394), ("n't", 215), ("'s", 190), ('little', 124), ('The', 102), ('like', 84), ('went', 83), ('know', 83)]
persuasion: [('I', 1121), ('Anne', 497), ("'s", 485), ('She', 326), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 255), ('He', 225), ('Wentworth', 217)]


In [16]:
#keep just the words from each top 10 list of (word, count) pairs
alice_common = [word for word, _count in alice_freq]
persuasion_common = [word for word, _count in persuasion_freq]

#set difference leaves the words found in only one book's top 10
alice_top, persuasion_top = set(alice_common), set(persuasion_common)
print('unique to alice:', alice_top - persuasion_top)
print('unique to persuasion:', persuasion_top - alice_top)

unique to alice: {'went', 'know', "n't", 'Alice', 'said', 'like', 'little', 'The'}
unique to persuasion: {'Mr', 'Wentworth', 'Mrs', 'Anne', 'He', 'She', 'Captain', 'Elliot'}


### Lemmas
- Use lemma (root word) to focus on an action or concept without splitting across all different forms of a word
    - E.g. think, thought, thinking
- Build a count of concepts by reducing words to their lemma and do counts again

In [18]:
#function to calculate lemma frequency
def lemma_frequencies(text, include_stop=True):
    """Count how often each lemma appears in a parsed text.

    Args:
        text: iterable of tokens exposing ``lemma_``, ``is_punct`` and ``is_stop``.
        include_stop: when False, tokens flagged as stop words are skipped.

    Returns:
        Counter mapping each kept token's lemma to its number of occurrences.
    """
    counts = Counter()
    for token in text:
        #punctuation is never counted
        if token.is_punct:
            continue
        #stop words are dropped only when the caller asks for it
        if token.is_stop and not include_stop:
            continue
        counts[token.lemma_] += 1
    return counts

#ten most common lemmas in each book, stop words excluded
alice_lemma_counts = lemma_frequencies(alice_doc, include_stop=False)
persuasion_lemma_counts = lemma_frequencies(persuasion_doc, include_stop=False)
alice_lemma_freq = alice_lemma_counts.most_common(10)
persuasion_lemma_freq = persuasion_lemma_counts.most_common(10)
print('\nAlice:', alice_lemma_freq)
print('persuasion:', persuasion_lemma_freq)

#identify the lemmas that are common in one text but not the other
alice_lemma_common = [lemma for lemma, _count in alice_lemma_freq]
persuasion_lemma_common = [lemma for lemma, _count in persuasion_lemma_freq]
alice_lemma_top = set(alice_lemma_common)
persuasion_lemma_top = set(persuasion_lemma_common)
print('Unique to Alice:', alice_lemma_top - persuasion_lemma_top)
print('Unique to Persuasion:', persuasion_lemma_top - alice_lemma_top)


Alice: [('-PRON-', 758), ('say', 476), ('alice', 396), ('be', 254), ('not', 231), ('go', 133), ('think', 131), ('little', 126), ('the', 109), ('look', 105)]
persuasion: [('-PRON-', 2241), ('anne', 497), ("'s", 466), ('captain', 303), ('elliot', 295), ('mrs', 291), ('good', 289), ('know', 258), ('think', 256), ('mr', 255)]
Unique to Alice: {'look', 'be', 'the', 'say', 'not', 'little', 'alice', 'go'}
Unique to Persuasion: {"'s", 'know', 'mrs', 'elliot', 'good', 'anne', 'mr', 'captain'}


### Sentences

- Split text into sentences using punctuation
- Sentiment analysis can categorize each sentence as positive or negative
- Sentence length, unique words, and contextual information can also be useful
- Use spaCy doc.sents to get each sentence as a span object

In [22]:
#doc.sents is consumed into a list so the sentences can be counted and indexed
sentences = list(alice_doc.sents)
sentence_count = len(sentences)
print('alice has {} sentences.'.format(sentence_count))

#third sentence, used as the running example in the cells below
example_sentence = sentences[2]
print('example sentence from alice: \n{}'.format(example_sentence))

alice has 1678 sentences.
example sentence from alice: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!


In [24]:
#word-level metrics for the example sentence, punctuation excluded
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = {token.text for token in example_words}

word_count, unique_count = len(example_words), len(unique_words)
print('there are {} words and {} are unique'.format(word_count, unique_count))

there are 29 words and 25 are unique


### Parts of speech
- Tokens within each sentence are coded with the parts of speech they play
- Useful for distinguishing between homographs (words with same spelling but different meaning)
- Polysemy: umbrella term for this kind of linguistic feature

In [25]:
#the fourth token is "break" in both phrases; print the tag it gets in each
for phrase in ('I need a break', 'I need to break the glass'):
    print(nlp(phrase)[3].pos_)

NOUN
VERB


In [26]:
#view parts of speech for the first nine tokens of the example sentence
print('\nparts of speech:')
leading_tokens = example_sentence[:9]
for token in leading_tokens:
    print(token.orth_, token.pos_)


parts of speech:
There ADV
was VERB
nothing NOUN
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


### Dependencies
- How words relate to each other syntactically
- [Stanford group dependencies page](https://nlp.stanford.edu/software/stanford-dependencies.shtml)

In [27]:
#for the first nine tokens: the token, its dependency label and its head token
print('dependencies:')
leading_tokens = example_sentence[:9]
for token in leading_tokens:
    head = token.head
    print(token.orth_, token.dep_, head.orth_)

dependencies:
There expl was
was ROOT was
nothing attr was
so advmod remarkable
VERY advmod remarkable
remarkable amod nothing
in prep nothing
that pobj in
; punct was


### Entities
- Some errors: unless an obvious rule applies, spaCy's identification rules assume that any word/phrase in all caps is an organization or an event

In [29]:
#first ten entities: label followed by the entity's tokens joined with spaces
entities = list(alice_doc.ents)[:10]
for entity in entities:
    entity_words = ' '.join(token.orth_ for token in entity)
    print(entity.label_, entity_words)

PERSON Alice
DATE the hot day
PERSON Alice
PRODUCT Rabbit
PRODUCT Rabbit
PRODUCT WAISTCOAT - POCKET
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First


In [30]:
#collect the text of every entity labelled PERSON, then print the distinct values
people = []
for entity in alice_doc.ents:
    if entity.label_ == 'PERSON':
        people.append(entity.text)
print(set(people))

{'Mock Turtle', 'the Lobster Quadrille?', 'Shakespeare', 'the March Hare', 'Gryphon', 'Morcar', 'Stupid', 'HAD', 'Repeat', 'The Fish-Footman', 'Rabbit', 'Jack', 'Panther', 'Soo', 'M--', 'Longitude', 'Alice', 'Elsie', 'this:--', 'Soles', 'Fury', 'Duchess', 'Mabel', 'Majesty', 'The White Rabbit', 'Pinch', 'the White Rabbit', 'Seaography', 'indeed:--', 'The Queen', 'Edwin', 'Turn', 'Sha', 'Beau', 'Edgar Atheling', 'Fifteenth', '--or', 'Kings', 'Queen', 'Shy', 'Frog-Footman', 'Duck', 'Down', 'Curiouser', 'Cheshire Puss', "the Duchess: '", 'Tillie', 'FUL SOUP', 'the Lobster Quadrille', 'Latin Grammar', 'Shall', 'Drink', 'Ma', 'Beautiful Soup', 'Tut', 'INSIDE', 'Hush', 'Treacle', 'Stolen', 'Latitude', 'Mary Ann', 'Footman', 'The Mock Turtle', 'the Duchess', 'Run', 'Canary', 'the King', 'a Lobster Quadrille', 'William', "the Mock Turtle: '", 'Hjckrrh', 'Sixteenth', 'Soup of the evening', 'WILLIAM', 'Stand', 'Lacie', 'm--', 'Idiot', 'Fetch', 'began:--', 'Ou', 'William the Conqueror', 'Sentence