## Tokenizing
- By word: break down the writing into each word
- By sentence: break down the writing into each sentence

In [3]:
# Import dependencies
from nltk.tokenize import sent_tokenize, word_tokenize

In [7]:
# Example string
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [8]:
# Tokenize string by sentence.
# The triple-quoted example starts with a newline, which stays attached to
# the first sentence in the result.
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [9]:
# Tokenize by word
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

## Filtering Stop Words
*Stop Words*: common words you want to ignore 

In [11]:
# Import dependencies
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
# Example string
worf_quote = "Sir, I protest. I am not a merry man!"

In [14]:
words_in_quote = word_tokenize(worf_quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [15]:
# Create a set of English stop words; the filter below only needs
# membership tests (`not in stop_words`), which a set answers directly.
stop_words = set(stopwords.words('english'))

In [20]:
# Initialize empty list for filtered quote
filtered_list = []

In [21]:
# Filter string using stop word set.
# casefold() normalises case so capitalised tokens such as 'I' still match
# the lowercase entries in stop_words.
# Building the list in one expression keeps this cell idempotent: re-running
# it rebuilds filtered_list instead of appending the same words again.
filtered_list = [
    word for word in words_in_quote if word.casefold() not in stop_words
]

## Alternate filtering approach (explicit loop)
# filtered_list = []
# for word in words_in_quote:
#     if word.casefold() not in stop_words:
#         filtered_list.append(word)

In [23]:
filtered_list

# Note:
    # Words like 'I' and 'not' have been filtered out, but might be 
    # important to the sentence's meaning. Be aware of what's in a 
    # stop word list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

## Stemming
- Reducing words to their root

In [24]:
# Import dependencies
from nltk.stem import PorterStemmer

In [26]:
# Create stemmer
stemmer = PorterStemmer()

In [27]:
# Example string
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [28]:
# Word Tokenize the string before stemming 
words = word_tokenize(string_for_stemming)

In [30]:
# Run every token through the Porter stemmer, preserving token order
stemmed_words = list(map(stemmer.stem, words))

In [31]:
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

##### Breakdown of words with 'discov' roots

| Original word | Stemmed Version |
| ----------- | ----------- |    
| 'Discovery' | 'discoveri' |
| 'discovered' | 'discov' |
| 'discoveries' | 'discoveri' |
| 'Discovering' | 'discov' |


*Understemming*: two related words *should* be reduced to the same word but *aren't* 
    - False Negative

*Overstemming*: two unrelated words *shouldn't* be reduced to the same word but *are*
    - False Positive


Snowball Stemmer (Porter2) improves on PorterStemmer - also in NLTK

## Tagging Parts of Speech

##### 8 Parts of Speech in English

| Part of Speech | Role | Examples |
|----------------|------|----------|
| Noun | Is a person, place, or thing | mountain, bagel, Poland |
| Pronoun | Replaces a noun | you, she, we |
| Adjective | Gives information about what a noun is like | efficient, windy, cloudy |
| Verb | Is an action or a state of being | learn, is, go |
| Adverb | Gives information about a verb, adjective, or another adverb | efficiently, always, very |
| Preposition | Gives information about how a noun or pronoun is connected to another word | from, about, at |
| Conjunction | Connects two other words or phrases | so, because, and |
| Interjection | Is an exclamation | yay, ow, wow |


In [34]:
# Example string
sagan_quote = """
If you wish to make an apple pie from scratch,
you must first invent the universe."""

In [35]:
# Import dependencies
from nltk.tokenize import word_tokenize
import nltk

In [36]:
# Tokenize string
words_in_sagan_quote = word_tokenize(sagan_quote)

In [39]:
# Tag parts of speech.
# The result is a list of (token, tag) pairs; the tag abbreviations can be
# looked up with nltk.help.upenn_tagset().
nltk.pos_tag(words_in_sagan_quote)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [42]:
# Find tags and their meaning.
# Called without arguments this prints the entry for every tag, so the
# output is long. Pass a tag pattern, e.g. nltk.help.upenn_tagset('NN'),
# to show only the matching entries.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Lemmatizing
Similar to stemming, but provides a complete word.

| Word | Definition |  Example(s) |
|-------|---------|-------|
| Lemma | A word that represents a whole group of words | e.g., blend |
| Lexeme | Group of words represented by the lemma | e.g., blending, blender |

In [44]:
# Import dependencies
from nltk.stem import WordNetLemmatizer


In [45]:
# Create lemmatizer
lemmatizer = WordNetLemmatizer()

In [50]:
lemmatizer.lemmatize('scarves')

'scarf'

In [51]:
# Example string
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [52]:
# Word tokenize the string
words = word_tokenize(string_for_lemmatizing)

In [54]:
# Lemmatize each token in turn (default part of speech), keeping token order
lemmatized_words = list(map(lemmatizer.lemmatize, words))

In [55]:
lemmatized_words

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [56]:
# With no `pos` argument, lemmatize() treats the word as a noun ('n'),
# so 'worst' comes back unchanged
lemmatizer.lemmatize('worst')

'worst'

In [57]:
# Passing pos='a' tells the lemmatizer to treat 'worst' as an adjective,
# which lets it resolve to the lemma 'bad'
lemmatizer.lemmatize('worst', pos = 'a')

'bad'

## Chunking

In [None]:
# Import dependencies
import nltk
from 