# Lesson Prepare NLP

In [2]:
import pandas as pd
import numpy as np

In [32]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [33]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

# 1. Lowercase 

In [34]:
original = original.lower()
original

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

# 2. Remove accented characters and non-ASCII characters

In [35]:
import unicodedata

# use codes NFKD
unicodedata.normalize('NFKD', original)


"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [36]:
# gets rid of  accents and ignores characters that it can not encode in ascii
# codes everythng into the ascii standard
unicodedata.normalize('NFKD', original).encode('ascii','ignore')

b"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [37]:
# normalize around utf-8\
# takes characters utf-8 and decodes them, gets rid of them
unicodedata.normalize('NFKD', original).encode('ascii','ignore').decode('utf-8')

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [38]:
original  = unicodedata.normalize('NFKD', original).encode('ascii','ignore').decode('utf-8')
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 3. Remove any special characters

In [39]:
import re

In [40]:
re.sub('[^a-z)-9\'\s]', '',original)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' 'o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [71]:
# apply to original
original = re.sub('[^a-z0-9\'\s]', '',original)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 4. Tokenize

In [72]:
# break things down into smaller units. Seperates punctuation from other characters.

In [73]:
import nltk

In [74]:
# create an object
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x12ad2f310>

In [75]:
# string is true
tokenize.tokenize(original, return_str=True)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [76]:
# string is false will create a list of words
#tokenize.tokenize(original, return_str=False)

In [81]:
# assigned to original 
original  = tokenize.tokenize(original, return_str=True)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

# 5. Stemming or Lemmatizing

### stemming

In [None]:
# stemming is good for large amount of data but not a large amount of processing power
# stem is useful for a lot of data that needs to be processed quickly
# lemmatizing is more sophisticated. Good for fine tunning.

In [77]:
# create object
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [78]:
# stem changes all words to call
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [79]:
# we lose the e in the word house
ps.stem('house'), ps.stem('housing'),ps.stem('housed')

('hous', 'hous', 'hous')

In [80]:
ps.stem(original)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [82]:
# seems to change algorithm when it is in a long string. behavior is diffirenct per word.
ps.stem('contributed')

'contribut'

In [83]:
# so we will apply stem per word
[ps.stem(word) for word in original.split()]

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdo',
 "'",
 's',
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'doubl',
 'acut',
 'accent',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdo',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

In [85]:
# join back the words by saing on space join this words
stems = [ps.stem(word) for word in original.split()]
' '.join(stems)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### lemmatize

In [None]:
# more sophisticated, has a different algorithm, requires more resources to run.
# perserves more meaning, gives ability to fine tune.

In [None]:
# run the first time you use in new computer, then you do not need to run again
# nltk.download('all')

In [87]:
original # set tokenizer

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [89]:
# this is a lemmatizer even though its named under stem, it is not a stemmer

# create object
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [93]:
wnl.lemmatize('calling'), wnl.lemmatize('calls'), wnl.lemmatize('called')

('calling', 'call', 'called')

In [94]:
wnl.lemmatize('house'), wnl.lemmatize('housing')

('house', 'housing')

##### mouse vs mice The lemmatizer understands that mouse is a word, the stemmer does not.

In [95]:
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [97]:
ps.stem('mouse'), ps.stem('mice')

('mous', 'mice')

In [101]:
# apply lemmatizer to original
[wnl.lemmatize(word) for word in original.split()]


['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [102]:
lemmas = [wnl.lemmatize(word) for word in original.split()]
" ".join(lemmas)

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

In [103]:
# not saved back to original to be explicit about only use one, stem or lemma not both!!!!!

# 6. Remove Stopwords

In [105]:
from nltk.corpus import stopwords

In [106]:
# Run this once per computer
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yvetteibarra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [117]:
stopwords_eng = stopwords.words('english')
stopwords_eng

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [118]:
# length of stopword list
len(stopwords_eng)

179

In [119]:
# we can add more to the list of stopwords
stopwords_eng.append('o')
stopwords_eng.append(" ' ")

In [120]:
len(stopwords_eng)

181

In [122]:
# this is the stopwords applied(taken out of) the original text
[word for word in original.split() if word not in stopwords_eng]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematicians',
 'contributed',
 'lot',
 'field',
 'erdos',
 "'",
 'name',
 'contains',
 'hungarian',
 'letter',
 "'",
 "'",
 "'",
 "'",
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']