## Word Tokenization
Word tokenization splits text into individual word tokens.

Example:
input:  Time is an illusion. Lunchtime double so!
output: "Time", "is", "an", "illusion", "Lunchtime", "double", "so"

In [1]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [5]:
# Fetch the "punkt" tokenizer data package into the local nltk_data directory.
# NOTE(review): presumably the data that word_tokenize / sent_tokenize rely on in
# the cells below — confirm the package name against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TEACAT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [17]:
# Fetch the "averaged_perceptron_tagger" data package into the local nltk_data directory.
# NOTE(review): presumably the model behind the pos_tag call further down — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\TEACAT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
# Sample sentence shared by the word-tokenizer cells; it contains contractions
# ("Don't", "Jone's") and an abbreviation ("Mr.") so the tokenizers can be compared.
sentence = (
    "Don't be fooled by the dark sounding name, "
    "Mr. Jone's Orphanage is as cheery as cherry goes "
    "for a pastry shop."
)

In [6]:
# NLTK's default word tokenizer: in the output below, "Don't" comes out as
# "Do" + "n't" and "Mr." stays a single token.
word_tokens = word_tokenize(sentence)
print(f"word_tokenize: {word_tokens}")

word_tokenize: ['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cherry', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [7]:
# WordPunctTokenizer: in the output below, punctuation is split off as separate
# tokens, so "Don't" comes out as "Don", "'", "t" and "Mr." as "Mr", ".".
punct_tokenizer = WordPunctTokenizer()
punct_tokens = punct_tokenizer.tokenize(sentence)
print(f"WordPunctTokenizer: {punct_tokens}")

WordPunctTokenizer: ['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cherry', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [8]:
# Keras helper: in the output below, tokens are lower-cased, commas and periods
# are dropped, and apostrophes inside words ("don't", "jone's") are kept.
keras_tokens = text_to_word_sequence(sentence)
print(f"text_to_word_sequence: {keras_tokens}")

text_to_word_sequence: ["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cherry', 'goes', 'for', 'a', 'pastry', 'shop']


### Penn Treebank Tokenization

Rule 1. Words containing a hyphen are kept as a single token.  
Rule 2. Words with an apostrophe clitic, such as "doesn't", are split at the clitic ("does" + "n't").

In [9]:
from nltk.tokenize import TreebankWordTokenizer
# Penn Treebank-style tokenizer instance; the next cell calls tokenizer.tokenize(sentence).
tokenizer = TreebankWordTokenizer()

In [10]:
# Apply the Treebank tokenizer to the same sample sentence; in the output below
# the clitics are split off ("Do" + "n't", "Jone" + "'s").
treebank_tokens = tokenizer.tokenize(sentence)
print(f"TreebankWordTokenizer: {treebank_tokens}")

TreebankWordTokenizer: ['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cherry', 'goes', 'for', 'a', 'pastry', 'shop', '.']


## Sentence Tokenization

Sentence tokenization splits text into individual sentences; it is also called "sentence segmentation".  

In [11]:
from nltk.tokenize import sent_tokenize

In [12]:
# Five-sentence sample paragraph used to demonstrate sentence segmentation.
# Written as one literal per sentence; the pieces concatenate into a single string.
sentences = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to make sure no one was near."
)

In [13]:
# Split the paragraph into sentences; the output below lists one string per sentence.
sentence_list = sent_tokenize(sentences)
print(f"sent_tokenize: {sentence_list}")

sent_tokenize: ['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


## Part-of-Speech Tagging with NLTK

### tagging list
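- PRP: personal pronoun
- VBP: verb, non-3rd person singular present
- RB: adverb
- VBG: verb, gerund or present participle
- IN: preposition or subordinating conjunction
- NNP: proper noun, singular
- NNS: noun, plural
- NN: noun, singular or mass
- CC: coordinating conjunction
- DT: determiner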

In [14]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [15]:
# Sample text with the abbreviation "Ph.D.", whose internal periods are not
# sentence boundaries; tokenize it into words for the tagging cell below.
text = (
    "I am actively looking for Ph.D. students. "
    "and you are a Ph.D. student."
)
tokenized_sentence = word_tokenize(text)

In [18]:
# Show the raw tokens first, then the same tokens paired with their POS tags.
# The first print stays ahead of pos_tag so the order of effects is unchanged.
print(f"word tokenization : {tokenized_sentence}")
tagged_sentence = pos_tag(tokenized_sentence)
print(f"part of speech tagging : {tagged_sentence}")

word tokenization : ['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']
part of speech tagging : [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]
