# Tokenization

In [None]:
# English prose sample containing an abbreviation ("Mr."), a contraction and quoted speech.
text = """Mr. Santa was busy today; But he didn't have that much work. "Hii, how are you doing? good work today!", he said."""

# The trailing backslash continues the string literal on the next source line,
# so "proba-" and "bly" are joined into "proba-bly" without a newline in between.
text2 = "such a long sentence that it proba-\
bly won't fit in a single line, but here's an intra-word hyphen too! These words will not be seperated: combinedWords combinedlongwords"

# Informal, social-media style sample: emoticon, emoji, hashtag, mention, URL and e-mail address.
text3 = "Hiii! :D how are y'all doin' 😀. #DailyTweet @DailyTweeter full-stops.... and a link!!? https://imgur.com/gallery/y8u0gg8. In comparision to:these f123@gmail.com a word i made up is ran'om cuz i cant spel"

# Hindi sample in Devanagari script; its sentences end with the danda character "।".
text_hi = "राम-श्याम बहुत ही सामान्य नाम हैं। शनैः शनैः दिन ढल गया। अतः यह सिद्ध होता है कि सूर्य पूर्व निकलता है।"

## Python and Regex

In [None]:
# Naive whitespace tokenization: punctuation stays attached to the neighbouring word (e.g. 'today;').
text.split() 

['Mr.',
 'Santa',
 'was',
 'busy',
 'today;',
 'But',
 'he',
 "didn't",
 'have',
 'that',
 'much',
 'work.',
 '"Hii,',
 'how',
 'are',
 'you',
 'doing?',
 'good',
 'work',
 'today!",',
 'he',
 'said.']

In [None]:
# Naive sentence splitting on ". ": it also breaks after the abbreviation "Mr." and ignores "?" and "!".
text.split(". ") 

['Mr',
 "Santa was busy today; But he didn't have that much work",
 '"Hii, how are you doing? good work today!", he said.']

In [None]:
import re

# A token is either a run of word characters with an optional apostrophe
# suffix (which keeps contractions such as "didn't" and "won't" whole), or
# any single character that is neither a word character nor whitespace.
rx = r"\w+(?:'\w+)?|[^\w\s]"

find_tokens = re.compile(rx).findall
for sample_text in (text, text2):
    print(find_tokens(sample_text))

['Mr', '.', 'Santa', 'was', 'busy', 'today', ';', 'But', 'he', "didn't", 'have', 'that', 'much', 'work', '.', '"', 'Hii', ',', 'how', 'are', 'you', 'doing', '?', 'good', 'work', 'today', '!', '"', ',', 'he', 'said', '.']
['such', 'a', 'long', 'sentence', 'that', 'it', 'proba', '-', 'bly', "won't", 'fit', 'in', 'a', 'single', 'line', ',', 'but', "here's", 'an', 'intra', '-', 'word', 'hyphen', 'too', '!', 'These', 'words', 'will', 'not', 'be', 'seperated', ':', 'combinedWords', 'combinedlongwords']


## Libraries Discussed:
- NLTK (fast, and useful)
- spaCy (very useful, plenty of features)
- Stanza (good language support)
- Indic NLP Library
- Hugging Face tokenizers

# NLTK

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Punkt sentence-tokenizer data. Recent NLTK releases load the "punkt_tab"
# resource instead of the pickled "punkt" models, so request both; this keeps
# the cell working on old and new NLTK versions alike.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sentence-level, then word-level tokenization of the first two samples,
# with a blank line after each sample.
for sample_text in (text, text2):
    print(sent_tokenize(sample_text))
    print(word_tokenize(sample_text), "\n")

print(sent_tokenize(text3))
print(word_tokenize(text3))  # "cant" is a single word here, the link is separated

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
["Mr. Santa was busy today; But he didn't have that much work.", '"Hii, how are you doing?', 'good work today!', '", he said.']
['Mr.', 'Santa', 'was', 'busy', 'today', ';', 'But', 'he', 'did', "n't", 'have', 'that', 'much', 'work', '.', '``', 'Hii', ',', 'how', 'are', 'you', 'doing', '?', 'good', 'work', 'today', '!', '``', ',', 'he', 'said', '.'] 

["such a long sentence that it proba-bly won't fit in a single line, but here's an intra-word hyphen too!", 'These words will not be seperated: combinedWords combinedlongwords']
['such', 'a', 'long', 'sentence', 'that', 'it', 'proba-bly', 'wo', "n't", 'fit', 'in', 'a', 'single', 'line', ',', 'but', 'here', "'s", 'an', 'intra-word', 'hyphen', 'too', '!', 'These', 'words', 'will', 'not', 'be', 'seperated', ':', 'combinedWords', 'combinedlongwords'] 

['Hiii!', ":D how are y'all doin' 😀.", '#DailyTweet @DailyTweeter full-stops.... and a l

# spaCy

In [None]:
!pip install -U spacy
!python -m spacy download en

Collecting spacy
  Downloading spacy-3.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.2 MB/s 
Collecting pathy>=0.3.5
  Downloading pathy-0.6.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
[?25hCollecting catalogue<2.1.0,>=2.0.4
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.7
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)
[K     |████████████████████████████████| 456 kB 49.9 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 26.6 MB/s 
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.8
  Downloading thinc-8.0.10-cp37-cp37m-m

In [None]:
# Import the blank language classes for Hindi and English (no tagger, parser, NER or word vectors are loaded here)
from spacy.lang.hi import Hindi
from spacy.lang.en import English 

In [None]:
# Blank English pipeline plus a sentencizer so that `Doc.sents` is available.
nlp = English()
nlp.add_pipe('sentencizer')

# Calling the pipeline on a string yields a Doc holding its tokens and sentences.
my_doc, my_doc2, my_doc3 = (nlp(sample_text) for sample_text in (text, text2, text3))

# Show the detected sentences of the first two samples, one per line.
for parsed in (my_doc, my_doc2):
    for sent in parsed.sents:
        print(sent.text)

Mr. Santa was busy today; But he didn't have that much work. "
Hii, how are you doing?
good work today!",
he said.
such a long sentence that it proba-bly won't fit in a single line, but here's an intra-word hyphen too!
These words will not be seperated: combinedWords combinedlongwords


In [None]:
# Surface text of every spaCy token in the first sample.
token_list = [token.text for token in my_doc]
print(token_list)

['Mr.', 'Santa', 'was', 'busy', 'today', ';', 'But', 'he', 'did', "n't", 'have', 'that', 'much', 'work', '.', '"', 'Hii', ',', 'how', 'are', 'you', 'doing', '?', 'good', 'work', 'today', '!', '"', ',', 'he', 'said', '.']


In [None]:
# Surface text of every spaCy token in the second sample.
token_list = [token.text for token in my_doc2]
print(token_list)

['such', 'a', 'long', 'sentence', 'that', 'it', 'proba', '-', 'bly', 'wo', "n't", 'fit', 'in', 'a', 'single', 'line', ',', 'but', 'here', "'s", 'an', 'intra', '-', 'word', 'hyphen', 'too', '!', 'These', 'words', 'will', 'not', 'be', 'seperated', ':', 'combinedWords', 'combinedlongwords']


In [None]:
# Surface text of every spaCy token in the third (social-media style) sample.
token_list = [token.text for token in my_doc3]
token_list

['Hiii',
 '!',
 ':D',
 'how',
 'are',
 "y'",
 'all',
 "doin'",
 '😀',
 '.',
 '#',
 'DailyTweet',
 '@DailyTweeter',
 'full',
 '-',
 'stops',
 '....',
 'and',
 'a',
 'link',
 '!',
 '!',
 '?',
 'https://imgur.com/gallery/y8u0gg8',
 '.',
 'In',
 'comparision',
 'to',
 ':',
 'these',
 'f123@gmail.com',
 'a',
 'word',
 'i',
 'made',
 'up',
 'is',
 "ran'om",
 'cuz',
 'i',
 'ca',
 'nt',
 'spel']

In [None]:
# Rebind `nlp` to a blank Hindi pipeline with a sentencizer for sentence boundaries.
nlp = Hindi()
nlp.add_pipe('sentencizer')

my_doc_hi = nlp(text_hi)

# Print the detected Hindi sentences, one per line.
hindi_sentences = [sent.text for sent in my_doc_hi.sents]
for sentence_text in hindi_sentences:
    print(sentence_text)

राम-श्याम बहुत ही सामान्य नाम हैं।
शनैः शनैः दिन ढल गया।
अतः यह सिद्ध होता है कि सूर्य पूर्व निकलता है।


In [None]:
# Surface text of every spaCy token in the Hindi sample.
token_list = [token.text for token in my_doc_hi]
print(token_list)

['राम', '-', 'श्याम', 'बहुत', 'ही', 'सामान्य', 'नाम', 'हैं', '।', 'शनैः', 'शनैः', 'दिन', 'ढल', 'गया', '।', 'अतः', 'यह', 'सिद्ध', 'होता', 'है', 'कि', 'सूर्य', 'पूर्व', 'निकलता', 'है', '।']


# Indic-nlp

In [None]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.81-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 3.1 MB/s 
Collecting sphinx-argparse
  Downloading sphinx_argparse-0.3.1-py2.py3-none-any.whl (12 kB)
Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 7.0 MB/s 
Collecting docutils>=0.11
  Downloading docutils-0.16-py2.py3-none-any.whl (548 kB)
[K     |████████████████████████████████| 548 kB 69.4 MB/s 
Installing collected packages: docutils, sphinx-rtd-theme, sphinx-argparse, morfessor, indic-nlp-library
  Attempting uninstall: docutils
    Found existing installation: docutils 0.17.1
    Uninstalling docutils-0.17.1:
      Successfully uninstalled docutils-0.17.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. Th

In [None]:
from indicnlp.tokenize import indic_tokenize  
# Tokenize the Hindi sample with the Indic NLP Library's trivial tokenizer.
indic_string= text_hi
indic_tokenize.trivial_tokenize(indic_string)

['राम',
 '-',
 'श्याम',
 'बहुत',
 'ही',
 'सामान्य',
 'नाम',
 'हैं',
 '।',
 'शनैः',
 'शनैः',
 'दिन',
 'ढल',
 'गया',
 '।',
 'अतः',
 'यह',
 'सिद्ध',
 'होता',
 'है',
 'कि',
 'सूर्य',
 'पूर्व',
 'निकलता',
 'है',
 '।']

# Stanza

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.2.3-py3-none-any.whl (342 kB)
[?25l[K     |█                               | 10 kB 20.1 MB/s eta 0:00:01[K     |██                              | 20 kB 25.8 MB/s eta 0:00:01[K     |██▉                             | 30 kB 13.4 MB/s eta 0:00:01[K     |███▉                            | 40 kB 10.1 MB/s eta 0:00:01[K     |████▉                           | 51 kB 5.3 MB/s eta 0:00:01[K     |█████▊                          | 61 kB 5.9 MB/s eta 0:00:01[K     |██████▊                         | 71 kB 5.6 MB/s eta 0:00:01[K     |███████▋                        | 81 kB 6.3 MB/s eta 0:00:01[K     |████████▋                       | 92 kB 4.8 MB/s eta 0:00:01[K     |█████████▋                      | 102 kB 5.1 MB/s eta 0:00:01[K     |██████████▌                     | 112 kB 5.1 MB/s eta 0:00:01[K     |███████████▌                    | 122 kB 5.1 MB/s eta 0:00:01[K     |████████████▌                   | 133 kB 5.1 MB/s eta 0:00:01[K 

In [None]:
import stanza

# Fetch the default Stanza model packages for English, Hindi and Korean.
for lang_code in ('en', 'hi', 'ko'):
    stanza.download(lang_code)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-09-13 12:13:41 INFO: Downloading default packages for language: ko (Korean)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.2/ko/default.zip:   0%|          | 0.00/212M [00:00<?,…

2021-09-13 12:14:22 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
# Fetch the default Stanza model package for Japanese.
stanza.download('ja')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-09-13 05:31:36 INFO: Downloading default packages for language: ja (Japanese)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.2/ja/default.zip:   0%|          | 0.00/205M [00:00<?,…

2021-09-13 05:32:16 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
# English Stanza pipeline restricted to the tokenize processor.
nlpEn = stanza.Pipeline(lang='en', processors='tokenize')
doc, doc2, doc3 = (nlpEn(sample_text) for sample_text in (text, text2, text3))

# Show the sentence segmentation of the first two samples, one sentence per line.
for parsed in (doc, doc2):
    for sentence in parsed.sentences:
        print(sentence.text)

2021-09-13 05:44:48 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-09-13 05:44:48 INFO: Use device: cpu
2021-09-13 05:44:48 INFO: Loading: tokenize
2021-09-13 05:44:48 INFO: Done loading processors!


Mr. Santa was busy today;
But he didn't have that much work.
"Hii, how are you doing?
good work today!", he said.
such a long sentence that it proba-bly won't fit in a single line, but here's an intra-word hyphen too!
These words will not be seperated: combined
Words combinedlongwords


In [None]:
# For each sentence of the second sample, print a header followed by one
# "id / text" line per token.
for i, sentence in enumerate(doc2.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

id: (1,)	text: such
id: (2,)	text: a
id: (3,)	text: long
id: (4,)	text: sentence
id: (5,)	text: that
id: (6,)	text: it
id: (7,)	text: proba-bly
id: (8,)	text: wo
id: (9,)	text: n't
id: (10,)	text: fit
id: (11,)	text: in
id: (12,)	text: a
id: (13,)	text: single
id: (14,)	text: line
id: (15,)	text: ,
id: (16,)	text: but
id: (17,)	text: here
id: (18,)	text: 's
id: (19,)	text: an
id: (20,)	text: intra-word
id: (21,)	text: hyphen
id: (22,)	text: too
id: (23,)	text: !
id: (1,)	text: These
id: (2,)	text: words
id: (3,)	text: will
id: (4,)	text: not
id: (5,)	text: be
id: (6,)	text: seperated
id: (7,)	text: :
id: (8,)	text: combined
id: (1,)	text: Words
id: (2,)	text: combinedlongwords


In [None]:
# For each sentence of the third sample, print a header followed by one
# "id / text" line per token.
for i, sentence in enumerate(doc3.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

id: (1,)	text: Hiii
id: (2,)	text: !
id: (3,)	text: :
id: (4,)	text: D
id: (5,)	text: how
id: (6,)	text: are
id: (7,)	text: y'all
id: (8,)	text: doin
id: (9,)	text: '
id: (10,)	text: 😀.
id: (11,)	text: #
id: (12,)	text: Daily
id: (1,)	text: Tweet
id: (2,)	text: @
id: (3,)	text: Daily
id: (1,)	text: Tweeter
id: (2,)	text: full-stops
id: (3,)	text: ....
id: (4,)	text: and
id: (5,)	text: a
id: (6,)	text: link
id: (7,)	text: !!?
id: (1,)	text: https://imgur.com/gallery/y8u0gg8.
id: (1,)	text: In
id: (2,)	text: comparision
id: (3,)	text: to
id: (4,)	text: :
id: (5,)	text: these
id: (6,)	text: f123@gmail.com
id: (7,)	text: a
id: (8,)	text: word
id: (9,)	text: i
id: (10,)	text: made
id: (11,)	text: up
id: (12,)	text: is
id: (13,)	text: ran
id: (14,)	text: 'om
id: (15,)	text: cuz
id: (16,)	text: i
id: (17,)	text: cant
id: (18,)	text: spel


In [None]:
# Hindi Stanza pipeline restricted to the tokenize processor.
nlpHi = stanza.Pipeline(lang='hi', processors='tokenize')
doc = nlpHi(text_hi)

# Per sentence: a header line, then one "id / text" line per token.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

2021-09-13 05:44:52 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

2021-09-13 05:44:52 INFO: Use device: cpu
2021-09-13 05:44:52 INFO: Loading: tokenize
2021-09-13 05:44:52 INFO: Done loading processors!


id: (1,)	text: राम-श्याम
id: (2,)	text: बहुत
id: (3,)	text: ही
id: (4,)	text: सामान्य
id: (5,)	text: नाम
id: (6,)	text: हैं
id: (7,)	text: ।
id: (1,)	text: शनैः
id: (2,)	text: शनैः
id: (3,)	text: दिन
id: (4,)	text: ढल
id: (5,)	text: गया
id: (6,)	text: ।
id: (1,)	text: अतः
id: (2,)	text: यह
id: (3,)	text: सिद्ध
id: (4,)	text: होता
id: (5,)	text: है
id: (6,)	text: कि
id: (7,)	text: सूर्य
id: (8,)	text: पूर्व
id: (9,)	text: निकलता
id: (10,)	text: है
id: (11,)	text: ।


In [None]:
# Japanese Stanza pipeline restricted to the tokenize processor.
nlpJa = stanza.Pipeline(lang='ja', processors='tokenize')
doc = nlpJa('これは小さな文章です。これは別の小さな文です.')

# Per sentence: a header line, then one "id / text" line per token.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

2021-09-13 05:48:11 INFO: Loading these models for language: ja (Japanese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |

2021-09-13 05:48:11 INFO: Use device: cpu
2021-09-13 05:48:11 INFO: Loading: tokenize
2021-09-13 05:48:11 INFO: Done loading processors!


id: (1,)	text: これ
id: (2,)	text: は
id: (3,)	text: 小さな
id: (4,)	text: 文章
id: (5,)	text: です
id: (6,)	text: 。
id: (1,)	text: これ
id: (2,)	text: は
id: (3,)	text: 別
id: (4,)	text: の
id: (5,)	text: 小さな
id: (6,)	text: 文
id: (7,)	text: です
id: (8,)	text: .


In [None]:
# Korean Stanza pipeline restricted to the tokenize processor.
nlpKo = stanza.Pipeline(lang='ko', processors='tokenize')
doc = nlpKo('이것은 문장입니다. 이것은 다른 문장입니다.')

# Per sentence: a header line, then one "id / text" line per token.
# Note: 입니다 can be further tokenized.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

2021-09-13 12:54:11 INFO: Loading these models for language: ko (Korean):
| Processor | Package |
-----------------------
| tokenize  | kaist   |

2021-09-13 12:54:11 INFO: Use device: cpu
2021-09-13 12:54:11 INFO: Loading: tokenize
2021-09-13 12:54:11 INFO: Done loading processors!


id: (1,)	text: 이것은
id: (2,)	text: 문장입니다
id: (3,)	text: .
id: (1,)	text: 이것은
id: (2,)	text: 다른
id: (3,)	text: 문장입니다
id: (4,)	text: .


# Maximum matching algorithm



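1. Start at the first character of the given string.
2. Search the word list for the longest word that begins at the current position.
3. *If* a match is found, mark a word boundary after it. *Else*, treat the single character as a word.
4. Move to the character right after that boundary and repeat from step 2 until the whole string is consumed.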




In [None]:
from random import sample
# Fetch NLTK's "words" corpus; relies on `nltk` having been imported in an earlier cell.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
from nltk.corpus import words

# Lower-cased copy of NLTK's word list, used as the dictionary for maximum matching.
lowercaseCorpus = list(map(str.lower, words.words()))

# Corpus size, followed by ten randomly drawn entries as a sanity check.
print(len(lowercaseCorpus))
print(sample(lowercaseCorpus, 10))

236736
['spirula', 'waitress', 'rabbinate', 'phasotropy', 'dorsabdominal', 'undecipher', 'upwall', 'undone', 'valvulotome', 'demipronation']


In [None]:
def maxMatch(bigword, vocabulary=None):
    """Segment ``bigword`` with greedy maximum matching and print the tokens.

    Starting at the first character, the longest vocabulary entry that begins
    at the current position becomes the next token. When no entry matches,
    the single character at that position is emitted as a token of its own.
    Scanning then resumes right after the token until the string is consumed.

    Args:
        bigword: String without word separators that should be segmented.
        vocabulary: Optional iterable of known words. Defaults to the
            notebook-level ``lowercaseCorpus``.

    Returns:
        None. The list of tokens is printed to stdout.
    """
    if vocabulary is None:
        vocabulary = lowercaseCorpus
    # Hash-based membership tests replace a linear scan of the whole word
    # list for every candidate substring.
    if isinstance(vocabulary, (set, frozenset)):
        known_words = vocabulary
    else:
        known_words = set(vocabulary)

    tokens = []
    i = 0
    while i < len(bigword):
        # Fallback: an unmatched character is its own token. This also
        # guarantees progress; previously an empty match left ``i`` unchanged
        # and the loop never terminated.
        maxWord = bigword[i]
        # Try the longest candidate first so the first hit is the maximum match.
        for end in range(len(bigword), i, -1):
            candidate = bigword[i:end]
            if candidate in known_words:
                maxWord = candidate
                break
        tokens.append(maxWord)
        i += len(maxWord)

    print(tokens)

In [None]:
# Greedy matching prefers the longest word, so "forthe" becomes 'forth' + ... instead of "for" + "the".
maxMatch("combinedwordhereforthealgorithm")

['combined', 'word', 'here', 'forth', 'ea', 'l', 'gor', 'it', 'h', 'm']


# Hugging Face
BERT WordPiece

In [None]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3


In [None]:
from tokenizers import BertWordPieceTokenizer
# Download the bert-base-uncased vocabulary file into the working directory.
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt

--2021-09-12 10:00:55--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.78.126
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.78.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘bert-base-uncased-vocab.txt’


2021-09-12 10:00:55 (3.96 MB/s) - ‘bert-base-uncased-vocab.txt’ saved [231508/231508]



In [None]:
# Build a WordPiece tokenizer from the downloaded vocabulary file, with lowercasing enabled.
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

In [None]:
# Encode the three English samples as one batch and show the tokens of the third one (text3).
output = tokenizer.encode_batch([text, text2, text3])
print(output[2].tokens)

['[CLS]', 'hi', '##ii', '!', ':', 'd', 'how', 'are', 'y', "'", 'all', 'doin', "'", '[UNK]', '.', '#', 'daily', '##t', '##wee', '##t', '@', 'daily', '##t', '##wee', '##ter', 'full', '-', 'stops', '.', '.', '.', '.', 'and', 'a', 'link', '!', '!', 'https', ':', '/', '/', 'im', '##gur', '.', 'com', '/', 'gallery', '/', 'y', '##8', '##u', '##0', '##gg', '##8', '.', 'in', 'com', '##par', '##ision', 'to', ':', 'these', 'f1', '##23', '@', 'gma', '##il', '.', 'com', 'a', 'word', 'i', 'made', 'up', 'is', 'ran', "'", 'om', 'cu', '##z', 'i', 'can', '##t', 'sp', '##el', '[SEP]']


In [None]:
# Encode a single concatenated word to see how WordPiece breaks it into sub-word pieces.
output = tokenizer.encode_batch(["averybigcombinedwordforthealgorithm"])
print(output[0].tokens)

['[CLS]', 'avery', '##bi', '##gc', '##om', '##bine', '##d', '##word', '##forth', '##eal', '##gor', '##ith', '##m', '[SEP]']


What should you do if no tokenizer module exists for your language, i.e. it is a low-resource language?
- Train your own tokenizer; Hugging Face makes it very easy:
https://huggingface.co/blog/how-to-train