__Tokenization__

Tokenization is the process of breaking text down into individual words or phrases, called tokens, by splitting it on specific delimiters such as spaces or punctuation marks. It is a fundamental step in natural language processing (NLP) and text analysis. The resulting tokens can then be used for further analysis, such as counting word frequencies or as input to machine learning algorithms.

NLTK

TextBlob

CoreNLP

Gensim

SpaCy

Polyglot

Scikit Learn

Pattern


In [2]:
# Tokenize by splitting on runs of non-word characters (anything other than
# letters, digits and underscore) -- not just on whitespace. Apostrophes and
# periods therefore act as separators, and the trailing '.!' leaves an empty
# string at the end of the list.
import re
text = 'I\'m with you for the entire life in U.K.!'
non_word_run = re.compile(r'\W+')
words = non_word_run.split(text)
# Slicing never raises, so [:100] simply caps the output at 100 tokens
print(words[:100])

['I', 'm', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U', 'K', '']


In [3]:
# Break the text on runs of non-word characters and show the resulting tokens.
# NOTE(review): this repeats the split from the previous cell, so the trailing
# empty string is still present; re.findall(r'\w+', text) would select only
# the words themselves.
words = re.split(pattern=r'\W+', string=text)
print(words[:100])

['I', 'm', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U', 'K', '']


In [4]:
import string
import re
# Tokenize on whitespace only, so punctuation stays attached to each word
words = text.split()
# Build a character class that matches any single punctuation character;
# re.escape keeps characters such as ']' and '\' from breaking the class
punctuation_chars = re.escape(string.punctuation)
re_punc = re.compile('[%s]' % punctuation_chars)
# Delete every punctuation character found inside each word
stripped = [re_punc.sub('', token) for token in words]
print(stripped[:100])

['Im', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'UK']


In [5]:
# Remove every character that is NOT in string.printable. Punctuation is
# itself printable, so it is kept -- this filter only drops non-printable
# characters, it is not a punctuation remover.
printable_chars = re.escape(string.printable)
re_print = re.compile('[^%s]' % printable_chars)
result = [re_print.sub('', token) for token in words]
print(result)

["I'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U.K.!']


In [6]:
# Normalizing case

# Split on whitespace and lower-case every token in a single pass
words = list(map(str.lower, text.split()))
print(words[:100])

["i'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'u.k.!']


### spaCy

In [15]:
#pip install -U spacy

In [13]:
import spacy

In [33]:
# Load the small English pipeline; the model package must already be installed
# (the commented-out `python -m spacy download en_core_web_sm` cell does that)
nlp = spacy.load('en_core_web_sm')

In [34]:
# Sample sentence wrapped in literal double quotes, so the quote characters
# are part of the text that gets tokenized.
# NOTE(review): the name `string` rebinds (shadows) the `string` module that
# is imported earlier in this file; re-running the earlier cells that use
# string.punctuation / string.printable after this one will fail. Consider
# renaming this variable together with its use in nlp(string).
string='"I\'m with you for the entire life in P.K.!"'
print(string)

"I'm with you for the entire life in P.K.!"


In [32]:
# !python -m spacy download en_core_web_sm


In [35]:
# Run the pipeline on the sample sentence and print all tokens on one line,
# each followed by ' | '
doc = nlp(string)
for token in doc:
    print(f'{token.text} | ', end='')

" | I | 'm | with | you | for | the | entire | life | in | P.K. | ! | " | 

In [39]:
# Contractions, punctuation and the hyphen become separate tokens, while the
# e-mail address is kept as a single token (see the output below)
doc2 = nlp(u"We're here to help! Send snail-mail, email fahad@gmail.com or visit us at link")
for t in doc2:
    print(t)

We
're
here
to
help
!
Send
snail
-
mail
,
email
fahad@gmail.com
or
visit
us
at
link


In [40]:
# The distance '5km' is split into number and unit, and the currency symbol
# is separated from the amount, which stays whole as '10.30'
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t in doc3:
    print(t)

A
5
km
NYC
cab
ride
costs
$
10.30


In [41]:
# Abbreviations such as 'St.' and 'U.S.' keep their periods, while the
# sentence-final period becomes its own token
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for t in doc4:
    print(t)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [42]:
# Number of tokens in the doc (the quote characters and '!' count as tokens)
len(doc)

13

In [43]:
# Size of the vocabulary object attached to the doc
len(doc.vocab)

797

In [44]:
doc5 = nlp(u'It is better to give than to receive.')
# Retrieve the third token (indexing is zero-based, so index 2):
doc5[2]

better

In [45]:
# Retrieve three tokens from the middle (indices 2, 3 and 4) with a slice:
doc5[2:5]

better to give

In [46]:
# Negative indices count from the end: this is the fourth token from the end
doc5[-4]

than

In [47]:
# Two short docs that differ in their first and fourth tokens; they are used
# by the item-assignment experiment in the next cell
doc6=nlp(u'My dinner was horrible.')
doc7=nlp(u'Your dinner was delicious.')

In [48]:
# Try to change 'My dinner was horrible' to 'My dinner was delicious'.
# This is expected to fail: Doc objects do not support item assignment,
# so the line below raises TypeError (intentional demonstration).
doc6[3]=doc7[3]

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [50]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

# Show the individual tokens on a single line, each followed by ' | '
for token in doc8:
    print(f'{token.text} | ', end='')

print('\n----')

# List every named entity with its label and the explanation of that label
for ent in doc8.ents:
    description = str(spacy.explain(ent.label_))
    print(' - '.join((ent.text, ent.label_, description)))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [51]:
# Number of named entities found in doc8
len(doc8.ents)

3

In [52]:
# Iterate over the noun chunks of the doc and print the text of each one
doc9 = nlp(u'Apple to build a Hong Kong factory for $6 million')

for chunk in doc9.noun_chunks:
    print(chunk.text)

Apple
a Hong Kong factory


In [53]:
from spacy import displacy

# Render the dependency visualization ('dep' style) inline in the notebook;
# 'distance' is passed through displaCy's options dict
# (presumably the spacing between words -- confirm against the displaCy docs)
doc10 = nlp(u'Apple to build a Hong Kong factory for $6 million')
displacy.render(doc10, style='dep', jupyter=True, options={'distance':110})

In [56]:
# Render the named-entity visualization ('ent' style) inline in the notebook
doc11 = nlp(u'Apple to build a Hong Kong factory for $6 million')
displacy.render(doc11, style='ent', jupyter=True)

In [None]:
# Start displaCy's web server for the dependency visualization; the output
# below shows it serving on http://0.0.0.0:5000.
# NOTE(review): this rebinds `doc`, replacing the Doc built from the sample
# sentence earlier in the file.
doc = nlp(u'This is a sentence.')
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

