# BPE - Subword Tokenization

In [1]:
from transformers import BertTokenizer, GPT2Tokenizer, AutoTokenizer

In [5]:
# Load pretrained tokenizers, one per model family, so the same text can be
# split by each of them and the results compared side by side.

# BERT: multilingual, uncased checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased") 

# GPT-2: the base "gpt2" checkpoint.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# mBART-50: loaded through AutoTokenizer instead of a model-specific class.
auto_tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

In [6]:
# First example sentence: short, and made only of common English words.
text1 = 'John is learning deep learning'

In [7]:
# Feed the same sentence to every tokenizer and print the pieces each one produces.
for heading, tok in (
    ('\n BERT Tokenizer: ', bert_tokenizer),
    ('\n GPT2 Tokenizer: ', gpt2_tokenizer),
    ('\n Facebook Auto Tokenizer: ', auto_tokenizer),
):
    print(heading, tok.tokenize(text1))


 BERT Tokenizer:  ['john', 'is', 'learning', 'deep', 'learning']

 GPT2 Tokenizer:  ['John', 'Ġis', 'Ġlearning', 'Ġdeep', 'Ġlearning']

 Facebook Auto Tokenizer:  ['▁John', '▁is', '▁learning', '▁deep', '▁learning']


In [8]:
# Second example: contains a long, uncommon word form.
# NOTE(review): 'unbeliveably' is misspelled; the recorded output below was
# produced with this exact spelling, so the string is left untouched.
text2 = 'John is performing unbeliveably great'

# Same comparison as before: one line of output per tokenizer.
for heading, tok in (
    ('\n BERT Tokenizer: ', bert_tokenizer),
    ('\n GPT2 Tokenizer: ', gpt2_tokenizer),
    ('\n Facebook Auto Tokenizer: ', auto_tokenizer),
):
    print(heading, tok.tokenize(text2))


 BERT Tokenizer:  ['john', 'is', 'performing', 'un', '##beli', '##ve', '##ably', 'great']

 GPT2 Tokenizer:  ['John', 'Ġis', 'Ġperforming', 'Ġunbel', 'ive', 'ably', 'Ġgreat']

 Facebook Auto Tokenizer:  ['▁John', '▁is', '▁perform', 'ing', '▁un', 'beli', 've', 'ably', '▁great']


In [9]:
# Third example: ends in a word built from several affixes (un-friend-li-ness).
text3 = 'John is really unfriendliness'

# Print how each tokenizer breaks the sentence apart.
for heading, tok in (
    ('\n BERT Tokenizer: ', bert_tokenizer),
    ('\n GPT2 Tokenizer: ', gpt2_tokenizer),
    ('\n Facebook Auto Tokenizer: ', auto_tokenizer),
):
    print(heading, tok.tokenize(text3))


 BERT Tokenizer:  ['john', 'is', 'really', 'un', '##fr', '##iend', '##lines', '##s']

 GPT2 Tokenizer:  ['John', 'Ġis', 'Ġreally', 'Ġunf', 'riend', 'liness']

 Facebook Auto Tokenizer:  ['▁John', '▁is', '▁really', '▁un', 'friend', 'lines', 's']


In [10]:
# Fourth example: punctuation, digits, symbols ($, %, &) and emoji in a single string.
# The emoji are written as real Unicode characters. The previous literal held
# mis-decoded bytes (mojibake) in their place, which would tokenize as accented
# Latin letters and would not reproduce the recorded output below
# ([UNK] for BERT, byte-level pieces for GPT-2, one token per emoji for mBART).
text4 = "Hello, this is a test sentence for tokenization. 👋🚀🔥 This includes numbers 123, symbols like $, %, & and some punctuation: !? 😂😍"

# One line of output per tokenizer, same layout as the earlier examples.
print('\n BERT Tokenizer: ', bert_tokenizer.tokenize(text4))
print('\n GPT2 Tokenizer: ', gpt2_tokenizer.tokenize(text4))
print('\n Facebook Auto Tokenizer: ', auto_tokenizer.tokenize(text4))


 BERT Tokenizer:  ['hello', ',', 'this', 'is', 'a', 'test', 'sentence', 'for', 'tok', '##eni', '##zation', '.', '[UNK]', 'this', 'includes', 'numbers', '123', ',', 'symbols', 'like', '$', ',', '%', ',', '&', 'and', 'some', 'punct', '##uat', '##ion', ':', '!', '?', '[UNK]']

 GPT2 Tokenizer:  ['Hello', ',', 'Ġthis', 'Ġis', 'Ġa', 'Ġtest', 'Ġsentence', 'Ġfor', 'Ġtoken', 'ization', '.', 'ĠðŁĳ', 'ĭ', 'ðŁ', 'ļ', 'Ģ', 'ðŁ', 'Ķ', '¥', 'ĠThis', 'Ġincludes', 'Ġnumbers', 'Ġ123', ',', 'Ġsymbols', 'Ġlike', 'Ġ$', ',', 'Ġ%', ',', 'Ġ&', 'Ġand', 'Ġsome', 'Ġpunct', 'uation', ':', 'Ġ!', '?', 'ĠðŁĺ', 'Ĥ', 'ðŁĺ', 'į']

 Facebook Auto Tokenizer:  ['▁Hello', ',', '▁this', '▁is', '▁a', '▁test', '▁sentence', '▁for', '▁to', 'ken', 'ization', '.', '▁', '👋', '🚀', '🔥', '▁This', '▁includes', '▁numbers', '▁123', ',', '▁symbol', 's', '▁like', '▁$', ',', '▁%', ',', '▁&', '▁and', '▁some', '▁punct', 'u', 'ation', ':', '▁!', '

# Sentence Embeddings

In [12]:
from sentence_transformers import SentenceTransformer

In [13]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model; device='cpu' requests CPU inference.
stransform = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

In [14]:
# Encode a single sentence; as the cell's last expression, the returned
# embedding array is displayed in full.
stransform.encode('Patient is identified with fever')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


array([-8.68164562e-03,  9.12093222e-02, -4.23275568e-02, -2.38211993e-02,
       -1.75869409e-02, -2.46057678e-02,  7.69438446e-02,  9.65186954e-02,
        1.32826250e-02, -2.06914861e-02, -6.97785057e-03, -3.99118103e-02,
       -6.41570520e-03,  9.54691470e-02, -4.18971032e-02, -5.64137027e-02,
        9.17165074e-03, -5.11935838e-02,  3.48759554e-02, -4.78587262e-02,
       -1.25137612e-01,  1.60899181e-02, -9.93399881e-03, -1.46383075e-02,
       -2.18604710e-02,  4.95761782e-02,  2.74591967e-02,  1.89229753e-02,
        3.04538701e-02,  6.84915036e-02, -3.57959955e-03, -1.59377616e-03,
       -5.05185090e-02,  3.18868645e-02,  4.80021797e-02, -9.70388763e-03,
       -4.48482893e-02,  2.84437928e-02, -6.40439540e-02,  6.92216456e-02,
        1.60486661e-02, -1.35819579e-03,  4.77574021e-02,  1.09385243e-02,
        2.68966500e-02,  1.49014797e-02, -5.92294559e-02,  1.23006068e-01,
        6.56068251e-02, -2.49849092e-02, -5.29667959e-02, -2.64070239e-02,
       -1.83166340e-02,  

In [15]:
# Embed two related clinical sentences so their vectors can be compared below.
s1 = stransform.encode('Patient is identified with fever')
s2 = stransform.encode('The patient was prescribed paracetamol for fever')

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# cosine_similarity works on 2-D inputs, so each 1-D embedding is lifted to a
# single-row matrix; the result is a 1x1 matrix whose only entry is the score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])
                               

Cosine Similarity:  0.69454354


In [18]:
# Same subject, two near-synonymous adjectives.
s1 = stransform.encode('John is Fantastic')
s2 = stransform.encode('John is Excellent')

# Lift each vector to a single-row matrix for cosine_similarity, then read the lone score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])

Cosine Similarity:  0.9153314


In [21]:
# Two single-word inputs with opposite meaning.
s1 = stransform.encode('Good')
s2 = stransform.encode('Bad')

# Lift each vector to a single-row matrix for cosine_similarity, then read the lone score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])

Cosine Similarity:  0.5871496


In [22]:
# The same opposite adjectives, now embedded inside otherwise identical sentences.
s1 = stransform.encode('John is Good')
s2 = stransform.encode('John is Bad')

# Lift each vector to a single-row matrix for cosine_similarity, then read the lone score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])

Cosine Similarity:  0.84245145


In [29]:
# A lowercase single word against a two-word phrase of opposite sentiment.
s1 = stransform.encode('good')
s2 = stransform.encode('very bad')

# Lift each vector to a single-row matrix for cosine_similarity, then read the lone score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])

Cosine Similarity:  0.52174175


In [26]:
# Re-create the 'good' / 'very bad' pair so the following cells can inspect the
# raw vectors step by step.
# The first input used to read 'Jgood' (a typo). The similarity recorded further
# down (0.52174175) is the 'good' vs 'very bad' value, so 'good' is what makes a
# top-to-bottom run agree with the saved outputs.
s1 = stransform.encode('good')
s2 = stransform.encode('very bad')

# Last expression: the shapes of the two 1-D embedding vectors.
s1.shape, s2.shape

((384,), (384,))

In [28]:
# reshape(1, -1) turns each 1-D vector into a single-row 2-D array, the layout
# later passed to cosine_similarity; display the resulting shapes.
s1.reshape(1, -1).shape, s2.reshape(1, -1).shape

((1, 384), (1, 384))

In [30]:
# Compare the two embeddings as single-row matrices and display the raw result,
# which comes back as a 2-D array rather than a bare number.
similarity = cosine_similarity(s1[None, :], s2[None, :])
similarity

array([[0.52174175]], dtype=float32)

In [31]:
# First (and only) row of the similarity matrix: still an array, one element long.
similarity[0]

array([0.52174175], dtype=float32)

In [35]:
# Index row 0, column 0 to get the similarity as a plain scalar.
# NOTE(review): the saved output of this cell (0.99999994) does not match the
# value shown for `similarity[0]` just above (0.52174175). This cell carries
# execution count 35, so it was last run after the identical-sentence cell
# (count 33) had overwritten `similarity`. Restart the kernel and run all cells
# to refresh the stored output.
similarity[0][0]

0.99999994

In [33]:
# Sanity check: encode the exact same sentence twice; the score should be ~1.
s1 = stransform.encode('Ranjith is excellent')
s2 = stransform.encode('Ranjith is excellent')

# Lift each vector to a single-row matrix for cosine_similarity, then read the lone score.
similarity = cosine_similarity(s1[None, :], s2[None, :])
print('Cosine Similarity: ', similarity[0, 0])

Cosine Similarity:  0.99999994
