### [YouTube tutorial by Bhavesh Bhatt](https://www.youtube.com/watch?v=1W-sWmFQPZY)
# BERT tutorial

In [None]:
# installing one nlp library
!pip install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/f0/3a/1b46a0220d6176b22bcb9336619d1731301bc2c75fa926a9ef953e6e4d58/flair-0.8.0.post1-py3-none-any.whl (284kB)
[K     |█▏                              | 10kB 14.6MB/s eta 0:00:01[K     |██▎                             | 20kB 20.7MB/s eta 0:00:01[K     |███▌                            | 30kB 10.7MB/s eta 0:00:01[K     |████▋                           | 40kB 8.3MB/s eta 0:00:01[K     |█████▊                          | 51kB 5.3MB/s eta 0:00:01[K     |███████                         | 61kB 5.5MB/s eta 0:00:01[K     |████████                        | 71kB 5.7MB/s eta 0:00:01[K     |█████████▏                      | 81kB 6.1MB/s eta 0:00:01[K     |██████████▍                     | 92kB 6.0MB/s eta 0:00:01[K     |███████████▌                    | 102kB 6.5MB/s eta 0:00:01[K     |████████████▋                   | 112kB 6.5MB/s eta 0:00:01[K     |█████████████▉                  | 122kB 6.5MB/s et

In [None]:
# importing required libraries
import numpy as np
from flair.embeddings import WordEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence
from scipy.spatial import distance


In [None]:
# Load the GloVe word embeddings through flair.
# GloVe is context independent: it holds one vector per word, so a word gets
# the same numerical representation in every sentence, even when it occurs
# with different meanings.
glove_embedding = WordEmbeddings('glove')

In [None]:
# First example sentence, containing the word 'oneplus'
sent_1 = Sentence("oneplus released oneplus x pro in 2021")

# Embed the sentence with the context-independent GloVe model; afterwards each
# token exposes its vector through `token.embedding`
glove_embedding.embed(sent_1)

[Sentence: "oneplus released oneplus x pro in 2021"   [− Tokens: 7]]

In [None]:
# Show every token of the first sentence followed by its GloVe vector,
# with blank lines separating consecutive tokens
for token in sent_1:
    print(token, token.embedding, "\n", sep="\n")

Token: 1 oneplus
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


Token: 2 released
tensor([-0.0451, -0.7260,  0.6271, -0.8506,  1.2913,  0.1556,  0.5372,  0.2967,
         0.0272, -0.2156,  0.7911,  0.1594,  0.5834,  0.1422,  0.2127,  0.5273,
         0.8467, -0.2552, -0.2266,  0.0687,  0.8454, -1.0389,  0.0572,  0.7512,
        -0.4233,  0.3479,  0.3719, -0.0245,  0.4516,  1.0017,  0.7148,  0.0396,
        -0.7914,  1.1237,  0.0218,  0.5682, -0.1066,  0.4893, -0.1981, -0.9888,
        -0.2954,  0.2199, -0.1736,  1.0455,  0.3021, -0.5135, -0.4400, -0.8309,
         0.4581, -0.4946,  0.3384, -0.4309

In [None]:
# Dimensionality of a single GloVe word vector (inspected on the first token)
sent_1[0].embedding.shape

torch.Size([100])

In [None]:
# Second sentence: the same word 'oneplus' (once lowercase, once capitalised)
# placed in a different context
sent_2 = Sentence("oneplus Oneplus one is three")

# Embed the second sentence with the same GloVe model
glove_embedding.embed(sent_2)


[Sentence: "oneplus Oneplus one is three"   [− Tokens: 5]]

In [None]:
# Euclidean distance between the GloVe vectors of 'oneplus' and 'Oneplus'.
# NOTE: a distance of 0.0 here does not show that GloVe treats the two
# spellings as the same word. Both tokens print as all-zero vectors in this
# notebook, which suggests neither is in the GloVe vocabulary (presumably flair
# falls back to a zero vector for unknown words -- confirm), so the distance is
# trivially zero.
# .detach().cpu().numpy() is used instead of np.array(tensor) so that the
# conversion also works when the embeddings are stored on a GPU.
distance.euclidean(
    sent_2[0].embedding.detach().cpu().numpy(),
    sent_2[1].embedding.detach().cpu().numpy(),
)

0.0

In [None]:


# Show every token of the second sentence followed by its GloVe vector,
# with blank lines separating consecutive tokens
for token in sent_2:
    print(token, token.embedding, "\n", sep="\n")


Token: 1 oneplus
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


Token: 2 Oneplus
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])


Token: 3 one
tensor([-0.2256,  0.4942,  0.4861, -0.4332,  0.1374,  0.5062,  0.26

In [None]:
# Dimensionality of a GloVe word vector in the second sentence (first token)
sent_2[0].embedding.shape

torch.Size([100])

In [None]:
# Compare the GloVe vectors of the word 'oneplus' taken from two different
# sentences, i.e. from two different contexts.
# .detach().cpu().numpy() replaces np.array(tensor) so that the conversion
# also works when the embeddings are stored on a GPU.
glove_dst = distance.euclidean(
    sent_1[0].embedding.detach().cpu().numpy(),
    sent_2[0].embedding.detach().cpu().numpy(),
)
print(glove_dst)

# With a static embedding such as GloVe, the vector of a word does not change
# with its context, so the distance is 0.
# Caveat: 'oneplus' prints as an all-zero vector in both sentences (apparently
# it is not in the GloVe vocabulary -- confirm), so this particular zero is
# trivial; a word with a non-zero GloVe vector would make the point more
# convincingly.

0.0


# BERT Embeddings

In [None]:


# Load the multilingual, cased BERT base model as a flair word embedding
# (creating it downloads the model files when they are not available locally)
bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

# Embed the first sentence with BERT, then show each token with its vector
bert_embedding.embed(sent_1)
for token in sent_1:
    print(token, token.embedding, sep="\n")



In [None]:
# Dimensionality of a token vector of the first sentence after the BERT step
sent_1[0].embedding.shape

# The vector is larger than the 100-dimensional GloVe vector seen earlier.
# NOTE(review): the reported size is 868 rather than the 768 hidden units of a
# BERT base model; presumably flair concatenates the GloVe vector already
# stored on the token (100) with the BERT vector (768) -- confirm, and call
# sent_1.clear_embeddings() before the BERT step if only BERT is wanted.

torch.Size([868])

In [None]:
# Embed the second sentence with BERT as well
bert_embedding.embed(sent_2)

# Show each token of the second sentence with its vector
for token in sent_2:
    print(token, token.embedding, sep="\n")



In [None]:
# Dimensionality of a token vector of the second sentence after the BERT step
sent_2[0].embedding.shape


torch.Size([868])

In [None]:
# Compare the vectors of 'oneplus' from the two sentences (two different
# contexts) after the BERT embedding step.
# .detach().cpu().numpy() replaces np.array(tensor) so that the conversion
# also works when the embeddings are stored on a GPU.
bert_dst = distance.euclidean(
    sent_1[0].embedding.detach().cpu().numpy(),
    sent_2[0].embedding.detach().cpu().numpy(),
)
print(bert_dst)

# The distance is non-zero: unlike the static GloVe vectors, the BERT vector
# of a word depends on the sentence it appears in, i.e. BERT captures context.

7.014278411865234
