In [1]:
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
# WordPiece tokenizer for the lowercase ("uncased") base BERT checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence under study: the word "bank" occurs three times in different contexts.
# Alternative sentences tried earlier (kept for reference):
#txt = "I need to visit the bank tomorrow; after that, we'll set up a tent by the river bank, just across from the bank where my friend works."
#txt = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
txt = "I need to visit the financial bank tomorrow; after that, we'll set up a tent by the river bank, just across from the bank building where my friend works."

# Surround the sentence with BERT's start-of-input and separator markers.
wrangled_txt = f'[CLS] {txt} [SEP]'

# Break the marked-up sentence into tokens and show them.
tokenized_txt = tokenizer.tokenize(wrangled_txt)
print(tokenized_txt)

['[CLS]', 'i', 'need', 'to', 'visit', 'the', 'financial', 'bank', 'tomorrow', ';', 'after', 'that', ',', 'we', "'", 'll', 'set', 'up', 'a', 'tent', 'by', 'the', 'river', 'bank', ',', 'just', 'across', 'from', 'the', 'bank', 'building', 'where', 'my', 'friend', 'works', '.', '[SEP]']


In [3]:
# Look up the vocabulary id of every token.
ids_tokens = tokenizer.convert_tokens_to_ids(tokenized_txt)

# Two-column listing: token left-aligned, id right-aligned with a thousands separator.
for token, token_id in zip(tokenized_txt, ids_tokens):
    print(f'{token:<12} {token_id:>8,}')

[CLS]             101
i               1,045
need            2,342
to              2,000
visit           3,942
the             1,996
financial       3,361
bank            2,924
tomorrow        4,826
;               1,025
after           2,044
that            2,008
,               1,010
we              2,057
'               1,005
ll              2,222
set             2,275
up              2,039
a               1,037
tent            9,311
by              2,011
the             1,996
river           2,314
bank            2,924
,               1,010
just            2,074
across          2,408
from            2,013
the             1,996
bank            2,924
building        2,311
where           2,073
my              2,026
friend          2,767
works           2,573
.               1,012
[SEP]             102


In [4]:
# One segment id per token; every token gets the value 1.
# NOTE(review): Hugging Face tokenizers emit token_type_ids of 0 for a single
# sentence -- confirm that all-ones is what is wanted here.
n_tokens = len(tokenized_txt)
segments_ids = [1 for _ in range(n_tokens)]

# Wrap each list in an outer list so the resulting tensors carry a leading
# batch dimension of size 1.
token_tensor = torch.tensor([ids_tokens])
segment_tensor = torch.tensor([segments_ids])

In [5]:
# Load the pre-trained weights.
#   output_hidden_states=True -> the forward pass also returns every layer's hidden states
#   return_dict=False         -> the forward pass returns a plain tuple
bert_options = dict(output_hidden_states=True, return_dict=False)
model = BertModel.from_pretrained('bert-base-uncased', **bert_options)

# Switch to evaluation mode (inference only); the call returns the model, so
# leaving it as the last expression also displays the architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [14]:
# API reference: https://huggingface.co/docs/transformers/model_doc/bert#bertmodel
# Inputs have shape (batch_size, sequence_length).
#
# Bind the inputs by keyword: the second positional parameter of
# BertModel.forward is `attention_mask`, not `token_type_ids`, so a positional
# call silently puts the tensor in the mask slot. `segment_tensor` is built as
# all ones, which as an attention mask means "attend to every token";
# `token_type_ids` is left at the model's default.
with torch.no_grad():  # inference only, no gradients needed
    outputs = model(input_ids=token_tensor, attention_mask=segment_tensor)

# The model was loaded with return_dict=False and output_hidden_states=True,
# so `outputs` is a tuple and index 2 holds the per-layer hidden states.
hidden_states = outputs[2]

In [15]:
# `hidden_states` holds one tensor per layer; entry 0 is the initial embeddings.
# Inspect the nesting by looking at the first layer / batch item / token.
layer_ptr = 0
batch_ptr = 0
token_ptr = 0

n_batches, n_tokens_in_batch, n_hidden_units = hidden_states[layer_ptr].shape

print("Number of layers:", len(hidden_states))
print("Number of batches:", n_batches)
print("Number of tokens:", n_tokens_in_batch)
print("Number of hidden units:", n_hidden_units)

Number of layers: 13
Number of batches: 1
Number of tokens: 37
Number of hidden units: 768


In [16]:
# Stack the per-layer tensors along a new leading "layer" axis, then drop the
# size-1 batch axis, leaving (layers, tokens, hidden units).
token_embeddings = torch.stack(hidden_states, dim=0).squeeze(dim=1)
print(token_embeddings.shape)

torch.Size([13, 37, 768])


In [17]:
# Exchange the layer and token axes so that indexing by token yields that
# token's hidden state from every layer.
# Caution: this reassigns the same name, so running the cell a second time
# swaps the axes back again.
token_embeddings = token_embeddings.transpose(0, 1)

token_embeddings.shape

torch.Size([37, 13, 768])

In [18]:
# Represent each token by the sum of its hidden states from the last four layers.
# `token_embeddings` is (tokens x layers x hidden units) -- 37 x 13 x 768 for this
# sentence -- so iterating over it yields one (13 x 768) slab per token.
token_vectors_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print('Shape is: %d x %d' % (len(token_vectors_sum), len(token_vectors_sum[0])))


Shape is: 37 x 768


In [19]:
# List every token next to its position so individual tokens can be picked by index.
for position, token in enumerate(tokenized_txt):
    print(position, token)

0 [CLS]
1 i
2 need
3 to
4 visit
5 the
6 financial
7 bank
8 tomorrow
9 ;
10 after
11 that
12 ,
13 we
14 '
15 ll
16 set
17 up
18 a
19 tent
20 by
21 the
22 river
23 bank
24 ,
25 just
26 across
27 from
28 the
29 bank
30 building
31 where
32 my
33 friend
34 works
35 .
36 [SEP]


In [21]:
# Turn the list of per-token vectors into one 2-D tensor (one row per token).
token_vectors = torch.stack(token_vectors_sum, dim=0)

In [25]:
# Compare the contextual vectors of the three occurrences of "bank":
#   7  -> "financial bank"
#   23 -> "river bank"
#   29 -> "bank building"
#txt = "I need to visit the financial bank tomorrow; after that, we'll set up a tent by the river bank, just across from the bank building where my friend works."

# The positions below are only valid for the sentence above. Fail loudly if the
# sentence was changed, rather than silently comparing unrelated tokens.
bank_positions = [i for i, tok in enumerate(tokenized_txt) if tok == 'bank']
assert bank_positions == [7, 23, 29], (
    "expected 'bank' at token positions [7, 23, 29], found %s; "
    "update the indices used below to match the current sentence" % bank_positions
)


def bank_similarity(first, second):
    """Return the cosine similarity between the vectors of two token positions."""
    # scipy's `cosine` is a distance, so similarity is 1 minus it.
    return 1 - cosine(token_vectors[first], token_vectors[second])


same_bank_word = bank_similarity(7, 29)
diff_bank_word1 = bank_similarity(7, 23)
diff_bank_word2 = bank_similarity(23, 29)

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank_word)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank_word1)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank_word2)

Vector similarity for  *similar*  meanings:  0.78
Vector similarity for *different* meanings:  0.66
Vector similarity for *different* meanings:  0.68


## Using AWS Bedrock with DeepSeek

- Pass the prompt to the model for a similar LLM-based task.
- The result is compared later with the BERT pipeline for filling in the blanks.
- This is really just an exploration of how embedding models work and how they, along with some AI techniques, can be used for various NLP tasks in `getout-of-text3`.

### examples

- When determining the ordinary meaning of words, namely the ambiguous text at the center of a disputed statutory interpretation, there are various techniques we can employ to disambiguate the text and extract its intended meaning, including traditional KWIC (COCA), embeddings (LEGAL-BERT), and AI LLMs (DeepSeek on AWS Bedrock).

In [112]:
import boto3

# Open a Bedrock runtime client using the named local AWS profile.
session = boto3.Session(profile_name='atn-developer')
bedrock = session.client("bedrock-runtime", region_name="us-east-1")

# JSON request payload: the fill-in-the-mask prompt plus a cap on generated tokens.
request_body = '{"prompt": "Please analyze the masked sentence to fill the mask: \\"To modify means we should [MASK] significant changes.\\"", "max_tokens": 256}'

response = bedrock.invoke_model(
    body=request_body,
    modelId="us.deepseek.r1-v1:0",
    accept="application/json",
    contentType="application/json",
)

# Read the raw response payload; it is decoded and parsed in the next cell.
deepseek = response['body'].read()

In [121]:
import json

import textwrap

# Decode the raw response bytes and parse the JSON payload.
deepseek_dict = json.loads(deepseek.decode())

# The generated text of the first (only) completion.
text = deepseek_dict['choices'][0]['text']

# Wrap to 100 columns at word boundaries. Each original line is wrapped
# separately so the model's own line breaks (including blank lines) survive;
# slicing the string every 100 characters would cut words in half.
for paragraph in text.splitlines():
    print(textwrap.fill(paragraph, width=100))

 Please provide the answer in the format: [answer] with the most appropriate word to replace [MASK].


Okay, let's see. The sentence is "To modify means we should [MASK] significant changes." I need to
 find the right word to replace [MASK]. The sentence is explaining what "modify" means. So, when you
 modify something, you make changes to it. The verb that goes with "changes" here is probably "make"
. Like, "make changes" is a common collocation. Let me think of other possibilities. Maybe "implemen
t"? But "implement changes" is also correct, but does it fit the context? The sentence is defining "
modify", so the word should be a synonym of "modify" in the context of causing changes. "Make" is mo
re direct and common. "Create" could work, but "make" is more usual. "Introduce" is possible too, bu
t again, "make" is simpler and more likely. So the best answer is probably "make". Let me check agai
n. "To modify means we should make significant changes." Yes, that makes sense. The other o

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Masked-token prediction pipeline backed by the LEGAL-BERT checkpoint.
pipe = pipeline(task="fill-mask", model="nlpaueb/legal-bert-base-uncased")

Device set to use mps:0


In [None]:
# Ask LEGAL-BERT for the most likely fillers of the masked word.
# NOTE(review): the output saved below this cell shows "the car is a ... that is
# not permitted in the park." sequences, i.e. it was produced by a different
# prompt -- re-run this cell to refresh it.
modify_prompt = "To modify means we should [MASK] significant changes"
pipe(modify_prompt)

[{'score': 0.34469074010849,
  'token': 1343,
  'token_str': 'vehicle',
  'sequence': 'the car is a vehicle that is not permitted in the park.'},
 {'score': 0.09207320958375931,
  'token': 355,
  'token_str': 'use',
  'sequence': 'the car is a use that is not permitted in the park.'},
 {'score': 0.050254397094249725,
  'token': 2373,
  'token_str': 'sign',
  'sequence': 'the car is a sign that is not permitted in the park.'},
 {'score': 0.029583115130662918,
  'token': 446,
  'token_str': 'service',
  'sequence': 'the car is a service that is not permitted in the park.'},
 {'score': 0.017440086230635643,
  'token': 1645,
  'token_str': 'car',
  'sequence': 'the car is a car that is not permitted in the park.'}]

In [74]:
# Same fill-mask query for the park sentence, with "bike" as the subject.
bike_prompt = "That bike is a [MASK] that is not permitted in the park."
pipe(bike_prompt)

[{'score': 0.1371982991695404,
  'token': 4672,
  'token_str': 'commodity',
  'sequence': 'that bike is a commodity that is not permitted in the park.'},
 {'score': 0.08565253764390945,
  'token': 4175,
  'token_str': 'game',
  'sequence': 'that bike is a game that is not permitted in the park.'},
 {'score': 0.07633555680513382,
  'token': 424,
  'token_str': 'product',
  'sequence': 'that bike is a product that is not permitted in the park.'},
 {'score': 0.0717778131365776,
  'token': 446,
  'token_str': 'service',
  'sequence': 'that bike is a service that is not permitted in the park.'},
 {'score': 0.062396444380283356,
  'token': 1343,
  'token_str': 'vehicle',
  'sequence': 'that bike is a vehicle that is not permitted in the park.'}]