## BERT as Embedding: Practice

In [85]:
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader


In [35]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the vocabulary ids always match the model's embedding table.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)

In [116]:
# Sample: prepare a single sentence for BERT by hand, one step at a time.

# NOTE(review): `data` was never defined in this notebook (hidden kernel state).
# It is loaded here from the same CSV / column that LoadDataset uses further
# down -- confirm this is the frame the original run used.
data = pd.read_csv('Text_Similarity_Dataset.csv')
sentence = data['text1'][0]
print(sentence)

# Fixed sequence length every input is padded / truncated to
max_len = 256

# # Step 1: Tokenize
tokens = tokenizer.tokenize(sentence)
print(f"\nstep 1: Tokenize \n{tokens}")

# # Step 2: Add [CLS] and [SEP]
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(f"\nstep 2: adding special char: \n{tokens}")

# # Step 3: Pad tokens
if len(tokens) < max_len:
    padded_tokens = tokens + ['[PAD]'] * (max_len - len(tokens))
else:
    # Truncate, but keep a closing [SEP] as the last token
    padded_tokens = tokens[:max_len - 1] + ['[SEP]']
print(f"\nstep 3: Padding: \n{padded_tokens}")

# 1 for real tokens, 0 for padding: the model only attends to the real ones
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print(f"\nstep 3: Attention Masking: \n{attn_mask}")

# # Step 4: Segment ids
# Optional for a single sentence: all zeros. Segment ids tell two sentences
# apart (use case: a question paired with a paragraph).
seg_ids = [0] * len(padded_tokens)
print(f"\nstep 4: Segment ids:\n{seg_ids}")

# # Step 5: Get BERT vocabulary index for each token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print(f"\nstep 5: BERT vocabulary index for each token:\n{token_ids}")

savvy searchers fail to spot ads internet search engine users are an odd mix of naive and sophisticated  suggests a report into search habits.  the report by the us pew research center reveals that 87% of searchers usually find what they were looking for when using a search engine. it also shows that few can spot the difference between paid-for results and organic ones. the report reveals that 84% of net users say they regularly use google  ask jeeves  msn and yahoo when online.  almost 50% of those questioned said they would trust search engines much less  if they knew information about who paid for results was being hidden. according to figures gathered by the pew researchers the average users spends about 43 minutes per month carrying out 34 separate searches and looks at 1.9 webpages for each hunt. a significant chunk of net users  36%  carry out a search at least weekly and 29% of those asked only look every few weeks. for 44% of those questioned  the information they are looking 

In [118]:
# # Convert to pytorch tensors
token_ids = torch.tensor(token_ids).unsqueeze(0)
attn_mask = torch.tensor(attn_mask).unsqueeze(0)
seg_ids = torch.tensor(seg_ids).unsqueeze(0)

# # Feed them to bert
hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask,\
                                   token_type_ids = seg_ids)
print(hidden_reps.shape)
print(cls_head.shape)

torch.Size([1, 256, 768])
torch.Size([1, 768])


## Dataset Class and Data Loaders

#### ETL process --> convert data into a particular format

In [86]:
class LoadDataset(Dataset):
    """Dataset that turns one text column of a CSV file into BERT inputs.

    Each item is a pair ``(token_ids, attention_mask)`` of 1-D ``long``
    tensors of length ``maxlen``.

    Args:
        filename: Path of the comma-separated file to read.
        maxlen: Fixed sequence length; shorter inputs are padded with
            ``[PAD]``, longer ones are truncated and closed with ``[SEP]``.
        text_column: Name of the column holding the sentences
            (defaults to ``'text1'``).
    """

    def __init__(self, filename, maxlen, text_column='text1'):

        # Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter=',')

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Define the Maxlength for padding/truncating
        self.maxlen = maxlen

        # Column the sentences are read from
        self.text_column = text_column

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask)`` for the row at position ``index``."""

        # Samplers hand out positions 0..len-1, so index positionally (iloc)
        # rather than by label (loc); the two differ on a non-default index.
        sentence = self.df[self.text_column].iloc[index]

        # Tokenize and wrap the sentence in the [CLS] ... [SEP] special tokens
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']

        # Padding/truncating the sentences to the maximum length
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Convert the sequence to ids with BERT Vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # Attention mask: 1 for real tokens, 0 for padding. Derived from the
        # tokens themselves so it does not depend on the numeric id of [PAD].
        attn_mask = torch.tensor(
            [int(token != '[PAD]') for token in tokens], dtype=torch.long
        )

        return tokens_ids_tensor, attn_mask

In [111]:
# Creating instances of training and validation set
# Dataset over the similarity CSV; every item is padded/truncated to 256 tokens
dataset_text1 = LoadDataset(
    filename='Text_Similarity_Dataset.csv',
    maxlen=256,
)

# Batches of 32 items, fetched by 5 worker processes
train_loader = DataLoader(
    dataset_text1,
    batch_size=32,
    num_workers=5,
)

In [5]:
import mxnet as mx
from bert_embedding import BertEmbedding


# Use the first GPU when MXNet can see one; otherwise fall back to the CPU so
# the cell also runs on machines without a GPU.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
bert = BertEmbedding(ctx=ctx)

ModuleNotFoundError: No module named 'bert_embedding'

In [1]:

# Abstract of the BERT paper, written with one sentence per line
bert_abstract = """We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.
 Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.
 As a result, the pre-trained BERT representations can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. 
BERT is conceptually simple and empirically powerful. 
It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE benchmark to 80.4% (7.6% absolute improvement), MultiNLI accuracy to 86.7 (5.6% absolute improvement) and the SQuAD v1.1 question answering Test F1 to 93.2 (1.5% absolute improvement), outperforming human performance by 2.0%."""

# One entry per line of the abstract. Several lines of the literal carry
# leading/trailing spaces, so strip them and drop any line that ends up empty.
sentences = [line.strip() for line in bert_abstract.split('\n') if line.strip()]
# NOTE(review): `BertEmbedding` is imported only in the `In [5]` cell (from the
# `bert_embedding` package, which failed to import there), so this cell depends
# on that import having succeeded earlier in the kernel.
# No `ctx` is passed here, unlike the `bert = BertEmbedding(ctx=ctx)` instance
# created in that cell -- presumably this uses the library's default device; confirm.
bert_embedding = BertEmbedding()
# Embed every sentence; `result` is indexed per sentence in the cells below
result = bert_embedding(sentences)

In [3]:
# Show the first entry of `sentences` as a quick sanity check of the split
sentences[0]

'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.'

In [None]:
# Inspect the embedding result for the first sentence.
# Index 0 of an entry holds its tokens, index 1 the vectors for those tokens.
# Only the last expression of a cell is displayed, so intermediate values are
# printed explicitly.
first_sentence = result[0]

first_sentence_tokens = first_sentence[0]
print(first_sentence_tokens)
# ['we', 'introduce', 'a', 'new', 'language', 'representation', 'model', 'called', 'bert', ',', 'which', 'stands', 'for', 'bidirectional', 'encoder', 'representations', 'from', 'transformers']
print(len(first_sentence_tokens))
# 18

first_sentence_vectors = first_sentence[1]
print(len(first_sentence_vectors))
# 18

# Kept under its original name for any later cell that uses it. Despite the
# name, this is the whole list of per-token vectors, not a single token.
first_token_in_first_sentence = first_sentence_vectors

# Index 1 is the vector of the *second* token ('introduce' in the list above)
print(repr(first_sentence_vectors[1]))
# array([ 0.4805648 ,  0.18369392, -0.28554988, ..., -0.01961522,
#        1.0207764 , -0.67167974], dtype=float32)
first_sentence_vectors[1].shape
# (768,)