### Feature (text-embeddings) extraction from BERT

In [None]:
import torch
import json
import re
import numpy as np
import pandas as pd
import tensorflow as tf

import modeling
import tokenization

In [None]:
BERT_MODEL = 'uncased_L-12_H-768_A-12'  ## uncased BERT base model
BERT_PRETRAINED_DIR = './Akshay/Desktop/DataWeave/Experiments/' + BERT_MODEL

In [None]:
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):

def model_fn(features, labels, mode, params):

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
        exit

    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    
    if use_tpu:
        def tpu_scaffold():
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
    else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_id": unique_ids,
    }

    for (i, layer_index) in enumerate(layer_indexes):
        predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

In [None]:
def convert_examples_to_features(examples, seq_length, tokenizer):

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
        if len(tokens_a) > seq_length - 2:
            tokens_a = tokens_a[0:(seq_length - 2)]

    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
        input_ids.append(0)
        input_mask.append(0)
        input_type_ids.append(0)

    if ex_index < 5:
        [tokenization.printable_text(x) for x in tokens]))

        features.append(
            InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))
        
    return features

In [None]:
def read_sequence(input_sentences):
    sentences = []
    unique_id = 0
    for sentence in input_sentences:
        line = tokenization.convert_to_unicode(sentence)
        sentences.append(InputExample(unique_id=unique_id, text_a=line))
    unique_id += 1
    return sentences

In [None]:
embeddings = read_sequence()

### Sentence BERT

In [1]:
from sentence_transformers import SentenceTransformer

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [None]:
model = SentenceTransformer('all-mpnet-base-v2')

## Other available models
# all-roberta-large-v1
# all-distilroberta-v1 
# all-MiniLM-L6-v2
# distiluse-base-multilingual-cased-v1
# paraphrase-TinyBERT-L6-v2

In [3]:
sentences = ['A test sentences for checking',
    'Amazon and Flipkart are two major ecomm merchants in India', 
    'One plus nord mobile with 5000 mAH battery on offer']

In [4]:
sentence_embeddings = model.encode(sentences)

In [9]:
sentence_embeddings.shape

(3, 768)

In [None]:
## Printing the embeddings

for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

In [13]:
from sentence_transformers import util

In [92]:
emb1 = model.encode("Salt Water Sandals by Hoy Original Sandal (Baby, Walker, Toddler, Little Kid & Big Kid) SHINY YELLOW Little Kid 2 M")
emb2 = model.encode("Salt Water Sandals by Hoy Shoe The Original Sandal Shiny Yellow 1 Little Kid  ")

In [93]:
import numpy as np

cos = np.dot(emb1, emb2)
cos = cos / (np.linalg.norm(emb1)* np.linalg.norm(emb2))
print("Cosine similarity: ", cos)

Cosine similarity:  0.9377158


In [94]:
dot_prod = util.dot_score(emb1, emb2)
print("Dot product: ", dot_prod)

Dot product:  tensor([[0.9377]])


In [90]:
t1 = model.encode("I bought an Apple PC on sale")
t2 = model.encode("I bought an Apple fruit on sale")

In [91]:
cos_sim = util.cos_sim(t1, t2)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.6631]])


### comparision with spacy

In [95]:
import spacy
nlp = spacy.load('en_core_web_md')

In [100]:
text1 = nlp("Salt Water Sandals by Hoy Original Sandal (Baby, Walker, Toddler, Little Kid & Big Kid) SHINY YELLOW Little Kid 2 M")
text2 = nlp("Salt Water Sandals by Hoy Shoe The Original Sandal Shiny Yellow 1 Little Kid  ")

In [97]:
print(text1.similarity(text2))

0.9035214342821734


In [104]:
text1 = nlp("I bought an Apple PC on sale")
text2 = nlp("I bought an Apple fruit on sale")

print(text1.similarity(text2))

0.941827671682806


In [102]:
from spacy import displacy

for ent in text1.ents:
    print(ent.text,  ent.label_, "\tstart_pos" , ent.start_char, "end_pos", ent.end_char)
    #tokens.append(text1[ent.start_char:ent.end_char])
    
displacy.serve(text1, style="ent")

Salt Water Sandals ORG 	start_pos 0 end_pos 18
Hoy Original Sandal ORG 	start_pos 22 end_pos 41
Walker PERSON 	start_pos 49 end_pos 55
Toddler ORG 	start_pos 57 end_pos 64


  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#### top pairwise matches in case of multiple sentences

In [None]:
sentences = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.'
          ]

#Encode all sentences
embeddings = model.encode(sentences)

In [None]:
# cosine similarity bet all pairs

cos_sim = util.cos_sim(embeddings, embeddings)

In [None]:
# Adding all pairs of list with their cosine similarity

all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

In [None]:
# Sorting the list by highest cosine similarity
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

In [None]:
n = 3  ## top 'n' matches 
    
print("Top {} most similar pairs: ".format(n))

for score, i, j in all_sentence_combinations[0:n]:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], cos_sim[i][j]))

In [None]:
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)