In [None]:
!pip install bert-for-tf2
!pip install tensorflow-text

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/820ccaf55f1e24b5dd43583ac0da6d86c2d27bbdfffadbba69bafe73ca93/bert-for-tf2-0.14.7.tar.gz (41kB)
[K     |████████                        | 10kB 17.7MB/s eta 0:00:01[K     |████████████████                | 20kB 16.4MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 14.4MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 12.6MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.0MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def get_model(model_url, max_seq_length):
  """Wrap a TF-Hub encoder in a Keras model that emits its pooled output.

  Args:
    model_url: Handle passed to ``hub.KerasLayer`` (e.g. a tfhub.dev URL).
    max_seq_length: Fixed length of each of the three int32 input sequences.

  Returns:
    A ``(model, layer)`` tuple: the ``tf.keras.Model`` mapping the three
    inputs to the layer's ``"pooled_output"``, and the ``hub.KerasLayer``
    that was loaded.

  Raises:
    AssertionError: If the layer's outputs lack one of the expected keys.
  """
  # All three inputs share the same shape and dtype; only the name differs.
  feature_names = ("input_word_ids", "input_mask", "input_type_ids")
  encoder_inputs = {
      name: tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
      for name in feature_names
  }

  hub_layer = hub.KerasLayer(model_url, trainable=True)
  encoder_outputs = hub_layer(encoder_inputs)

  # Fail fast if the module does not expose the expected output keys.
  for expected_key in ('sequence_output', 'pooled_output',
                       'encoder_outputs', 'default'):
    assert expected_key in encoder_outputs

  pooled_model = tf.keras.Model(inputs=encoder_inputs,
                                outputs=encoder_outputs["pooled_output"])
  return pooled_model, hub_layer

In [None]:
# Number of token positions per sentence; shared by the model inputs and the
# tokenization helper further down.
max_seq_length = 128

# get_model returns both the Keras model and the underlying hub layer; the
# layer is kept because the tokenizer assets are read from it later.
muril_model, muril_layer = get_model("https://tfhub.dev/google/MuRIL/1",
                                     max_seq_length)

In [None]:
# Read the vocabulary path and the casing flag from the loaded hub module so
# the tokenizer is configured from the same assets as the model.
vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()

tokenizer = bert_tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [None]:
def create_input(input_strings, tokenizer, max_seq_length):
  """Convert sentences into fixed-length BERT-style input arrays.

  Each sentence is tokenized, wrapped as ``[CLS] tokens... [SEP]`` and then
  padded with id 0 to exactly ``max_seq_length`` positions. Sentences that are
  too long have their word pieces truncated *before* the special tokens are
  added, so the closing ``[SEP]`` is always kept (previously it was cut off).

  Args:
    input_strings: An iterable of strings, or a single string (treated as one
      sentence rather than iterated character by character).
    tokenizer: Object providing ``tokenize(text)`` returning a list of tokens
      and ``convert_tokens_to_ids(tokens)`` returning a list of ints.
    max_seq_length: Number of positions in every output row.

  Returns:
    A tuple ``(input_ids, input_mask, input_type_ids)`` of NumPy arrays with
    one row per sentence. ``input_mask`` is 1 on real tokens and 0 on padding;
    ``input_type_ids`` is all zeros.
  """
  if isinstance(input_strings, str):
    input_strings = [input_strings]

  # Reserve two positions for [CLS] and [SEP].
  max_body_tokens = max(max_seq_length - 2, 0)

  input_ids_all, input_mask_all, input_type_ids_all = [], [], []
  for input_string in input_strings:
    body_tokens = tokenizer.tokenize(input_string)[:max_body_tokens]
    input_tokens = ["[CLS]"] + body_tokens + ["[SEP]"]
    # The final slice only matters when max_seq_length < 2.
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)[:max_seq_length]

    sequence_length = len(input_ids)
    padding = [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids + padding)
    input_mask_all.append([1] * sequence_length + padding)
    input_type_ids_all.append([0] * max_seq_length)

  return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)

In [None]:
def encode(input_text):
  """Run the module-level ``muril_model`` on a batch of sentences.

  The sentences are converted to id/mask/type arrays by ``create_input`` using
  the module-level ``tokenizer`` and ``max_seq_length``, then passed to the
  model under the three input names it was built with.

  Args:
    input_text: Sentences to encode, forwarded unchanged to ``create_input``.

  Returns:
    Whatever ``muril_model`` returns for the batch.
  """
  word_ids, mask, type_ids = create_input(input_text, tokenizer, max_seq_length)
  return muril_model({
      "input_word_ids": word_ids,
      "input_mask": mask,
      "input_type_ids": type_ids,
  })

In [None]:
# Sample sentences compared in the cells below; they are referenced by index,
# so the order matters.
code_mix_sentences = [
    "मे घर जाऊंगा",
    "मैं घर जा रही हूँ",
    "i am going home",
    "main ghar ja raha hoon",
    "apka naam kya hai",
]

In [None]:
# Encode all sample sentences in one batch; the cells below index the result
# with the same positions as code_mix_sentences.
code_mix_embedding = encode(code_mix_sentences)

In [None]:
# cosine_similarity returns a similarity score (higher means closer), not a
# distance, so the printed label says so. Each embedding is reshaped to a
# single-row 2-D array, which is the input shape cosine_similarity expects.
dst_1 = cosine_similarity(np.array(code_mix_embedding[0]).reshape(1,-1),
                          np.array(code_mix_embedding[1]).reshape(1,-1))
print("Cosine similarity between {} & {} is {}".format(code_mix_sentences[0],
                                                       code_mix_sentences[1],
                                                       dst_1))

Distance between मे घर जाऊंगा & मैं घर जा रही हूँ is [[0.9994648]]


In [None]:
# cosine_similarity returns a similarity score (higher means closer), not a
# distance, so the printed label says so. Each embedding is reshaped to a
# single-row 2-D array, which is the input shape cosine_similarity expects.
dst_1 = cosine_similarity(np.array(code_mix_embedding[0]).reshape(1,-1),
                          np.array(code_mix_embedding[2]).reshape(1,-1))
print("Cosine similarity between {} & {} is {}".format(code_mix_sentences[0],
                                                       code_mix_sentences[2],
                                                       dst_1))

Distance between मे घर जाऊंगा & i am going home is [[0.99958205]]


In [None]:
# cosine_similarity returns a similarity score (higher means closer), not a
# distance, so the printed label says so. Each embedding is reshaped to a
# single-row 2-D array, which is the input shape cosine_similarity expects.
dst_1 = cosine_similarity(np.array(code_mix_embedding[0]).reshape(1,-1),
                          np.array(code_mix_embedding[4]).reshape(1,-1))
print("Cosine similarity between {} & {} is {}".format(code_mix_sentences[0],
                                                       code_mix_sentences[4],
                                                       dst_1))

Distance between मे घर जाऊंगा & apka naam kya hai is [[0.99903125]]


In [None]:
# cosine_similarity returns a similarity score (higher means closer), not a
# distance, so the printed label says so. Each embedding is reshaped to a
# single-row 2-D array, which is the input shape cosine_similarity expects.
dst_1 = cosine_similarity(np.array(code_mix_embedding[0]).reshape(1,-1),
                          np.array(code_mix_embedding[3]).reshape(1,-1))
print("Cosine similarity between {} & {} is {}".format(code_mix_sentences[0],
                                                       code_mix_sentences[3],
                                                       dst_1))

Distance between मे घर जाऊंगा & main ghar ja raha hoon is [[0.9992978]]
