https://arxiv.org/pdf/1810.04805.pdf
https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
https://colab.research.google.com/drive/1hMLd5-r82FrnFnBub-B-fVW78Px4KPX1#scrollTo=IW6V3afD-q1K
https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model

import tensorflow_hub as hub
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
import math

In [2]:
# Load the cleaned conversation pairs and discard every row that is missing
# either the input column ("x1") or the target column ("y1").
df = pd.read_csv("char_cleaned_data3.csv").dropna(subset=["x1", "y1"])
df.head()

Unnamed: 0,x1,y1
0,what kind of phones do you guys have,i have a it is pretty great much better than w...
1,i have a it is pretty great much better than w...,does it really charge all the way in min
2,does it really charge all the way in min,pretty fast i have never it but it is under ha...
3,what kind of phones do you guys have,samsung galaxy j it is my first cell phone and...
4,samsung galaxy j it is my first cell phone and...,what do you think of it anything you do not like


In [3]:
# Work with at most the first `num_samples` rows of each column.
num_samples = 10000

print(len(df))
questions = df["x1"].iloc[:num_samples].tolist()
answers = df["y1"].iloc[:num_samples].tolist()

# Report how many inputs and targets were kept, one count per line.
print(len(questions), len(answers), sep="\n")

47365
10000
10000


In [4]:
max_seq_length = 128  # Your choice here.

# The three int32 inputs fed to the BERT hub layer, each of width
# max_seq_length and created in the order the layer is called with below.
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
)

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [5]:
# Wrap the hub layer in a Keras model that exposes both of its outputs.
model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=[pooled_output, sequence_output],
)

In [6]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Returns a list of length ``max_seq_length`` holding 1 at every position
    occupied by a token and 0 at every padded position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if max_seq_length < n_tokens:
        raise IndexError("Token length more than max seq length!")
    return [int(position < n_tokens) for position in range(max_seq_length)]


def get_segments(tokens, max_seq_length):
    """Build the segment-id list for a token sequence.

    Every token up to and including the first ``"[SEP]"`` gets id 0; every
    token after it gets id 1. Positions beyond the tokens are padded with 0
    so the result always has ``max_seq_length`` entries.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    if not len(tokens) <= max_seq_length:
        raise IndexError("Token length more than max seq length!")

    padded = [0] * max_seq_length
    past_first_sep = False
    for position, token in enumerate(tokens):
        if past_first_sep:
            padded[position] = 1
        elif token == "[SEP]":
            # The separator itself still belongs to the first segment.
            past_first_sep = True
    return padded


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, zero-padded to ``max_seq_length``.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids: the converted tokens
        followed by 0 for each padded position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``. This
            matches ``get_masks`` and ``get_segments``; without the check an
            over-length input would come back unpadded and longer than
            ``max_seq_length``.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids

In [7]:
# Read the vocabulary path and the lower-casing flag from the loaded hub
# module, then build a tokenizer configured with them.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [8]:
# Gather the distinct tokens of the first 10 questions, together with the
# two special markers that get wrapped around every sequence.
tokens = {"[CLS]", "[SEP]"}
for line in questions[:10]:
    tokens.update(tokenizer.tokenize(line))



In [9]:
# Display the set of distinct tokens collected from the first 10 questions.
tokens

{'##imum',
 '##s',
 '[CLS]',
 '[SEP]',
 'a',
 'all',
 'and',
 'anything',
 'before',
 'better',
 'cell',
 'charge',
 'do',
 'does',
 'first',
 'for',
 'friend',
 'galaxy',
 'great',
 'guys',
 'had',
 'have',
 'i',
 'in',
 'is',
 'it',
 'j',
 'kill',
 'kind',
 'know',
 'like',
 'me',
 'min',
 'months',
 'much',
 'my',
 'myself',
 'not',
 'of',
 'old',
 'op',
 'opt',
 'phone',
 'phones',
 'pretty',
 'really',
 'samsung',
 'than',
 'the',
 'think',
 'to',
 'told',
 'v',
 'way',
 'what',
 'you',
 'yourself'}

In [10]:
def tokenize(s):
    """Tokenize ``s`` with the module-level ``tokenizer`` and return the
    resulting tokens wrapped between ``"[CLS]"`` and ``"[SEP]"``."""
    return ["[CLS]", *tokenizer.tokenize(s), "[SEP]"]

# Encode the first 10 questions into the three parallel input lists
# (token ids, padding masks, segment ids), echoing each token sequence.
input_ids, input_masks, input_segments = [], [], []

for line in questions[:10]:
    stokens = tokenize(line)
    print(stokens)
    input_ids += [get_ids(stokens, tokenizer, max_seq_length)]
    input_masks += [get_masks(stokens, max_seq_length)]
    input_segments += [get_segments(stokens, max_seq_length)]

['[CLS]', 'what', 'kind', 'of', 'phones', 'do', 'you', 'guys', 'have', '[SEP]']
['[CLS]', 'i', 'have', 'a', 'it', 'is', 'pretty', 'great', 'much', 'better', 'than', 'what', 'i', 'had', 'before', '[SEP]']
['[CLS]', 'does', 'it', 'really', 'charge', 'all', 'the', 'way', 'in', 'min', '[SEP]']
['[CLS]', 'what', 'kind', 'of', 'phones', 'do', 'you', 'guys', 'have', '[SEP]']
['[CLS]', 'samsung', 'galaxy', 'j', 'it', 'is', 'my', 'first', 'cell', 'phone', 'and', 'i', 'have', 'had', 'it', 'for', 'months', '[SEP]']
['[CLS]', 'what', 'do', 'you', 'think', 'of', 'it', 'anything', 'you', 'do', 'not', 'like', '[SEP]']
['[CLS]', 'what', 'kind', 'of', 'phones', 'do', 'you', 'guys', 'have', '[SEP]']
['[CLS]', 'opt', '##imum', '##s', 'v', 'i', 'know', 'it', 'is', 'old', '[SEP]']
['[CLS]', 'my', 'friend', 'told', 'me', 'to', 'kill', 'myself', '[SEP]']
['[CLS]', 'do', 'not', 'kill', 'yourself', 'op', '[SEP]']


In [11]:
# Print the encoded id lists in full; get_ids zero-pads each inner list up to
# max_seq_length, so the output is long.
print(input_ids)

[[101, 2054, 2785, 1997, 11640, 2079, 2017, 4364, 2031, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2031, 1037, 2009, 2003, 3492, 2307, 2172, 2488, 2084, 2054, 1045, 2018, 2077, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2515, 2009, 2428, 3715, 2035, 1996, 2126, 1999, 8117, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
# Turn the three lists of per-example rows into NumPy arrays.
input_ids, input_masks, input_segments = (
    np.array(rows) for rows in (input_ids, input_masks, input_segments)
)

In [39]:
# def convert_single_example(tokenizer, example, max_seq_length=256):
#     """Converts a single `InputExample` into a single `InputFeatures`."""

#     if isinstance(example, PaddingInputExample):
#         input_ids = [0] * max_seq_length
#         input_mask = [0] * max_seq_length
#         segment_ids = [0] * max_seq_length
#         label = 0
#         return input_ids, input_mask, segment_ids, label

#     tokens_a = tokenizer.tokenize(example)
#     if len(tokens_a) > max_seq_length - 2:
#         tokens_a = tokens_a[0 : (max_seq_length - 2)]

#     tokens = []
#     segment_ids = []
#     tokens.append("[CLS]")
#     segment_ids.append(0)
#     for token in tokens_a:
#         tokens.append(token)
#         segment_ids.append(0)
#     tokens.append("[SEP]")
#     segment_ids.append(0)

#     input_ids = tokenizer.convert_tokens_to_ids(tokens)

#     # The mask has 1 for real tokens and 0 for padding tokens. Only real
#     # tokens are attended to.
#     input_mask = [1] * len(input_ids)

#     # Zero-pad up to the sequence length.
#     while len(input_ids) < max_seq_length:
#         input_ids.append(0)
#         input_mask.append(0)
#         segment_ids.append(0)

#     assert len(input_ids) == max_seq_length
#     assert len(input_mask) == max_seq_length
#     assert len(segment_ids) == max_seq_length

#     return input_ids, input_mask, segment_ids, example.label


# def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
#     """Convert a set of `InputExample`s to a list of `InputFeatures`."""

#     input_ids, input_masks, segment_ids, labels = [], [], [], []
#     for example in tqdm_notebook(examples, desc="Converting examples to features"):
#         input_id, input_mask, segment_id, label = convert_single_example(
#             tokenizer, example, max_seq_length
#         )
#         input_ids.append(input_id)
#         input_masks.append(input_mask)
#         segment_ids.append(segment_id)
#         labels.append(label)
#     return (
#         np.array(input_ids),
#         np.array(input_masks),
#         np.array(segment_ids),
#         np.array(labels).reshape(-1, 1),
#     )

In [13]:
# Run the encoded batch through the model; its two outputs come back in the
# order they were declared (pooled_output, sequence_output).
pool_embs, all_embs = model.predict(
    [input_ids, input_masks, input_segments]
)

In [15]:
# Display the second array returned by model.predict (the model's
# sequence_output).
all_embs

array([[[ 3.67991626e-01,  2.62976646e-01, -9.17949900e-02, ...,
         -3.43436956e-01,  1.00064501e-01,  3.28222066e-01],
        [ 2.88614124e-01, -1.47708416e-01, -3.50446224e-01, ...,
         -3.76085639e-01,  1.22872666e-01, -2.93080926e-01],
        [ 9.05614257e-01, -3.38526458e-01,  1.01535487e+00, ...,
         -3.64174902e-01, -7.62960389e-02, -9.65847194e-01],
        ...,
        [ 3.69523287e-01,  1.84878424e-01,  1.90430164e-01, ...,
          2.34707654e-01,  1.58575922e-02,  1.34283043e-02],
        [ 2.84774601e-01,  1.19270198e-01,  1.98776037e-01, ...,
          2.09813222e-01, -7.27497786e-03, -3.44730616e-02],
        [ 3.29375029e-01,  1.48037493e-01,  3.57326537e-01, ...,
          1.69766128e-01, -2.93716788e-04, -1.96482390e-02]],

       [[ 1.37833685e-01,  1.95774108e-01,  8.44631419e-02, ...,
         -1.22579433e-01,  1.22901604e-01,  3.55702698e-01],
        [ 3.76964927e-01,  3.50292265e-01,  3.04621458e-01, ...,
         -1.58174992e-01,  9.36976075e