In [None]:
! pip install spacy

In [3]:
pip install -q -U "tensorflow-text

Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import spacy
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from tqdm import tqdm
from spacy import displacy

In [2]:
train_df = pd.read_csv("./input/train.csv")

## Preprocessing data using Spacy 

To use "en_core_web_lg" nlp package please run this command in your terminal , else error will throw
* python -m spacy download en_core_web_lg

In [44]:
nlp = spacy.load('en_core_web_lg')
doc = nlp(train_df['discourse_text'][0]) # first text from the dataframe

In [45]:
"""
displacy is to render the text in the notebook by specific terms
ent = will display all the entities are there in a text
dep = will display graph based ytantic dependeices and will display parts of speech 
"""
displacy.render(doc, style="ent") 

In [50]:
#Parts of speech of the text - text.pos_
# Lemma will return the base of the word - text.lemma_
pos_list = []
for text in doc[0:10]:
    pos_list.append([text , text.pos_ , text.lemma_])
pos_list

[[Hi, 'INTJ', 'hi'],
 [,, 'PUNCT', ','],
 [i, 'PRON', 'I'],
 ['m, 'AUX', 'be'],
 [Isaac, 'PROPN', 'Isaac'],
 [,, 'PUNCT', ','],
 [i, 'PRON', 'I'],
 ['m, 'AUX', 'be'],
 [going, 'VERB', 'go'],
 [to, 'PART', 'to']]

In [51]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
nlp.enable_pipe('sentencizer')

In [62]:
# doc1 = nlp(doc)
for text in doc.sents:
#    for _ in text:
    displacy.render(text,style="ent")

In [70]:
doc.vector.shape

(300,)

In [86]:
doc_num = nlp(train_df['discourse_text'][0].lower())

"hi, i'm isaac, i'm going to be writing about how this face on mars is a natural landform or if there is life on mars that made it. the story is about how nasa took a picture of mars and a face was seen on the planet. nasa doesn't know if the landform was created by life on mars, or if it is just a natural landform. "

In [87]:
displacy.render(doc,style='ent')

In [39]:
_VOCAB = [b"[UNK]", b"[MASK]", b"[RANDOM]", b"[CLS]", b"[SEP]",b"[END]"]

_START_TOKEN = _VOCAB.index(b'[CLS]')
_END_TOKEN = _VOCAB.index(b'[END]')
_MASK_TOKEN = _VOCAB.index(b'[MASK]')
_RANDOM_TOKEN = _VOCAB.index(b'[RANDOM]')
_SEP_TOKEN = _VOCAB.index(b'[SEP]')
_UNKNOWN_TOKEN = _VOCAB.index(b'[UNK]')

_VOCAB_SIZE = len(_VOCAB)

lookuptable = tf.lookup.StaticVocabularyTable(
                tf.lookup.KeyValueTensorInitializer(
                    keys = _VOCAB,
                    key_dtype=tf.string,
                    values=tf.range(_VOCAB_SIZE,dtype=tf.int64),
                    value_dtype=tf.int64
                ),
                num_oov_buckets=1
)


In [40]:
train_df['discourse_text'][0]

"Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. "

In [41]:
text_tokenzier = text.BertTokenizer(lookuptable,token_out_type=tf.string)
text_tokenzier.tokenize("Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. ")

<tf.RaggedTensor [[[b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [b'[UNK]'],
  [

In [68]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")


In [43]:
encoder_inputs = preprocessor([train_df['discourse_text'][0]])

In [51]:
encoder_inputs

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  7632,  1010,  1045,  1005,  1049,  7527,  1010,  1045,
          1005,  1049,  2183,  2000,  2022,  3015,  2055,  2129,  2023,
          2227,  2006,  7733,  2003,  1037,  3019,  2455, 14192,  2030,
          2065,  2045,  2003,  2166,  2006,  7733,  2008,  2081,  2009,
          1012,  1996,  2466,  2003,  2055,  2129,  9274,  2165,  1037,
          3861,  1997,  7733,  1998,  1037,  2227,  2001,  2464,  2006,
          1996,  4774,  1012,  9274,  2987,  1005,  1056,  2113,  2065,
          1996,  2455, 14192,  2001,  2580,  2011,  2166,  2006,  7733,
          1010,  2030,  2065,  2009,  2003,  2074,  1037,  3019,  2455,
         14192,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

In [44]:
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",trainable=True)
outputs = encoder(encoder_inputs)

In [45]:
pooled_output = outputs['pooled_output']

In [69]:
embedding_model = tf.keras.Model(text_input, doc.vector)
sentences = tf.constant(["Basha"])
print(embedding_model(sentences))

ValueError: Output tensors of a Functional model must be the output of a TensorFlow `Layer` (thus holding past layer metadata). Found: [-1.6388502e+00  9.8170102e-01 -3.5083053e+00 -1.4603213e+00
  2.5865364e+00 -6.5126456e-02  3.0316040e-01  4.1716142e+00
  1.3215761e-01  7.6925611e-01  6.2784634e+00  1.9529140e+00
 -2.8802567e+00  1.0653728e+00  2.5061550e+00  9.2077428e-01
  1.9302834e+00 -1.1735537e+00 -5.7096797e-01 -2.2701375e+00
  2.4995437e+00 -9.7327185e-01 -1.5777076e+00 -1.0989767e+00
 -9.2450380e-01 -2.3687189e+00 -1.3157930e+00 -2.1876419e+00
 -1.2413212e+00  1.6353768e+00 -7.3108107e-01 -1.3623276e+00
 -1.8963171e+00 -1.0639617e+00 -1.5508072e+00 -4.5057386e-01
 -5.6444073e-01  1.4443216e+00  3.4425600e+00  1.4573760e+00
 -6.5039366e-01  3.1811258e-01  9.4922084e-01 -7.9766989e-01
 -9.8454535e-01  2.4077947e+00  2.1995032e+00 -4.0448079e+00
 -9.2925215e-01  2.2726848e+00 -1.7401269e-01  5.2170229e-01
  4.3472964e-01 -4.4175196e+00 -1.7052845e+00 -1.8668878e-01
  1.6842152e+00  1.8334684e+00  1.9559307e-02  1.0533800e+00
  1.6240237e+00 -5.1595324e-01 -1.0556552e+00 -1.8572745e+00
 -4.4830614e-01  1.2938776e+00 -2.6437109e+00 -3.9279304e+00
  6.7561090e-01  3.3770020e+00 -1.2377387e+00  6.3179296e-01
 -3.0890808e+00 -9.2200398e-01 -7.5395477e-01  1.4635874e+00
 -2.6328647e+00  5.8671838e-01 -3.1122048e+00 -1.4853683e+00
 -3.5925019e+00 -9.0457928e-01  2.6442163e+00  7.3590726e-01
  1.6794238e+00 -3.2729104e-01 -1.7280924e-01 -2.1610258e+00
  2.0986254e+00 -1.4968735e+00 -1.1144214e+00 -2.0745757e+00
  2.4610586e+00 -5.3111992e+00  6.6212231e-01 -2.6543918e+00
  1.2785629e+00 -1.1522425e+00 -1.0213540e-01 -9.0020829e-01
  2.2124939e+00  1.2656025e+00  1.7484128e+00  2.4436119e+00
 -5.1200920e-01  3.9480648e+00  3.5562128e-01 -1.0096823e+00
 -1.2655197e+00 -1.3710920e+00  4.8538360e-01  1.0193845e+00
 -6.8567353e-01  1.5078901e+00  1.1484847e+00  1.1693500e+00
 -1.9256405e+00 -8.6966836e-01  2.0926356e-01 -1.5142665e+00
 -1.6797714e+00 -2.8695924e+00 -4.1191903e-01  1.9779531e+00
 -1.5834309e+00 -3.6900871e+00  1.5921605e+00 -1.4703733e+00
  2.0056138e+00 -1.2378743e+00 -2.1566374e+00  3.4771997e-01
  2.8722532e+00 -3.0759056e+00 -3.9849904e-01  8.1287807e-01
 -1.0720620e+00 -1.5841624e+00  4.3312273e+00 -2.4475791e+00
 -1.7784846e+00 -1.2929100e+00  8.7318581e-01  1.7913809e+00
  7.3232859e-01  6.6992486e-01 -3.7304037e+00  2.6036674e-01
  7.4065693e-02  9.6584308e-01 -1.1477481e+00  3.0739110e+00
 -5.9495438e-02 -8.1894077e-02 -7.4207342e-01  1.5691304e+00
  3.0630310e+00  7.4038243e-01 -9.9650455e-01 -5.5160624e-01
 -1.3176674e+00 -3.2868459e+00 -8.1009310e-01  3.2517961e-01
 -2.6843810e+00 -9.0371686e-01 -3.4887893e+00  3.0896013e+00
 -9.3788320e-01  1.1897588e-01  9.6492255e-01 -8.9322871e-01
  1.9874659e+00  1.5735323e+00  1.2174255e+00 -1.6569481e+00
 -9.5430487e-01 -2.6639646e-01 -2.9983237e+00 -2.1904733e+00
 -3.8639185e-01  6.2347645e-01  3.7714205e+00 -1.0601476e+00
 -1.0001069e+00  8.8191777e-01 -1.4612850e+00 -1.5596120e+00
  1.3283343e+00  2.1000891e+00 -1.1779108e+00 -7.5683695e-01
 -1.7087047e+00 -7.1810716e-01 -6.1134970e-01 -2.4914935e-01
 -2.0177565e+00  7.2806549e-01  5.8099061e-01  1.4585537e+00
 -2.2939477e+00 -1.5291135e+00 -1.5099289e+00 -3.2321315e+00
  6.1497670e-01  2.3320947e+00 -3.6664815e+00  6.2144011e-01
 -1.7255962e+00 -1.7187570e+00  2.2244904e+00  3.1124595e-01
 -1.9233246e+00  2.9262867e+00 -6.4434284e-01  6.5985173e-01
  3.7461834e-03 -2.1112857e+00 -2.0878343e-01  1.0766115e+00
 -2.4515038e+00 -1.2157291e+00  6.6348791e-01  6.9953376e-01
 -5.6131536e-01 -1.6413348e+00  1.0857105e+00  1.4758211e+00
  2.8117948e+00 -5.8064111e-02  1.6607553e-01 -4.7130775e+00
 -3.7049344e-01  1.3344603e+00  3.2366285e-01  1.0987030e+00
 -1.6346524e+00  1.1686294e+00  2.3944382e-01 -7.4882221e-01
 -1.4736948e+00 -3.0995882e-01  2.0879261e+00  3.3721754e-01
 -1.7368939e+00 -6.8033016e-01 -2.5193825e+00  1.2684470e+00
  1.2490822e+00  2.9506516e+00 -1.3877697e-01 -6.4308608e-01
 -5.3445492e+00 -6.6599011e-01  1.7340769e-01 -3.0730164e+00
  1.4694253e+00  1.5803531e+00 -7.4518591e-02  1.3910463e+00
  2.6198113e-01  5.9126120e+00  4.1149664e+00  3.7905061e+00
  6.1283034e-01 -1.8621488e-01  4.1750160e-01  2.4414737e+00
 -4.9565382e+00  1.5875116e-01  1.8287042e+00 -1.5839664e+00
 -5.1569861e-01 -2.1890948e+00  6.2454796e-01  1.0140430e-01
  2.9440637e+00 -3.1108832e-01 -1.0851415e+00  1.9169365e+00
  1.4631879e-01 -5.2558589e-01  1.0758524e+00  1.8683195e+00
  4.1380458e+00 -6.2021714e-01  9.9774951e-01  1.1183604e+00
 -1.9896706e+00  1.4364551e+00  1.7458793e+00 -9.9778289e-01
  1.3292212e+00  2.3253212e-02 -1.7088943e+00  5.1940012e-01
  1.2845680e+00 -2.0319632e-01 -3.2012630e+00  1.5914700e+00]

In [23]:
encoder_inputs

{'input_word_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[  101,  7632,  1010,  1045,  1005,  1049,  7527,  1010,  1045,
          1005,  1049,  2183,  2000,  2022,  3015,  2055,  2129,  2023,
          2227,  2006,  7733,  2003,  1037,  3019,  2455, 14192,  2030,
          2065,  2045,  2003,  2166,  2006,  7733,  2008,  2081,  2009,
          1012,  1996,  2466,  2003,  2055,  2129,  9274,  2165,  1037,
          3861,  1997,  7733,  1998,  1037,  2227,  2001,  2464,  2006,
          1996,  4774,  1012,  9274,  2987,  1005,  1056,  2113,  2065,
          1996,  2455, 14192,  2001,  2580,  2011,  2166,  2006,  7733,
          1010,  2030,  2065,  2009,  2003,  2074,  1037,  3019,  2455,
         14192,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     