<a href="https://colab.research.google.com/github/bodamohannaik/DLAI-TF-DPC/blob/master/C3/W1/tensorflow_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import tensorflow as tf

# Tokenizer

In [2]:
# Toy corpus used to fit the tokenizer. Note the double quote in `don"t`:
# the fitted vocabulary shows it split into the two tokens `don` and `t`.
sentences = [
    'I eat mango',
    'I eat banana',
    'I don"t eat grapes',
]

In [3]:
# Word-level tokenizer: lower-cases the text, is configured with a vocabulary
# cap of 50, and uses "<OOV>" as the placeholder for words not seen in fitting.
# NOTE(review): the TensorFlow docs mark this class as deprecated in favour of
# tf.keras.layers.TextVectorization -- confirm against the TF version in use.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=50,
    lower=True,
    oov_token="<OOV>",
)

In [4]:
# Build the vocabulary (word counts and word<->index maps) from the sample sentences.
tokenizer.fit_on_texts(sentences)

In [5]:
# The vocabulary cap that was passed to the constructor.
tokenizer.num_words

50

In [6]:
# Number of texts the tokenizer was fitted on (one per sentence).
tokenizer.document_count

3

In [7]:
# The placeholder token configured for out-of-vocabulary words.
tokenizer.oov_token

'<OOV>'

In [8]:
# Per-word occurrence counts gathered during fitting (keys are lower-cased).
tokenizer.word_counts

OrderedDict([('i', 3),
             ('eat', 3),
             ('mango', 1),
             ('banana', 1),
             ('don', 1),
             ('t', 1),
             ('grapes', 1)])

In [9]:
# Index -> word lookup. Index 1 is the OOV token; the fitted words follow,
# with the most frequent words receiving the lowest indices.
tokenizer.index_word

{1: '<OOV>',
 2: 'i',
 3: 'eat',
 4: 'mango',
 5: 'banana',
 6: 'don',
 7: 't',
 8: 'grapes'}

In [10]:
# Word -> index lookup (the inverse mapping of index_word).
tokenizer.word_index

{'<OOV>': 1,
 'banana': 5,
 'don': 6,
 'eat': 3,
 'grapes': 8,
 'i': 2,
 'mango': 4,
 't': 7}

In [11]:
# Encode each sentence as a list of word indices.
tokenizer.texts_to_sequences(sentences)

[[2, 3, 4], [2, 3, 5], [2, 6, 7, 3, 8]]

In [12]:
# Words absent from the fitted vocabulary ("you", "rice") are encoded as the
# OOV index; "eat" keeps its own index.
tokenizer.texts_to_sequences(['you eat rice'])

[[1, 3, 1]]

# Padding sequences

In [13]:
# Same corpus plus one sentence whose last three words ("rice", "and",
# "bread") were not seen when the tokenizer was fitted.
# NOTE(review): this rebinds `sentences`; re-running an earlier cell after
# this one would pick up the four-sentence list.
sentences = [
    'I eat mango',
    'I eat banana',
    'I don"t eat grapes',
    'I eat rice and bread',
]

In [14]:
# Encode the extended corpus with the already-fitted tokenizer (it is not
# re-fitted here), so the unseen words come out as the OOV index.
tokenizer.texts_to_sequences(sentences)

[[2, 3, 4], [2, 3, 5], [2, 6, 7, 3, 8], [2, 3, 1, 1, 1]]

In [15]:

# Encode the sentences, then bring every sequence to a fixed length of 10:
# shorter ones get trailing zeros, longer ones would be cut at the end.
tf.keras.preprocessing.sequence.pad_sequences(
    sequences=tokenizer.texts_to_sequences(texts=sentences),
    maxlen=10,
    padding='post',
    truncating='post',
    value=0,
)

array([[2, 3, 4, 0, 0, 0, 0, 0, 0, 0],
       [2, 3, 5, 0, 0, 0, 0, 0, 0, 0],
       [2, 6, 7, 3, 8, 0, 0, 0, 0, 0],
       [2, 3, 1, 1, 1, 0, 0, 0, 0, 0]], dtype=int32)

# Sarcasm Headlines Dataset

In [16]:
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2022-05-14 05:51:39--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.125.128, 142.250.157.128, 142.251.8.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.125.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2022-05-14 05:51:39 (158 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [17]:
import json

# Parse the downloaded dataset. The encoding is passed explicitly so that
# reading the file does not depend on the platform's default text encoding.
with open('sarcasm.json', 'r', encoding='utf-8') as fp:
    sarcasm = json.load(fp)

In [18]:
# Type of the top-level JSON value.
type(sarcasm)

list

In [19]:
# Type of a single record.
type(sarcasm[0])

dict

In [20]:
# Peek at the first record to see which fields are available.
sarcasm[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [21]:
# Split the records into three parallel lists (same order as `sarcasm`):
# article URLs, headline texts and the is_sarcastic labels.
links, headlines, labels = [], [], []
for record in sarcasm:
    links.append(record['article_link'])
    headlines.append(record['headline'])
    labels.append(record['is_sarcastic'])

In [23]:
# Number of headlines (one per record).
len(headlines)

26709

In [33]:
# Fit a second tokenizer on the headlines, configured with num_words=500.
sarcasm_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=500,
    lower=True,
    split=' ',
    oov_token='<OOV>',
)
sarcasm_tokenizer.fit_on_texts(headlines)

# Size of the fitted index -> word mapping. It covers every word seen while
# fitting and is not truncated to num_words.
len(sarcasm_tokenizer.index_word)

29657

In [34]:
# Encode every headline as a list of word indices.
sequence_headlines = sarcasm_tokenizer.texts_to_sequences(headlines)

In [35]:
# Encoded form of the first headline. The repeated 1s are presumably the OOV
# index for words outside the num_words=500 cap -- verify against word_index.
sequence_headlines[0]

[308, 1, 1, 1, 1, 48, 382, 1, 1, 6, 1, 1]

In [36]:
# NOTE(review): this cell repeats the previous cell's expression and shows the
# same value; it is a candidate for removal.
sequence_headlines[0]

[308, 1, 1, 1, 1, 48, 382, 1, 1, 6, 1, 1]

In [37]:
# Pad the first ten encoded headlines with trailing zeros. With maxlen=None no
# fixed length is imposed; rows are padded to the longest sequence in this batch.
tf.keras.preprocessing.sequence.pad_sequences(
    sequence_headlines[:10],
    maxlen=None,
    padding="post",
    truncating="post",
    value=0,
)

array([[308,   1,   1,   1,   1,  48, 382,   1,   1,   6,   1,   1,   0,
          0],
       [  4,   1,   1,   1,  22,   2, 166,   1, 416,   1,   6, 258,   9,
          1],
       [145,   1,   2,   1,   1,   1,   1,   1, 221, 143,  39,  46,   2,
          1],
       [  1,  36, 224, 400,   2,   1,  29, 319,  22,  10,   1,   1,   1,
          1],
       [  1,   1,   1,   1,   1,   1,   1,   5,   4,  95,   1,  92,   0,
          0],
       [  1,   4, 365,  73,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [  4,   1, 351,   6, 461,   1,   1,   1,   0,   0,   0,   0,   0,
          0],
       [ 19, 479,  39,   1,  31, 155,   2,  99,  83,  18, 158,   6,  32,
        352],
       [249,   1,   1,   1,   1,   1, 141,   0,   0,   0,   0,   0,   0,
          0],
       [  1, 326, 347, 401,  60,   1,   6,   4,   1,   0,   0,   0,   0,
          0]], dtype=int32)