<a href="https://colab.research.google.com/github/aliakbarbadri/nlp-tf/blob/master/week1-examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

# Lesson 1 and Lesson 2

In [0]:
train_sentences = [
                   "i love my dog",
                   "I, love my dog!",
                   "i love my cat",
                   "this is a longer sentence to test padding"
]

In [3]:
tokenizer = Tokenizer(num_words=100, oov_token="<oov>")
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
word_index

{'<oov>': 1,
 'a': 9,
 'cat': 6,
 'dog': 5,
 'i': 2,
 'is': 8,
 'longer': 10,
 'love': 3,
 'my': 4,
 'padding': 14,
 'sentence': 11,
 'test': 13,
 'this': 7,
 'to': 12}

In [4]:
sequences = tokenizer.texts_to_sequences(train_sentences)
sequences

[[2, 3, 4, 5], [2, 3, 4, 5], [2, 3, 4, 6], [7, 8, 9, 10, 11, 12, 13, 14]]

In [5]:
padded = pad_sequences(sequences, maxlen=8, padding="pre")
padded

array([[ 0,  0,  0,  0,  2,  3,  4,  5],
       [ 0,  0,  0,  0,  2,  3,  4,  5],
       [ 0,  0,  0,  0,  2,  3,  4,  6],
       [ 7,  8,  9, 10, 11, 12, 13, 14]], dtype=int32)

In [0]:
test_sentences = [
                  "Stress will kill you!",
                  "I love you"
]

In [7]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_sequences

[[1, 1, 1, 1], [2, 3, 1]]

In [8]:
padded = pad_sequences(test_sequences, padding="post")
padded

array([[1, 1, 1, 1],
       [2, 3, 1, 0]], dtype=int32)

# Lesson 3

In [9]:
!wget https://raw.githubusercontent.com/aliakbarbadri/nlp-tf/master/Sarcasm_Headlines_Dataset_v2.json \
    -O /tmp/sarcasm.json

--2020-05-12 12:23:53--  https://raw.githubusercontent.com/aliakbarbadri/nlp-tf/master/Sarcasm_Headlines_Dataset_v2.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6057046 (5.8M) [text/plain]
Saving to: ‘/tmp/sarcasm.json’


2020-05-12 12:23:53 (50.7 MB/s) - ‘/tmp/sarcasm.json’ saved [6057046/6057046]



In [10]:
sarcasm_data = [json.loads(line) for line in open('/tmp/sarcasm.json', 'r')]
len(sarcasm_data), sarcasm_data[0]

(28619,
 {'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205',
  'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
  'is_sarcastic': 1})

In [0]:
sarcasm_sentences = [] 
sarcasm_labels = []
sarcasm_urls = []
for item in sarcasm_data:
    sarcasm_sentences.append(item['headline'])
    sarcasm_labels.append(item['is_sarcastic'])
    sarcasm_urls.append(item['article_link'])

In [12]:
tokenizer.fit_on_texts(sarcasm_sentences)
sarcasm_word_index = tokenizer.word_index
print("sarcasm_word_index length =",len(sarcasm_word_index))
for key in list(sarcasm_word_index)[0:20]:
    print(key,sarcasm_word_index[key])

sarcasm_word_index length = 30886
<oov> 1
to 2
of 3
the 4
in 5
for 6
a 7
on 8
and 9
with 10
is 11
new 12
trump 13
man 14
at 15
from 16
about 17
by 18
after 19
you 20


In [13]:
sarcasm_sequences = tokenizer.texts_to_sequences(sarcasm_sentences)
sarcasm_padded = pad_sequences(sarcasm_sequences, padding='post')
sarcasm_padded.shape, sarcasm_padded[0]

((28619, 152),
 array([1, 1, 1, 1, 1, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32))