In [1]:
import json

In [64]:
def parse_data(file):
    """Lazily yield one decoded JSON object per line of a JSON-lines file.

    Args:
        file: Path of the file to read; every non-blank line must contain
            one complete JSON document.

    Yields:
        The Python object decoded from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle as soon as the generator is
    # exhausted or closed, rather than leaving it to the garbage collector.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Blank lines (e.g. a trailing newline at end of file) hold no
            # record and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)
        
# Read the whole line-delimited JSON file into memory as a list of decoded records.
datastore = [*parse_data('Sarcasm_Headlines_Dataset.json')]

In [65]:
# Start from three independent empty lists, one per record field
# (headline text, sarcasm label, article URL).
sentences, labels, urls = [], [], []

In [66]:
# Rebuild the three parallel lists from scratch rather than appending to
# existing ones, so re-running this cell does not duplicate every record.
# Index i of each list refers to the same record of `datastore`.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [67]:
# Peek at the first ten headlines to sanity-check the extraction.
sentences[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [68]:
# First ten labels, taken from each record's 'is_sarcastic' field.
labels[:10]

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0]

In [69]:
# First ten article links, in the same record order as the headlines and labels.
urls[:10]

['https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
 'https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15e64fdcb',
 'https://www.huffingtonpost.com/entry/advancing-the-worlds-women_b_6810038.html',
 'https://www.huffingtonpost.com/entry/how-meat-is-grown-in-a-lab_us_561d1189e4b0c5a1ce607e86',
 'https://www.huffingtonpost.com/entry/boxed-college-tuition-ben_n_7445644.html',
 'https://politics.theonion.com/top-snake-handler-leaves-sinking-huckabee-campaign-1819578231',
 'https://www.huffingtonpost.com/entry/fridays-morning-email-inside-trumps-presser-for-the-ages_us_58a6e33ee4b07602ad53a315']

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Small hand-written corpus used to demonstrate the tokenizer.
# NOTE(review): this rebinds `sentences`, the name earlier cells filled with
# dataset headlines; when the notebook is run top to bottom the headlines are
# no longer reachable under that name afterwards. Consider a distinct name
# (e.g. `toy_sentences`) and update the cells that consume it.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

### What Does a Tokenizer Do?
1. Fitting on Text Data
    - The tokenizer is fitted on a corpus of text, meaning it processes all the words in the dataset and creates a vocabulary (a mapping of words to unique integer indices).
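    - For example, if your text data contains sentences like "I love programming" and "Programming is fun," the tokenizer builds a dictionary of all unique words, e.g. {'i': 1, 'love': 2, 'programming': 3, 'is': 4, 'fun': 5}. (Illustrative only: the Keras `Tokenizer` lowercases text by default and gives lower indices to more frequent words, so the real numbering can differ.)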
2. Text-to-Integer Conversion
    - The tokenizer converts each word in the text data to its corresponding integer index.
    - For example, the sentence "I love programming" is converted to [1, 2, 3].
3. Handling Out-of-Vocabulary Words
    - The tokenizer can be configured to handle out-of-vocabulary words in different ways. For example, it can replace them with a special token like `<UNK>` or ignore them altogether.
    - In this notebook we use `<OOV>` (passed as `oov_token`), which receives index 1 in the vocabulary.
4. Optional Preprocessing
    - The tokenizer can be configured to preprocess the text data in different ways, such as converting all words to lowercase or removing punctuation.
5. Padding Sequences
    - Padding is not performed by the tokenizer itself; the separate `pad_sequences` utility pads (or truncates) the integer sequences to a common length, which is useful when training neural networks that require inputs of the same length.
    - For example, with `padding="post"` and a maximum sequence length of 5, the sentence "I love programming" is padded to [1, 2, 3, 0, 0].


In [6]:
# Reserve the "<OOV>" token for words that are missing from the fitted vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")

In [7]:
# Build the tokenizer's vocabulary from the current contents of `sentences`.
tokenizer.fit_on_texts(sentences)

In [16]:
# Word -> integer index mapping produced by fitting; as the output shows,
# the OOV token holds index 1 and the words are stored lowercased.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [11]:
# Replace every word with its index from the fitted vocabulary, one list per sentence.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [12]:
# Right-pad every sequence with zeros up to the length of the longest one.
# No maxlen is given, so no sequence exceeds the target length and the
# `truncating` setting has no effect here.
padded_sequences = pad_sequences(
    sequences,
    padding="post",
    truncating="post",
)
padded_sequences

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  0,  0,  0],
       [ 8,  6,  9,  2,  4, 10, 11]])

In [13]:
# Held-out sentences containing words that are absent from the fitted
# vocabulary ('really', 'dogs', 'loves', 'mushrooms').
test_data = [
    'i really love my dogs',
    'my dog loves my mushrooms'
]

In [14]:
# Encode with the already-fitted tokenizer (no refit); as the output shows,
# words it has not seen come back as index 1, the OOV token.
test_seq = tokenizer.texts_to_sequences(test_data)
test_seq

[[5, 1, 3, 2, 1], [2, 4, 1, 2, 1]]

In [15]:
# Zero-pad the encoded test sentences on the right to a fixed width of 10.
padded_test_seq = pad_sequences(
    test_seq,
    maxlen=10,
    padding="post",
)
padded_test_seq

array([[5, 1, 3, 2, 1, 0, 0, 0, 0, 0],
       [2, 4, 1, 2, 1, 0, 0, 0, 0, 0]])