In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [2]:
sentences = [
    'I love coding',
    'I hate coding',
    'I enjoy learning new technologies',
    'I dislike debugging late at night'
]

tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

{'<OOV>': 1, 'i': 2, 'coding': 3, 'love': 4, 'hate': 5, 'enjoy': 6, 'learning': 7, 'new': 8, 'technologies': 9, 'dislike': 10, 'debugging': 11, 'late': 12, 'at': 13, 'night': 14}
[[2, 4, 3], [2, 5, 3], [2, 6, 7, 8, 9], [2, 10, 11, 12, 13, 14]]


In [3]:
test_data = [
    'I admire creative ideas',
    'I avoid complex problems'
]

test_sequences = tokenizer.texts_to_sequences(test_data)

print(test_sequences)

[[2, 1, 1, 1], [2, 1, 1, 1]]


In [4]:
padded_sequences = pad_sequences(sequences, padding='post', truncating='post', maxlen=3)

print(padded_sequences)

[[ 2  4  3]
 [ 2  5  3]
 [ 2  6  7]
 [ 2 10 11]]


In [15]:
import pandas as pd

In [17]:
sarcasm_headlines = pd.read_json("../data/sarcasm-headlines.json", lines=True)

sentences = sarcasm_headlines['headline'].tolist()
labels = sarcasm_headlines['is_sarcastic'].tolist()
urls = sarcasm_headlines['article_link'].tolist()

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    sentences,
    labels,
    test_size=0.25,
    random_state=42
)

In [26]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, padding='post', truncating='post', maxlen=40)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, padding='post', truncating='post', maxlen=40)

In [27]:
from tensorflow.keras import layers

In [37]:
embedding_dim = 32

model = keras.Sequential([
    layers.Embedding(100, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(24, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
training_padded = tf.constant(training_padded)
testing_padded = tf.constant(testing_padded)
training_labels = tf.constant(training_labels)
testing_labels = tf.constant(testing_labels)

In [39]:
history = model.fit(training_padded, training_labels, epochs=30, validation_data=(testing_padded, testing_labels), verbose=1)

Epoch 1/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7638 - loss: 0.4693 - val_accuracy: 0.7492 - val_loss: 0.4866
Epoch 2/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7633 - loss: 0.4686 - val_accuracy: 0.7525 - val_loss: 0.4897
Epoch 3/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7647 - loss: 0.4686 - val_accuracy: 0.7574 - val_loss: 0.4827
Epoch 4/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7656 - loss: 0.4682 - val_accuracy: 0.7591 - val_loss: 0.4795
Epoch 5/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7661 - loss: 0.4683 - val_accuracy: 0.7529 - val_loss: 0.4844
Epoch 6/30
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7664 - loss: 0.4680 - val_accuracy: 0.7552 - val_loss: 0.4812
Epoch 7/30
[1m626/626[0m 

In [40]:
sentence = [
    "Oh great, another Monday—just what I needed!",
    "I’m excited to start my new project today."
]

sequences = tokenizer.texts_to_sequences(sentence)

padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=40)

print(model.predict(padded))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[[0.06529946]
 [0.11236033]]
