In [1]:
# Import the packages
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Define the parameters

# Size of the vocabulary
vocab_size = 10000

# Embedding dimension
embedding_dim = 16

# Max length of the sentences
max_length = 100

# Type of truncature
trunc_type = 'post'

# Type of padding
padding_type = 'post'

# Name of the token oov
oov_tok = "<OOV>"

# Training size
training_size = 20000

In [3]:
# Download the data
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json


--2019-10-25 19:48:35--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
R'esolution de storage.googleapis.com (storage.googleapis.com)... 172.217.168.240
Connexion `a storage.googleapis.com (storage.googleapis.com)|172.217.168.240|:443... connect'e.
requ^ete HTTP transmise, en attente de la r'eponse... 200 OK
Taille : 5643545 (5.4M) [application/json]
Sauvegarde en : << /tmp/sarcasm.json >>


2019-10-25 19:48:40 (1.25 MB/s) - << /tmp/sarcasm.json >> sauvegard'e [5643545/5643545]



In [4]:
with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

# Save the data in lists - list of sentences and list of labels
sentences = []
labels = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

In [5]:
# Separate to have training + testing
# Sentences
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
# Labels
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [6]:
# Define the tokenizer
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
# Fit the tokenizer with the training sentences
tokenizer.fit_on_texts(training_sentences)
# Get the dictionary of words
word_index = tokenizer.word_index
# Transform the training sentences in sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
# Pad the sequences
training_padded = pad_sequences(training_sequences, maxlen = max_length, padding = 'post', truncating = 'post')

In [7]:
# padding: String, 'pre' or 'post': pad either before or after each sequence.
# truncating: String, 'pre' or 'post': remove values from sequences larger than

In [8]:
training_padded[0]

array([ 328,    1,  799, 3405, 2404,   47,  389, 2214,    1,    6, 2614,
       8863,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [9]:
# Transform test sentences in sequences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
# Pad the test sequences
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [10]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(), # Add Global Average Pooling 1D
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

# Define the learning algorithm
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [12]:
print(training_padded.shape)
print(len(training_labels))
print(training_padded[0])
print(training_labels[0])

(20000, 100)
20000
[ 328    1  799 3405 2404   47  389 2214    1    6 2614 8863    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
0


In [13]:
print(testing_padded.shape)
print(len(testing_labels))

(6709, 100)
6709


In [14]:
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs = 30)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'int'>"})

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(training_padded[0]))
print(training_sentences[2])
print(labels[2])

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


In [None]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [None]:
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')

In [None]:
sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(model.predict(padded))