In [6]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
import pandas as pd


In [8]:
# Fixed seed so the train/validation/test partition is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows
# and the reported metrics cannot be reproduced.
SPLIT_SEED = 42

# NOTE(review): machine-specific path under the user's home directory; consider
# routing it through a configurable data directory.
twt_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train.csv')

# 70% of the rows go to training; the remaining 30% holdout is halved into
# validation and test (15% each). A separate `holdout` name keeps `test` from
# meaning two different things within the cell.
train, holdout = train_test_split(twt_data, test_size=0.3, random_state=SPLIT_SEED)
validation, test = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)
twt_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [38]:
# Rows per batch, shared by every split.
BATCH_SIZE = 5


def make_raw_dataset(frame, batch_size=BATCH_SIZE):
    """Build a batched tf.data.Dataset of (text, target) pairs from one split.

    Args:
        frame: DataFrame exposing `text` and `target` columns.
        batch_size: Number of rows grouped into each batch.

    Returns:
        A dataset yielding (text, target) batches in the frame's row order.
    """
    columns = (frame.text.values, frame.target.values)
    return tf.data.Dataset.from_tensor_slices(columns).batch(batch_size)


raw_train_ds = make_raw_dataset(train)
raw_test_ds = make_raw_dataset(test)
raw_val_ds = make_raw_dataset(validation)

In [39]:
# Report how many batches each raw split contains, in train / val / test order.
for message, dataset in (
    ("Number of batches in raw_train_ds: %d", raw_train_ds),
    ("Number of batches in raw_val_ds: %d", raw_val_ds),
    ("Number of batches in raw_test_ds: %d", raw_test_ds),
):
    print(message % tf.data.experimental.cardinality(dataset))

Number of batches in raw_train_ds: 1066
Number of batches in raw_val_ds: 229
Number of batches in raw_test_ds: 229


In [40]:
# It's important to take a look at your raw data to ensure your normalization
# and tokenization will work as expected. We can do that by taking a few
# examples from the training set and looking at them.
# This is one of the places where eager execution shines:
# we can just evaluate these tensors using .numpy()
# instead of needing to evaluate them in a Session/Graph context.

# Maximum number of examples to print from the first batch.
N_PREVIEW = 5

for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor once, and slice instead of indexing with a fixed
    # range so a batch shorter than N_PREVIEW cannot raise an IndexError.
    sample_texts = text_batch.numpy()[:N_PREVIEW]
    sample_labels = label_batch.numpy()[:N_PREVIEW]
    for sample_text, sample_label in zip(sample_texts, sample_labels):
        print('Features: {}, Target: {}'.format(sample_text, sample_label))

Features: b'I accidentally killed an 87 day snap streak and now I wanna accidentally fall off a cliff ????????????????????', Target: 1
Features: b'5 Seconds of Summer Is my pick for http://t.co/qcHV3JqOVK Fan Army #5SOSFAM http://t.co/gc0uDfnFgg  \xc3\x8c\xc3\x911', Target: 0
Features: b'https://t.co/4i0rKcbK1D\nSON OF SAVIOR LAVA VIDEO', Target: 0
Features: b'WFP - WFP Delivers Food To 165000 Bangladesh Flood Victims After Tropical Cyclone Komen: DHAKA \xc2\x89\xc3\x9b\xc3\x92The United Na... http://t.co/fukbBeDfGx', Target: 1
Features: b'Cross-border terrorism: Pakistan caught red-handed again - The Times of India http://t.co/uiqsfgZoOx', Target: 1


In [41]:
# Having looked at our data above, we see that the raw tweets contain links of
# the form 'http://t.co/...'. The default standardizer only lowercases and
# strips punctuation, which would collapse each link into a junk token, so we
# create a custom standardization function that removes URLs first.
def custom_standardization(input_data):
    """Lowercase the text, blank out URLs, then remove ASCII punctuation.

    Args:
        input_data: String tensor of raw text.

    Returns:
        String tensor with the same structure as `input_data` in which the
        text is lowercased, every `http://` / `https://` URL is replaced by a
        single space, and every character in `string.punctuation` is deleted.
    """
    lowercase = tf.strings.lower(input_data)
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # The replacement text has no back-references, so no capture group is needed.
    without_urls = tf.strings.regex_replace(lowercase, r"https?://[^\s]+", " ")
    return tf.strings.regex_replace(
        without_urls, "[%s]" % re.escape(string.punctuation), ""
    )


# Model constants.
max_features = 20000  # cap on the vocabulary size learned by the vectorizer
embedding_dim = 128
sequence_length = 500  # every example is padded/truncated to this many tokens

# Text vectorization layer: standardizes with the custom function above,
# splits with the default splitter, and maps each token to an integer id
# (output_mode="int"). The output length is fixed explicitly so downstream
# layers receive dense, equally sized sequences rather than ragged ones.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Build the vocabulary from the training text only: drop the labels to get a
# text-only dataset, then let the layer `adapt` to it.
text_ds = raw_train_ds.map(lambda tweets, _labels: tweets)
vectorize_layer.adapt(text_ds)

In [42]:
def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    A trailing axis is appended to `text` before it is passed through
    `vectorize_layer`; `label` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize each split, cache the vectorized batches, and prefetch
# asynchronously so input preparation overlaps with model execution.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [43]:
from tensorflow.keras import layers

# An integer input for vocab indices.
inputs = tf.keras.Input(dtype="int64", shape=(None,))

# Feature extractor, applied in order:
#   - embedding of vocab indices into an 'embedding_dim'-dimensional space,
#     followed by dropout
#   - two strided Conv1D layers and a global max pooling
#   - a vanilla hidden layer with dropout
feature_stack = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(128, 7, strides=3, padding="valid", activation="relu"),
    layers.Conv1D(128, 7, strides=3, padding="valid", activation="relu"),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
]

x = inputs
for layer in feature_stack:
    x = layer(x)

# Project onto a single output unit and squash it with a sigmoid.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Binary crossentropy loss, adam optimizer, accuracy as the reported metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [47]:
epochs = 3

# Fit the model on the training dataset, monitoring loss and accuracy on the
# validation dataset after every epoch.
# The returned History is kept in a variable so the per-epoch metrics can be
# inspected later instead of being lost with the cell output.
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x93de8510>

In [48]:
# Evaluate on the test dataset. The result is [loss, accuracy]: the binary
# crossentropy loss followed by the "accuracy" metric set in model.compile.
model.evaluate(test_ds)




[1.3379831314086914, 0.7469351887702942]

In [49]:
# Sanity check: print the first vectorized training batch, a tuple of
# (token ids, labels).
first_batch = train_ds.take(1)
for x in first_batch:
    print(x)

(<tf.Tensor: shape=(5, 500), dtype=int64, numpy=
array([[    8,  2127,   119, ...,     0,     0,     0],
       [  163,  1655,     6, ...,     0,     0,     0],
       [ 1883,     6,  7437, ...,     0,     0,     0],
       [ 3556,  3556,  1172, ...,     0,     0,     0],
       [12350,   481,   907, ...,     0,     0,     0]], dtype=int64)>, <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 1, 1], dtype=int64)>)
