In [1]:
# Import the libraries

import os
import shutil
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from tensorflow.keras import layers
import math
import matplotlib.pyplot as plt
import bert
import random
import numpy as np

tf.get_logger().setLevel('ERROR')

### Import the dataset

In [2]:
df_clean = pd.read_pickle('saved_objects/df_clean_2.pkl') # Read the input data

In [3]:
# changing the label 'other' = 0 and 'family_related' = 1
df_clean['label'] = label_binarize(df_clean['label'].to_numpy(), classes=['other', 'family_related'])

In [4]:
df_clean

Unnamed: 0,jokes_text,label
1,happens frog car break toad away,0
2,duck bought lipstick put bill,0
4,boy sell fish corner customer attention yell d...,0
5,witch team lose baseball game bat flew away,0
6,kangaroo jump high empire state building cours...,0
...,...,...
2415,easy count binary easy,0
2416,put phone airplane mode sure flyin,0
2417,many light bulb take change light bulb bright ...,0
2418,bill gate apple store fart apple fault window,0


In [5]:
jokes_text = df_clean['jokes_text'].to_list()
label = df_clean['label'].to_list()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(jokes_text, label, stratify=label, test_size=0.3) # 30 percent for testing

In [7]:
dataset = tf.data.Dataset.from_tensor_slices(([[text] for text in X_train], [[target] for target in y_train]))

In [8]:
test_data = tf.data.Dataset.from_tensor_slices(([[text] for text in X_test], [[target] for target in y_test]))

In [9]:
for features_tensor, target_tensor in dataset.take(1):
    print(f'features:{features_tensor} target:{target_tensor}')

features:[b'bad find cancer find curable'] target:[1]


### Tokenize Data

In [14]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [15]:
def tokenize_jokes(text_jokes):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_jokes))

In [16]:
tokenized_jokes = [tokenize_jokes(jokes) for jokes in jokes_text]

In [17]:
jokes_with_len = [[jokes, label[i], len(jokes)]
                 for i, jokes in enumerate(tokenized_jokes)]

In [18]:
random.shuffle(jokes_with_len)

In [19]:
jokes_with_len.sort(key=lambda x: x[2])

In [20]:
sorted_jokes_labels = [(jokes_lab[0], jokes_lab[1]) for jokes_lab in jokes_with_len]

In [21]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_jokes_labels, output_types=(tf.int32, tf.int32))

In [22]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [23]:
TOTAL_BATCHES = math.ceil(len(sorted_jokes_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

### CREATE MODEL ==> METHOD 1

In [24]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [25]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [26]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [28]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [30]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x176d10390>

In [31]:
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  6104400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0

In [33]:
results = text_model.evaluate(test_data)
print(results)

[0.7056457996368408, 0.7916666865348816]


### CREATE MODEL ==> METHOD 2

In [13]:
# Downloading small bert for fine tuning

tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [70]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [71]:
classifier_model = build_classifier_model()

In [72]:
classifier_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
preprocessing (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'sequence_output':  28763649    preprocessing[0][0]              
                                                                 preprocessing[0][1]              
                                                                 preprocessing[0][2]              
____________________________________________________________________________________________

In [73]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [74]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [75]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [76]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=dataset, epochs=epochs)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
results = classifier_model.evaluate(test_data)
print(results)

[1.1259980201721191, 0.8402062058448792]
