In [1]:
import os
import shutil

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

from sklearn import preprocessing
from transformers import AutoModel

tf.get_logger().setLevel('ERROR')



In [3]:
base_path = "../datasets/ecommerce"
df_full = pd.read_csv(f"{base_path}/sample_products.csv", header=None)

In [4]:
le = preprocessing.LabelEncoder()
df_full["target"] = le.fit_transform(df_full[0])

In [5]:
df_full

Unnamed: 0,0,1,target
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,3
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",3
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,3
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",3
4,Household,Incredible Gifts India Wooden Happy Birthday U...,3
...,...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,2
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,2
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,2
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",2


In [6]:
train, test = train_test_split(df_full, test_size=0.2, stratify=df_full[0])

In [7]:
# list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
list_classes = "target"

x_train = train[1]
y_train = train[list_classes]

x_test = test[1]
y_test = test[list_classes]

In [22]:
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)
x_test = np.asarray(x_test).astype('str')
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

In [10]:
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [11]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [13]:
text_test = x_train[:10]
text_preprocessed = bert_preprocess_model(text_test)
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')


Keys       : ['input_word_ids', 'input_mask', 'input_type_ids']
Shape      : (10, 128)
Word Ids   : [  101 25283  5092  6081  5371 19622  2007  2410 10306  1010  5047  1010]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')


Loaded BERT: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Pooled Outputs Shape:(10, 512)
Pooled Outputs Values:[ 0.972496    0.4619606  -0.24892116  0.2246686   0.5346749   0.9987924
  0.9808407  -0.7584635  -0.4017263  -0.71385735 -0.09148715 -0.9505471 ]
Sequence Outputs Shape:(10, 128, 512)
Sequence Outputs Values:[[ 0.8073148  -0.5198374  -0.02489168 ... -0.43454123 -1.1999257
   0.54391646]
 [ 0.75918365 -0.5762162   0.20485923 ... -0.322659   -0.08884247
  -0.05090211]
 [ 0.12971401 -0.7094554   0.7334735  ... -0.0250516  -0.7185406
   0.6864215 ]
 ...
 [ 0.11033924  0.04044639 -0.4643631  ... -0.09314542  1.0071619
   1.6363361 ]
 [ 0.5230737  -0.60592854  0.02337012 ...  0.23626448 -0.36064634
   0.92251474]
 [-0.00545413  0.51662517  0.04731714 ... -0.5927674   0.6675662
   1.2395232 ]]


In [26]:
num_classes = np.max(y_train) + 1
num_classes

4

In [27]:
def build_classifier_model(num_classes):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(128, activation="relu")(net)
    
    
    net = tf.keras.layers.Dense(num_classes, activation="softmax", name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [28]:
classifier_model = build_classifier_model(num_classes)
bert_raw_result = classifier_model(tf.constant(text_test))
np.argmax(bert_raw_result, axis=1)

2022-08-01 10:32:29.248523: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 62509056 exceeds 10% of free system memory.


array([2, 1, 0, 2, 1, 2, 2, 1, 2, 1])

In [57]:
classifier_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [48]:
epochs = 5
# steps_per_epoch = num_classes
# num_train_steps = steps_per_epoch * epochs
# num_warmup_steps = int(0.1*num_train_steps)

# init_lr = 3e-5
# optimizer = optimization.create_optimizer(init_lr=init_lr,
#                                           num_train_steps=num_train_steps,
#                                           num_warmup_steps=num_warmup_steps,
#                                           optimizer_type='adamw')


# metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
metrics = tf.keras.metrics.CategoricalAccuracy('accuracy')

classifier_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

In [55]:
train_ds.take(1).element_spec

(TensorSpec(shape=(None,), dtype=tf.string, name=None),
 TensorSpec(shape=(None,), dtype=tf.int64, name=None))

In [56]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x_train, y_train,
                               # validation_data=test_ds,
                               callbacks=[callback],
                               epochs=epochs)


Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/5


ValueError: in user code:

    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/losses.py", line 1787, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/joker/.virtualenvs/tensorflow_lastest/lib/python3.9/site-packages/keras/backend.py", line 5119, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 4) are incompatible


In [None]:
dataset_name = 'toxic'
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

classifier_model.save(saved_model_path, include_optimizer=False)



In [31]:
def print_my_examples(inputs, results):
    result_for_printing = \
    [f'input: {inputs[i]:<30} : score: {results[i][0]:.6f}'
                         for i in range(len(inputs))]
    print(*result_for_printing, sep='\n')
    print()


In [45]:
examples = [
    "anDisk Extreme 64GB CompactFlash Memory Card UDMA 7 Speed Up To 120MB/s 64GB Storage Capacity Read Speed up to 120 MB/s Write Speed up to 85 MB/s UDMA 7 Compliant The SanDisk 64 GB Extreme CompactFlash Memory Card provides fast, reliable photo and video capture. This card features a read speed of up to 120 MB/s and a write speed of up to 85 MB/s. Ultra Direct Memory Access 7 (UDMA-7) ensures optimal performance. When paired with a UDMA-compliant DSLR camera, this card guarantees fast, high-quality photo and video capture. Sustained Performance for Any Situation The optimal combination of shot speed (up to 85MB/s1) sustained video performance guarantee (VPG-20)3, and transfer speed (up to 120MB/s) Ideal for use with mid-range to high-end DSLR cameras and HD camcorders, the SanDisk Extreme CompactFlash Memory Card delivers first-rate read/write speeds to catch fast action shots and enable quick file transfers. This memory card features Video Performance Guarantee (VPG-20) to deliver a minimum sustained recording data rate of 20MB/s3 to support high-quality Full HD video (1080p)4 recording. Take advantage of burst-mode photography with the card's write speeds of up to 85MB/s1 (567X) and enjoy efficient workflow with its transfer speeds up to 120MB/s2. With capacities up to 128GB5, this memory card provides plenty of storage for Full HD videos and RAW photos. Exceptional Shot to Shot Performance With write speeds of up to 85MB/s1, the SanDisk Extreme CompactFlash Memory Card adds to your mid-range to-high-range DSLR's performance during burst-mode shooting, rapid shots, and RAW plus JPEG capture. The card records photos almost instantly, ensuring you will catch your best shot. Read speeds of up to 120MB/s2 make transferring images to your computer fast and simple. Professional-Grade Video Capture Featuring a Video Performance Guarantee (VPG-20)3 profile specification, the SanDisk Extreme CompactFlash memory card can keep up with the steep memory demands of professional video equipment such as HD camcorders",
    "Dell KB216 (HVG5J) Multimedia Keyboard (Black) Progress lives at the intersection of technology and humanity. Our connected world is undergoing its latest digital transformation-changing industries and creating fundamental shifts in the way we work and live."
]

print(classifier_model(tf.constant(examples)))

original_results = classifier_model(tf.constant(x_test[:50]))
predictions = np.argmax(original_results, axis=1)

print(le.inverse_transform(predictions))
le.inverse_transform(y_test[:50])

# print('Results from the model in memory:')
# print_my_examples(examples, original_results)


tf.Tensor(
[[0.14644575 0.12879334 0.30365846 0.42110246]
 [0.46102217 0.08755876 0.16546461 0.28595456]], shape=(2, 4), dtype=float32)
['Household' 'Household' 'Household' 'Household' 'Household' 'Household'
 'Household' 'Household' 'Household' 'Household' 'Household' 'Household'
 'Household' 'Household' 'Household' 'Household' 'Household' 'Electronics'
 'Household' 'Household' 'Household' 'Books' 'Books' 'Household' 'Books'
 'Books' 'Household' 'Household' 'Household' 'Books' 'Household'
 'Household' 'Household' 'Household' 'Household' 'Household' 'Household'
 'Household' 'Household' 'Household' 'Household' 'Electronics' 'Household'
 'Books' 'Clothing & Accessories' 'Household' 'Books' 'Household'
 'Household' 'Books']


array(['Household', 'Household', 'Household', 'Household', 'Household',
       'Household', 'Household', 'Household', 'Household', 'Electronics',
       'Clothing & Accessories', 'Clothing & Accessories', 'Household',
       'Household', 'Electronics', 'Household', 'Electronics',
       'Clothing & Accessories', 'Household', 'Clothing & Accessories',
       'Household', 'Household', 'Books', 'Household',
       'Clothing & Accessories', 'Books', 'Clothing & Accessories',
       'Electronics', 'Electronics', 'Books', 'Household', 'Household',
       'Household', 'Electronics', 'Electronics', 'Household',
       'Electronics', 'Electronics', 'Electronics', 'Household',
       'Household', 'Electronics', 'Household', 'Books', 'Household',
       'Household', 'Books', 'Household', 'Clothing & Accessories',
       'Books'], dtype=object)

In [None]:
# serving_results = reloaded_model \
#             .signatures['serving_default'](tf.constant(examples))

# serving_results = tf.sigmoid(serving_results['classifier'])

# print_my_examples(examples, serving_results)

In [None]:
/content/toxic_bert

In [None]:
import shutil
dest = "/gdrive/MyDrive/models/bert"

shutil.copytree(saved_model_path, dest)

'/gdrive/MyDrive/models/bert'

In [None]:
# max_seq_length = 128

# packer = tfm.nlp.layers.BertPackInputs(
#     seq_length=max_seq_length,
#     special_tokens_dict = tokenizer.get_special_tokens_dict())

In [None]:
# class BertInputProcessor(tf.keras.layers.Layer):
#   def __init__(self, tokenizer, packer):
#     super().__init__()
#     self.tokenizer = tokenizer
#     self.packer = packer

#   def call(self, inputs):
#     tok1 = self.tokenizer(inputs)

#     packed = self.packer(tok1)

#     return packed

In [None]:
# bert_inputs_processor = BertInputProcessor(tokenizer, packer)

# x_train_ds = bert_inputs_processor(x_train[:1000])
# example_inputs = next(iter(x_train_ds))

# example_inputs
# import json

# bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
# config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())
# config_dict
# encoder_config = tfm.nlp.encoders.EncoderConfig({
#     'type':'bert',
#     'bert': config_dict
# })

# bert_encoder = tfm.nlp.encoders.build_encoder(encoder_config)
# bert_encoder
# bert_classifier = tfm.nlp.models.BertClassifier(network=bert_encoder, num_classes=6)
# tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)

# x_train_ds
# bert_classifier(
#     x_train_ds, training=True).numpy()[:10]
# # y_cat_train = np.argmax(y_train.values, axis=1)
# # y_cat_test = np.argmax(y_test.values, axis=1)
# train_dataset = tf.data.Dataset.from_tensor_slices((train_data, y_train)).batch(64)
# # test_dataset = tf.data.Dataset.from_tensor_slices((test_data, y_test)).batch(64)
# bert_classifier, bert_encoder = classifier_model(bert_config, num_labels=6)
# checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
# checkpoint.read(
#     os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()
# # Set up epochs and steps
# epochs = 3
# batch_size = 32
# eval_batch_size = 32

# train_data_size = len(y_train)
# steps_per_epoch = int(train_data_size / batch_size)
# num_train_steps = steps_per_epoch * epochs
# warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# # creates an optimizer with learning rate schedule
# optimizer = nlp.optimization.create_optimizer(
#     2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)


# metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# bert_classifier.compile(
#     optimizer=optimizer,
#     loss=loss,
#     metrics=metrics)


# history = bert_classifier.fit(
#       train_data,
#       epochs=epochs)
# train_data
