In [1]:
import tensorflow as tf
from tensorflow.keras import models, Model, mixed_precision
from tensorflow.keras.layers import *
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import pandas as pd
import json
import os, sys
import re

# Runtime setup: let TensorFlow allocate GPU memory on demand and enable float16 compute.
# Every visible GPU is configured, and a machine without any GPU no longer fails with an
# IndexError on `physical_devices[0]`.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    # Returns None; must run before the GPU is first used by TensorFlow.
    tf.config.experimental.set_memory_growth(gpu, True)
if physical_devices:
    # Mixed precision is only worthwhile on GPUs; on CPU the default float32 policy is kept.
    mixed_precision.set_global_policy('mixed_float16')
print(f'Running on Python {sys.version}, Tensorflow {tf.__version__}.')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080 Ti, compute capability 8.6
Running on Python 3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)], Tensorflow 2.8.0-rc0.


In [2]:
# Load data
# The listing dump is JSON, which is UTF-8 by specification. The encoding is passed
# explicitly so the result does not depend on the platform's locale code page
# (e.g. cp1252 on Windows), which would mis-decode non-ASCII characters.
with open('1137582001-1137583000.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def clean(text):
    """Return `text` reduced to ASCII with surrounding whitespace removed.

    Every run of one or more non-ASCII characters is collapsed into a single
    space, then leading and trailing whitespace is stripped. Newlines and
    underscores inside the text are left untouched.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return ascii_only.strip()


# Assemble parallel arrays (title, category string, description, image path) from `data`,
# keeping only records whose text survives ASCII cleaning mostly intact.
title = []
category = []
desc = []
img = []
data_loss_allowed = 0.8  # each cleaned field must keep more than 80% of its original length, else the record is ditched
for item_id, item in data.items():
    # The category list is flattened into one comma-separated string. It is built locally
    # rather than written back into `data`, so re-running this cell still sees the original
    # list (joining an already-joined string would split it into single characters).
    raw_fields = (item['name'], ', '.join(item['categories']), item['desc'])
    cleaned_fields = [clean(raw) for raw in raw_fields]
    # Strict `>`: a record is also dropped when any original field is empty (0 > 0 is False).
    if all(len(cleaned) > len(raw) * data_loss_allowed
           for cleaned, raw in zip(cleaned_fields, raw_fields)):
        new_title, new_cat, new_desc = cleaned_fields
        title.append(new_title)
        category.append(new_cat)
        desc.append(new_desc)
        img.append(f'imgs/{item_id}.jpg')

title = np.array(title)
category = np.array(category)
desc = np.array(desc)
img = np.array(img)

In [3]:
# model
# Two string inputs (title, category) are tokenized and encoded by a frozen BERT inside the
# model; an LSTM stack then predicts one vocabulary token per sequence position.
preprocessor = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
vocab_size = int(preprocessor.tokenize.get_special_tokens_dict()["vocab_size"].numpy())
preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3', name='BERT_tokenizer')

# Training target: the token ids of each description.
# Tokenize only while `desc` is still the NumPy array of raw strings, so re-running this
# cell does not feed already-tokenized ids back into the tokenizer.
if isinstance(desc, np.ndarray):
    desc = preprocessing_layer(desc)['input_word_ids']

title_input = Input(shape=(), name='title_input', dtype=tf.string)
category_input = Input(shape=(), name='category_input', dtype=tf.string)
# TODO: image branch (224x224x3 float input plus a feature extractor) is not wired in yet.

title_embeddings = preprocessing_layer(title_input)
category_embeddings = preprocessing_layer(category_input)

# Frozen BERT encoder; only the layers stacked on top of it are trained.
encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4', trainable=False, name='BERT_encoder')
title_embeddings = encoder(title_embeddings)
title_embeddings = title_embeddings['sequence_output']
category_embeddings = encoder(category_embeddings)
category_embeddings = category_embeddings['sequence_output']

# Per-token encodings of both inputs are joined along the feature axis (concatenate's default).
x = concatenate([title_embeddings, category_embeddings])
x = LSTM(512, return_sequences=True)(x)
x = LSTM(256, return_sequences=True)(x)
x = Dense(512, activation='relu')(x)
# A softmax over the whole vocabulary is numerically fragile in float16. When a
# mixed-precision policy is active, the logits layer still follows that policy, but the
# softmax itself is pinned to float32. The output keeps the name 'description_output'.
description_logits = Dense(vocab_size, name='description_logits')(x)
description_output = Activation('softmax', dtype='float32', name='description_output')(description_logits)

model = Model(inputs=[title_input, category_input], outputs=[description_output])
plot_model(model, "model.png", show_shapes=True)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=[tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)],
    metrics=['accuracy'],
)
model.summary()
# Both callbacks watch validation accuracy, so early stopping and best-weight restoration
# are driven by held-out data rather than by the training metric. This requires validation
# data (e.g. `validation_split`) to be passed to `fit`.
# NOTE(review): with these patience values the callbacks only take effect on runs longer
# than 10-13 epochs.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0, patience=13, verbose=1,
                                     mode='auto', baseline=None, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=10, verbose=1)
]

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 title_input (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 category_input (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 BERT_tokenizer (KerasLayer)    {'input_type_ids':   0           ['title_input[0][0]',            
                                (None, 128),                      'category_input[0][0]']         
                                 'input_word_ids':                                                
                                (None, 128),                                                  

In [4]:

# Train on the title/category strings against the tokenized descriptions, holding out the
# last 20% of the samples for validation. The data is passed as in-memory arrays/tensors,
# so generator-only options such as `use_multiprocessing` are not passed.
history = model.fit(
    {"title_input": title, "category_input": category},
    {"description_output": desc},
    validation_split=0.2,
    batch_size=64,
    epochs=10,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
# Sanity check: run the model on the first record of the dataset.
title_to_pred, category_to_pred = title[0], category[0]
print(title_to_pred, category_to_pred)

# The model takes raw strings (tokenization happens inside it), so each sample is only
# wrapped in a one-element array to form a batch of size 1.
sample_inputs = {
    "title_input": np.array([title_to_pred]),
    "category_input": np.array([category_to_pred]),
}
# Drop the batch axis, then pick the most probable vocabulary id at every position.
pred = model.predict(sample_inputs).squeeze().argmax(axis=-1)

iPhone XS (512GB) READ BEFORE DM Mobile Phones & Gadgets, Mobile Phones, iPhone, iPhone X Series
