In [1]:
import os
import pandas as pd
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from typing import Optional
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer
from scipy.sparse import hstack
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertModel
import h5py
import pickle
# Fetch the NLTK data packages this notebook relies on: the tokenizer models
# (word_tokenize), the English stop-word list, the POS tagger (pos_tag) and
# the WordNet corpus (WordNetLemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
print("Importing...")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Importing...


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Initialize NLTK resources
# English stop words, held in a set for the membership test in normalize_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every normalize_text call.
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Translate a POS tag into the one-letter part-of-speech code for WordNet.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``'a'``, ``'v'``,
    ``'n'`` and ``'r'`` respectively. Any other tag falls back to ``'n'``.
    """
    prefix_to_pos = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('N', 'n'),  # Noun
        ('R', 'r'),  # Adverb
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return 'n'  # Default to noun if not recognized


def remove_extra_new_lines(text):
    """Flatten a possibly multi-line value into a single line.

    A value that ``pd.isnull`` reports as null becomes ``''``. Anything else
    is converted with ``str`` and split on line boundaries; lines that are
    empty or whitespace-only are dropped and the rest are joined with single
    spaces. The kept lines themselves are not stripped.
    """
    if pd.isnull(text):  # NaN / None: nothing to clean
        return ''

    kept_lines = []
    for line in str(text).splitlines():
        if line.strip():
            kept_lines.append(line)
    return ' '.join(kept_lines)


def remove_extra_whitespace(text: str) -> str:
    """Collapse every run of whitespace characters in ``text`` to one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)


def remove_special_chars(text: str, remove_digits: Optional[bool] = False) -> str:
    """Delete every character that is not an ASCII letter or whitespace.

    ASCII digits are kept as well unless ``remove_digits`` is truthy.
    Removed characters are deleted outright and not replaced by a space,
    so the text on either side of them is joined together.
    """
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def normalize_text(text):
    """Clean, tokenize, filter and lemmatize ``text`` into one string.

    Steps, in order: drop blank lines, collapse whitespace, remove special
    characters (digits survive this step), tokenize, keep only tokens that
    are purely alphabetic and not English stop words (lower-cased), POS-tag
    the survivors and lemmatize each one with its WordNet part of speech.

    Returns the lemmas joined by single spaces.
    """
    # Cleaning stages run innermost first: newlines, whitespace, special chars.
    cleaned = remove_special_chars(
        remove_extra_whitespace(remove_extra_new_lines(text)),
        remove_digits=False,
    )

    kept_tokens = []
    for raw_token in word_tokenize(cleaned):
        lowered = raw_token.lower()
        # isalpha() also discards the digit-bearing tokens kept by the cleaner.
        if raw_token.isalpha() and lowered not in stop_words:
            kept_tokens.append(lowered)

    lemmas = []
    for word, pos in pos_tag(kept_tokens):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(pos)))

    return ' '.join(lemmas)

In [7]:
# Keys of the input record whose values prepare_input passes to the fitted encoder.
categorical_columns = ['type', 'manufacturer']

In [8]:
# Example product record that every prediction stage below is run on.
# 'name' and 'description' are embedded with BERT, 'price' is scaled, and
# 'type' / 'manufacturer' are encoded as categorical features.
user_input = {
    "name": "LoDuca Bros Inc - Professional Digital Photo Studio Kit - Black/White/Blue",
    "description": "LODUCA BROS INC Professional Digital Photo Studio Kit: Lets you take professional-quality photos; includes 2 high-output tabletop lights, a 16 cubed soft-lighting frame and an adjustable mini tabletop tripod; multicompartment, padded carrying case",
    "price": 49.99,
    "type": "HardGood",
    "manufacturer": "LoDuca Bros Inc",
}

### Prediction 1: 'parent_category'

In [9]:
# Load everything stage 1 needs: the Keras model, the fitted encoder and
# scaler, the class-label array and the BERT tokenizer/encoder.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files from a trusted location.
# 'model_1_preberttune.h5'
model_1 = load_model('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/models/model_1_preberttune.h5')
# 'encoder.pkl'
# Used by prepare_input to transform the categorical columns.
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/encoder/encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)
# 'scaler.pkl'
# Used by prepare_input to transform the price.
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/scaler/scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
# 'label_encoder.h5'
# Reads the whole 'label_encoder' dataset into memory; compare_predictions
# indexes it with the predicted class indices.
with h5py.File('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/label encoder/label_encoder.h5', 'r') as hf:
    label_encoder_classes = hf['label_encoder'][:]

# Pretrained BERT tokenizer and encoder, read as globals by
# tokenize_and_get_embeddings in every stage.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [10]:
def extract_last_hidden_state(embeddings):
    """Return the slice of ``embeddings`` at the last index of axis 1.

    Axes 0 and 2 are kept whole, so a three-axis input yields a two-axis
    result.
    NOTE(review): callers pass BERT's ``last_hidden_state``, presumably
    (batch, tokens, hidden), so this would select the final token position
    and not the first one. Confirm this matches how the models were trained.
    """
    last_along_axis_1 = (slice(None), -1, slice(None))
    return embeddings[last_along_axis_1]

def tokenize_and_get_embeddings(column):
    """Run text through the global BERT tokenizer and model.

    Args:
        column: a single string, or a list whose items are all strings.

    Returns:
        ``last_hidden_state`` of the model output, converted to a NumPy array.

    Raises:
        ValueError: if ``column`` is neither a string nor a list of strings.
    """
    is_single_text = isinstance(column, str)
    is_text_list = isinstance(column, list) and all(isinstance(item, str) for item in column)
    if not (is_single_text or is_text_list):
        raise ValueError("Invalid input format.")

    # Tokenize text (both accepted input shapes use the same tokenizer call)
    encoded = tokenizer(column, padding=True, truncation=True, return_tensors='tf')

    return model(encoded).last_hidden_state.numpy()

def prepare_input(user_input, scaler, encoder, categorical_columns):
    """Build the feature row for the first-level model from one product record.

    Args:
        user_input: mapping with at least 'name', 'description', 'price' and
            every key listed in ``categorical_columns``. It is only read;
            the caller's mapping is left unchanged.
        scaler: fitted scaler; its ``transform`` is applied to the price.
        encoder: fitted encoder; its ``transform`` is applied to the
            categorical values and must return a sparse matrix.
        categorical_columns: keys of ``user_input`` to encode, in order.

    Returns:
        Tuple ``(extracted_name_hidden, extracted_description_hidden,
        final_input_array, scaled_price)``. ``final_input_array`` is the
        encoded categoricals, the scaled price and the two text vectors
        concatenated along axis 1.
    """
    # Normalise into locals instead of writing back into user_input.
    # Overwriting the caller's dict made later stages re-normalise text that
    # was already normalised, and made re-running the cell non-idempotent.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])

    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)
    # transform expects a 2-D input; [0, 0] pulls the single scalar back out.
    scaled_price = scaler.transform([[user_input['price']]])[0, 0]

    # One row holding the categorical values in the order of categorical_columns.
    encoded_user_input = [[user_input[column]] for column in categorical_columns]
    encoded_user_input = np.array(encoded_user_input).reshape(1, -1)
    encoded_categorical_features = encoder.transform(encoded_user_input)
    print(encoded_categorical_features.shape)

    # Sparse hstack: encoded categoricals followed by the scaled price.
    combined_features = hstack([encoded_categorical_features, np.array([[scaled_price]])])
    print(combined_features.shape)
    num_cat_input_array = combined_features.toarray().astype(np.float32)
    final_input_array = np.concatenate((num_cat_input_array, extracted_name_hidden, extracted_description_hidden), axis=1)

    return extracted_name_hidden, extracted_description_hidden, final_input_array, scaled_price

In [11]:
# Build the stage-1 feature row for the example record. The two shapes
# printed below come from the print calls inside prepare_input.
extracted_name_hidden, extracted_description_hidden, final_input_array, scaled_price = prepare_input(user_input, scaler, encoder, categorical_columns)

(1, 2195)
(1, 2196)




In [12]:
def predict_model_1(final_input_array, model_1):
    """Score ``final_input_array`` with ``model_1`` and pick the top class per row.

    Prints the raw prediction matrix and then the chosen indices.

    Returns:
        Array with the argmax class index of each row of the predictions.
    """
    class_scores = model_1.predict(final_input_array)
    winning_indices = np.argmax(class_scores, axis=1)
    # Echo the raw scores first, then the chosen indices.
    for shown in (class_scores, winning_indices):
        print(shown)
    return winning_indices

def compare_predictions(subcategory_pred_labels, label_encoder_classes):
    """Map predicted class indices to their labels.

    Each index below ``len(label_encoder_classes)`` is replaced by the entry
    at that position; any other index yields the string ``'unknown'``.

    Returns:
        List with one label per predicted index, in the same order.
    """
    return [
        label_encoder_classes[idx] if idx < len(label_encoder_classes) else 'unknown'
        for idx in subcategory_pred_labels
    ]

In [13]:
# predicted_labels holds class indices (the argmax of the model output);
# predicted_labels_2 holds the labels looked up for those indices.
predicted_labels = predict_model_1(final_input_array, model_1)
predicted_labels_2 = compare_predictions(predicted_labels, label_encoder_classes)

[[1.0994968e-04 7.9649151e-04 1.2140860e-03 5.3583179e-02 1.4598008e-03
  2.5725579e-03 2.2873802e-03 1.0619973e-01 4.3067613e-04 3.0624733e-04
  1.6319037e-04 8.6571947e-03 3.8944132e-04 9.3414586e-05 1.3874416e-04
  2.9502739e-03 7.7835256e-01 5.2608363e-04 9.0454130e-05 3.6915749e-02
  9.8234147e-04 7.8012527e-04 1.0003197e-03]]
[16]


In [14]:
# Show the label(s) looked up for prediction 1.
print(predicted_labels_2)

[b'Musical Instruments']


### Prediction 2: 'sub_category_1'

In [15]:
# Load everything stage 2 needs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files from a trusted location.
# 'model_2_preberttune.h5'
model_2 = load_model('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/models/model_2_preberttune.h5')
# decode previous output of prediction 1 (bytes labels -> str)
decoded_labels = [label.decode() for label in predicted_labels_2]
# 'encoder_1.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/encoder/encoder_1.pkl', 'rb') as file:
    encoder_1 = pickle.load(file)
# 'scaler_1.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/scaler/scaler_1.pkl', 'rb') as file:
    scaler_1 = pickle.load(file)
# 'label_encoder_1.h5'
# The stage-2 label lookup further down reads label_encoder_classes_1, so it
# has to be loaded here for the notebook to run from a fresh kernel.
# NOTE(review): the file name and dataset key follow the naming used by the
# other stages ('label_encoder_N.h5' / 'label_encoder_N'); confirm they match
# the artifact that was actually saved.
with h5py.File('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/label encoder/label_encoder_1.h5', 'r') as hf:
    label_encoder_classes_1 = hf['label_encoder_1'][:]


def prepare_input_2(user_input, encoder_1, decoded_labels, scaler_1):
    """Build the feature row for the second-level model.

    Args:
        user_input: mapping with 'name', 'description', 'price', 'type' and
            'manufacturer'. It is only read; the caller's mapping is left
            unchanged.
        encoder_1: fitted encoder applied to ``[type, manufacturer,
            *stage-1 labels]``; its ``transform`` must return a sparse matrix.
        decoded_labels: stage-1 labels as strings.
        scaler_1: fitted scaler applied to the price.

    Returns:
        2-D array: encoded categoricals, scaled price, name vector and
        description vector concatenated along axis 1.
    """
    # Normalise into locals instead of writing back into user_input.
    # Overwriting the caller's dict meant each stage normalised text that an
    # earlier stage had already normalised.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])
    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)

    input_data = np.array([[user_input['type'], user_input['manufacturer']]])
    # The slice caps the label count at 3; with one label it changes nothing.
    predicted_labels_array = np.array(decoded_labels)[:3].reshape(1, -1)
    input_with_labels = np.hstack((input_data, predicted_labels_array))

    predicted_labels_one_hot = encoder_1.transform(input_with_labels)

    # reshape(-1, 1) turns the scalar price into the 2-D shape transform expects.
    scaled_price_array = scaler_1.transform(np.array(user_input['price']).reshape(-1, 1))

    final_input_array_with_label = np.hstack((predicted_labels_one_hot.toarray(), scaled_price_array))
    final_input_array_2 = np.concatenate((final_input_array_with_label, extracted_name_hidden, extracted_description_hidden), axis=1)

    return final_input_array_2

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
# Build the stage-2 feature row from the example record and the stage-1 label.
final_input_array_with_label_1 = prepare_input_2(user_input, encoder_1, decoded_labels, scaler_1)



In [17]:
def predict_model_2(final_input_array_with_label_1, model_2):
    """Score the stage-2 feature row and pick the top class per row.

    Prints the raw prediction matrix and then the chosen indices.

    Returns:
        Array with the argmax class index of each row of the predictions.
    """
    class_scores = model_2.predict(final_input_array_with_label_1)
    winning_indices = np.argmax(class_scores, axis=1)
    # Echo the raw scores first, then the chosen indices.
    for shown in (class_scores, winning_indices):
        print(shown)
    return winning_indices

def compare_predictions_2(subcategory_pred_labels, label_encoder_classes_1):
    """Map predicted class indices to their labels.

    Each index below ``len(label_encoder_classes_1)`` is replaced by the
    entry at that position; any other index yields the string ``'unknown'``.

    Returns:
        List with one label per predicted index, in the same order.
    """
    return [
        label_encoder_classes_1[idx] if idx < len(label_encoder_classes_1) else 'unknown'
        for idx in subcategory_pred_labels
    ]

In [18]:
# final_input_array_with_label_1 was already built in the previous cell.
# Rebuilding it here only repeated the two BERT forward passes, so it is reused.
# label_encoder_classes_1 must have been loaded before this cell runs.
subcategory_pred_labels = predict_model_2(final_input_array_with_label_1, model_2)
predicted_labels_3 = compare_predictions_2(subcategory_pred_labels, label_encoder_classes_1)

[[1.63322730e-10 9.24247224e-03 3.52612830e-07 7.30372633e-08
  4.35859349e-08 7.08782750e-07 6.87059062e-03 4.09882887e-06
  1.12313048e-08 3.46122597e-09 1.76686399e-09 2.39322901e-07
  5.70962388e-09 1.80817409e-07 6.87435772e-07 1.27569191e-08
  3.55181925e-07 1.35334830e-07 2.00965715e-08 2.15664699e-07
  2.29864156e-07 3.34467423e-08 2.48789860e-08 5.68401374e-05
  7.94910375e-05 7.86217740e-08 1.82330087e-02 5.10007752e-08
  1.19414352e-07 2.87019777e-08 4.65168704e-08 4.69537476e-08
  5.82247600e-03 1.75301966e-06 5.02646287e-07 7.00657381e-07
  1.50596432e-03 4.69041822e-07 2.50505946e-06 1.17480567e-08
  1.11170471e-06 7.92985873e-08 1.35558439e-05 5.65360324e-07
  1.64734456e-03 3.64446691e-07 3.41789018e-06 7.12557338e-08
  4.79635275e-07 5.17396393e-06 1.27256430e-06 7.21144886e-07
  4.11295076e-07 8.51561141e-04 2.08846610e-02 2.78479860e-07
  8.94914422e-07 1.16796059e-07 1.33506688e-07 4.09294665e-02
  1.15968897e-08 9.35552578e-08 1.35986173e-08 4.83558495e-07
  7.2404



In [19]:
# Show the label(s) looked up for prediction 2.
print(predicted_labels_3)

[b' Musical Instrument Accessories']


### Prediction 3: 'sub_category_2'

In [20]:
# Load everything stage 3 needs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files from a trusted location.
# 'model_3_preberttune.h5'
model_3 = load_model('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/models/model_3_preberttune.h5')
# decode previous output of prediction 2 (bytes labels -> str).
# Every entry must support .decode(); the plain-str 'unknown' fallback of
# the label lookup would raise AttributeError here.
decoded_labels_2 = [label.decode() for label in predicted_labels_3]
# 'encoder_2.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/encoder/encoder_2.pkl', 'rb') as file:
    encoder_2 = pickle.load(file)
# 'scaler_2.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/scaler/scaler_2.pkl', 'rb') as file:
    scaler_2 = pickle.load(file)
# 'label_encoder_2.h5'
# Reads the whole 'label_encoder_2' dataset into memory for the label lookup.
with h5py.File('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/label encoder/label_encoder_2.h5', 'r') as hf:
    label_encoder_classes_2 = hf['label_encoder_2'][:]

def prepare_input_3(user_input, encoder_2, decoded_labels, decoded_labels_2, scaler_2):
    """Build the feature row for the third-level model.

    Args:
        user_input: mapping with 'name', 'description', 'price', 'type' and
            'manufacturer'. It is only read; the caller's mapping is left
            unchanged.
        encoder_2: fitted encoder applied to ``[type, manufacturer,
            *stage-1 labels, *stage-2 labels]``; its ``transform`` must
            return a sparse matrix.
        decoded_labels: stage-1 labels as strings.
        decoded_labels_2: stage-2 labels as strings.
        scaler_2: fitted scaler applied to the price.

    Returns:
        2-D array: encoded categoricals, scaled price, name vector and
        description vector concatenated along axis 1.
    """
    # Normalise into locals instead of writing back into user_input.
    # Overwriting the caller's dict meant each stage normalised text that an
    # earlier stage had already normalised.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])
    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)

    input_data = np.array([[user_input['type'], user_input['manufacturer']]])
    # The slices only cap the label counts; with one label each they change nothing.
    predicted_labels_array = np.array(decoded_labels)[:3].reshape(1, -1)
    predicted_labels_array_2 = np.array(decoded_labels_2)[:4].reshape(1, -1)
    input_with_labels = np.hstack((input_data, predicted_labels_array, predicted_labels_array_2))

    predicted_labels_one_hot = encoder_2.transform(input_with_labels)

    # reshape(-1, 1) turns the scalar price into the 2-D shape transform expects.
    scaled_price_array = scaler_2.transform(np.array(user_input['price']).reshape(-1, 1))

    final_input_array_with_label = np.hstack((predicted_labels_one_hot.toarray(), scaled_price_array))
    final_input_array_3 = np.concatenate((final_input_array_with_label, extracted_name_hidden, extracted_description_hidden), axis=1)

    return final_input_array_3

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [21]:
# Build the stage-3 feature row from the example record and the labels of stages 1-2.
final_input_array_with_label_2 = prepare_input_3(user_input, encoder_2, decoded_labels, decoded_labels_2, scaler_2)



In [22]:
def predict_model_3(final_input_array_with_label_2, model_3):
    """Score the stage-3 feature row and pick the top class per row.

    Prints the raw prediction matrix and then the chosen indices.

    Returns:
        Array with the argmax class index of each row of the predictions.
    """
    class_scores = model_3.predict(final_input_array_with_label_2)
    winning_indices = np.argmax(class_scores, axis=1)
    # Echo the raw scores first, then the chosen indices.
    for shown in (class_scores, winning_indices):
        print(shown)
    return winning_indices

def compare_predictions_3(subcategory_pred_labels_2, label_encoder_classes_2):
    """Map predicted class indices to their labels.

    Each index below ``len(label_encoder_classes_2)`` is replaced by the
    entry at that position; any other index yields the string ``'unknown'``.

    Returns:
        List with one label per predicted index, in the same order.
    """
    return [
        label_encoder_classes_2[idx] if idx < len(label_encoder_classes_2) else 'unknown'
        for idx in subcategory_pred_labels_2
    ]

In [23]:
# Score stage 3, then look the predicted indices up in the stage-3 classes.
# The lookup goes through compare_predictions_2, which works for any class array.
subcategory_pred_labels_2 = predict_model_3(final_input_array_with_label_2, model_3)
predicted_labels_5 = compare_predictions_2(subcategory_pred_labels_2, label_encoder_classes_2)

[[1.12019552e-06 1.13052090e-06 2.28617523e-06 1.66467203e-06
  3.97478580e-05 2.34333293e-05 1.73665001e-04 4.61353702e-05
  2.00526702e-05 1.33288677e-05 7.43866758e-06 7.67152869e-07
  6.40058033e-08 3.17638069e-06 1.27486419e-04 3.41650463e-07
  8.61173188e-10 2.25794633e-06 8.29954515e-05 6.56466881e-10
  8.13052594e-08 2.40425805e-07 2.50137532e-07 3.96551094e-08
  1.79948056e-05 1.91063009e-06 1.38132627e-05 2.51572255e-05
  3.46558352e-07 2.56160274e-05 5.34442916e-06 1.01338571e-06
  1.11924726e-06 7.19470336e-05 5.63906553e-07 6.37364923e-04
  2.68000131e-05 2.10973269e-07 9.32760315e-07 1.18801909e-05
  6.34380115e-07 7.57125918e-08 1.40449138e-05 4.46870551e-03
  1.81001560e-05 9.31837076e-06 2.63293913e-07 8.09406675e-09
  8.93142442e-06 3.36447641e-07 1.07381766e-06 5.51221601e-05
  1.15474954e-08 2.89647105e-05 2.41307293e-06 8.48824754e-07
  1.92448806e-06 1.20558934e-06 7.17551643e-07 5.48925209e-06
  1.78887944e-08 2.16770495e-08 2.70806072e-06 1.17872993e-07
  7.1532

In [24]:
# Show the label(s) looked up for prediction 3.
print(predicted_labels_5)

[b'missing']


### Prediction 4: 'sub_category_3'

In [25]:
# Load everything stage 4 needs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files from a trusted location.
# 'model_4_preberttune.h5'
model_4 = load_model('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/models/model_4_preberttune.h5')
# decode previous output of prediction 3 (bytes labels -> str).
# Every entry must support .decode(); the plain-str 'unknown' fallback of
# the label lookup would raise AttributeError here.
decoded_labels_3 = [label.decode() for label in predicted_labels_5]
# 'encoder_3.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/encoder/encoder_3.pkl', 'rb') as file:
    encoder_3 = pickle.load(file)
# 'scaler_3.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/scaler/scaler_3.pkl', 'rb') as file:
    scaler_3 = pickle.load(file)
# 'label_encoder_3.h5'
# Reads the whole 'label_encoder_3' dataset into memory for the label lookup.
with h5py.File('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/label encoder/label_encoder_3.h5', 'r') as hf:
    label_encoder_classes_3 = hf['label_encoder_3'][:]

def prepare_input_4(user_input, encoder_3, decoded_labels, decoded_labels_2, decoded_labels_3, scaler_3):
    """Build the feature row for the fourth-level model.

    Args:
        user_input: mapping with 'name', 'description', 'price', 'type' and
            'manufacturer'. It is only read; the caller's mapping is left
            unchanged.
        encoder_3: fitted encoder applied to ``[type, manufacturer,
            *labels of stages 1-3]``; its ``transform`` must return a
            sparse matrix.
        decoded_labels: stage-1 labels as strings.
        decoded_labels_2: stage-2 labels as strings.
        decoded_labels_3: stage-3 labels as strings.
        scaler_3: fitted scaler applied to the price.

    Returns:
        2-D array: encoded categoricals, scaled price, name vector and
        description vector concatenated along axis 1.
    """
    # Normalise into locals instead of writing back into user_input.
    # Overwriting the caller's dict meant each stage normalised text that an
    # earlier stage had already normalised.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])
    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)

    input_data = np.array([[user_input['type'], user_input['manufacturer']]])
    # The slices only cap the label counts; with one label each they change nothing.
    predicted_labels_array = np.array(decoded_labels)[:3].reshape(1, -1)
    predicted_labels_array_2 = np.array(decoded_labels_2)[:4].reshape(1, -1)
    predicted_labels_array_3 = np.array(decoded_labels_3)[:5].reshape(1, -1)
    input_with_labels = np.hstack((input_data, predicted_labels_array, predicted_labels_array_2, predicted_labels_array_3))

    predicted_labels_one_hot = encoder_3.transform(input_with_labels)

    # reshape(-1, 1) turns the scalar price into the 2-D shape transform expects.
    scaled_price_array = scaler_3.transform(np.array(user_input['price']).reshape(-1, 1))

    final_input_array_with_label = np.hstack((predicted_labels_one_hot.toarray(), scaled_price_array))
    final_input_array_4 = np.concatenate((final_input_array_with_label, extracted_name_hidden, extracted_description_hidden), axis=1)

    return final_input_array_4

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [26]:
# Build the stage-4 feature row from the example record and the labels of stages 1-3.
final_input_array_with_label_3 = prepare_input_4(user_input, encoder_3, decoded_labels, decoded_labels_2, decoded_labels_3, scaler_3)



In [27]:
def predict_model_4(final_input_array_with_label_3, model_4):
    """Score the stage-4 feature row and pick the top class per row.

    Prints the raw prediction matrix and then the chosen indices.

    Returns:
        Array with the argmax class index of each row of the predictions.
    """
    class_scores = model_4.predict(final_input_array_with_label_3)
    winning_indices = np.argmax(class_scores, axis=1)
    # Echo the raw scores first, then the chosen indices.
    for shown in (class_scores, winning_indices):
        print(shown)
    return winning_indices

def compare_predictions_3(subcategory_pred_labels_3, label_encoder_classes_3):
    """Map predicted class indices to their labels.

    Each index below ``len(label_encoder_classes_3)`` is replaced by the
    entry at that position; any other index yields the string ``'unknown'``.

    NOTE(review): this redefines ``compare_predictions_3`` from the stage-3
    cell. The name is kept so existing callers still resolve.

    Args:
        subcategory_pred_labels_3: iterable of predicted class indices.
        label_encoder_classes_3: indexable sequence of class labels.

    Returns:
        List with one label per predicted index, in the same order.
    """
    predicted_labels_6 = []
    # Iterate over the predicted indices. The original looped over the class
    # array itself, which ignored the predictions and compared labels to an int.
    for idx in subcategory_pred_labels_3:
        if idx < len(label_encoder_classes_3):
            predicted_labels_6.append(label_encoder_classes_3[idx])
        else:
            predicted_labels_6.append('unknown')
    return predicted_labels_6

In [28]:
# Score stage 4, then look the predicted indices up in the stage-4 classes.
# The lookup goes through compare_predictions_2, which works for any class array.
subcategory_pred_labels_3 = predict_model_4(final_input_array_with_label_3, model_4)
predicted_labels_7 = compare_predictions_2(subcategory_pred_labels_3, label_encoder_classes_3)

[[1.22693189e-10 1.73246389e-10 1.61119132e-10 5.56444668e-10
  2.25295310e-10 2.23974051e-13 3.55915436e-10 2.07534558e-08
  1.84050509e-06 1.07475469e-08 3.33443779e-08 2.27218067e-08
  4.77060023e-11 9.14231883e-15 1.19763585e-12 1.66595185e-13
  2.65424644e-11 4.40834903e-11 3.04344688e-10 1.49374380e-08
  2.28899510e-10 2.12920778e-10 2.03017914e-09 5.43238565e-10
  1.04970837e-07 1.55666147e-08 4.82114799e-11 6.73179637e-11
  8.42902192e-10 1.68191822e-10 6.20664381e-13 2.79229667e-10
  1.82920890e-09 2.64439790e-11 4.45659926e-10 1.16489574e-09
  9.34191405e-11 7.98367381e-08 3.70771053e-10 1.52366487e-11
  8.80752971e-10 1.66445843e-10 6.32769187e-14 4.14509005e-09
  3.07556633e-13 7.27202187e-11 8.21347417e-12 9.61033544e-11
  7.48349679e-14 5.05397529e-11 1.82542981e-10 2.10754591e-11
  1.42282011e-11 7.69992820e-11 2.20529519e-12 5.03492060e-07
  1.48841617e-09 1.46257811e-10 3.53652929e-08 1.48522006e-09
  1.14419021e-12 3.40197759e-10 1.15083351e-11 2.94900682e-09
  1.8729

In [29]:
# Show the label(s) looked up for prediction 4.
print(predicted_labels_7)

[b'missing']


### Prediction 5: 'sub_category_4'

In [30]:
# Load everything stage 5 needs.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files from a trusted location.
# 'model_5_preberttune.h5'
model_5 = load_model('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/models/model_5_preberttune.h5')
# decode previous output of prediction 4 (bytes labels -> str).
# Every entry must support .decode(); the plain-str 'unknown' fallback of
# the label lookup would raise AttributeError here.
decoded_labels_4 = [label.decode() for label in predicted_labels_7]
# 'encoder_4.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/encoder/encoder_4.pkl', 'rb') as file:
    encoder_4 = pickle.load(file)
# 'scaler_4.pkl'
with open('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/scaler/scaler_4.pkl', 'rb') as file:
    scaler_4 = pickle.load(file)
# 'label_encoder_4.h5'
# Reads the whole 'label_encoder_4' dataset into memory for the label lookup.
with h5py.File('/content/drive/MyDrive/prueba_encoder/predic_model_1/final/label encoder/label_encoder_4.h5', 'r') as hf:
    label_encoder_classes_4 = hf['label_encoder_4'][:]

def prepare_input_5(user_input, encoder_4, decoded_labels, decoded_labels_2, decoded_labels_3, decoded_labels_4, scaler_4):
    """Build the feature row for the fifth-level model.

    Args:
        user_input: mapping with 'name', 'description', 'price', 'type' and
            'manufacturer'. It is only read; the caller's mapping is left
            unchanged.
        encoder_4: fitted encoder applied to ``[type, manufacturer,
            *labels of stages 1-4]``; its ``transform`` must return a
            sparse matrix.
        decoded_labels: stage-1 labels as strings.
        decoded_labels_2: stage-2 labels as strings.
        decoded_labels_3: stage-3 labels as strings.
        decoded_labels_4: stage-4 labels as strings.
        scaler_4: fitted scaler applied to the price.

    Returns:
        2-D array: encoded categoricals, scaled price, name vector and
        description vector concatenated along axis 1.
    """
    # Normalise into locals instead of writing back into user_input.
    # Overwriting the caller's dict meant each stage normalised text that an
    # earlier stage had already normalised.
    normalized_name = normalize_text(user_input['name'])
    normalized_description = normalize_text(user_input['description'])
    name_embeddings = tokenize_and_get_embeddings(normalized_name)
    description_embeddings = tokenize_and_get_embeddings(normalized_description)
    extracted_name_hidden = extract_last_hidden_state(name_embeddings)
    extracted_description_hidden = extract_last_hidden_state(description_embeddings)

    input_data = np.array([[user_input['type'], user_input['manufacturer']]])
    # The slices only cap the label counts; with one label each they change nothing.
    predicted_labels_array = np.array(decoded_labels)[:3].reshape(1, -1)
    predicted_labels_array_2 = np.array(decoded_labels_2)[:4].reshape(1, -1)
    predicted_labels_array_3 = np.array(decoded_labels_3)[:5].reshape(1, -1)
    predicted_labels_array_4 = np.array(decoded_labels_4)[:6].reshape(1, -1)
    input_with_labels = np.hstack((input_data, predicted_labels_array, predicted_labels_array_2, predicted_labels_array_3, predicted_labels_array_4))

    predicted_labels_one_hot = encoder_4.transform(input_with_labels)

    # reshape(-1, 1) turns the scalar price into the 2-D shape transform expects.
    scaled_price_array = scaler_4.transform(np.array(user_input['price']).reshape(-1, 1))

    final_input_array_with_label = np.hstack((predicted_labels_one_hot.toarray(), scaled_price_array))
    final_input_array_5 = np.concatenate((final_input_array_with_label, extracted_name_hidden, extracted_description_hidden), axis=1)

    return final_input_array_5

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [31]:
# Build the stage-5 feature row from the example record and the labels of stages 1-4.
final_input_array_with_label_4 = prepare_input_5(user_input, encoder_4, decoded_labels, decoded_labels_2, decoded_labels_3, decoded_labels_4, scaler_4)



In [32]:
def predict_model_5(final_input_array_with_label_4, model_5):
    """Score the stage-5 feature row and pick the top class per row.

    Prints the raw prediction matrix and then the chosen indices.

    Returns:
        Array with the argmax class index of each row of the predictions.
    """
    class_scores = model_5.predict(final_input_array_with_label_4)
    winning_indices = np.argmax(class_scores, axis=1)
    # Echo the raw scores first, then the chosen indices.
    for shown in (class_scores, winning_indices):
        print(shown)
    return winning_indices

def compare_predictions_4(subcategory_pred_labels_4, label_encoder_classes_4):
    """Map predicted class indices to their labels.

    Each index below ``len(label_encoder_classes_4)`` is replaced by the
    entry at that position; any other index yields the string ``'unknown'``.

    Args:
        subcategory_pred_labels_4: iterable of predicted class indices.
        label_encoder_classes_4: indexable sequence of class labels.

    Returns:
        List with one label per predicted index, in the same order.
    """
    predicted_labels_8 = []
    # Iterate over the predicted indices. The original looped over the class
    # array itself, which ignored the predictions and compared labels to an int.
    for idx in subcategory_pred_labels_4:
        if idx < len(label_encoder_classes_4):
            predicted_labels_8.append(label_encoder_classes_4[idx])
        else:
            predicted_labels_8.append('unknown')
    return predicted_labels_8

In [33]:
# Score stage 5 with its own helper. The original called predict_model_4
# here, which only worked because the two helpers have identical bodies.
subcategory_pred_labels_4 = predict_model_5(final_input_array_with_label_4, model_5)
# The lookup goes through compare_predictions_2, which works for any class array.
predicted_labels_9 = compare_predictions_2(subcategory_pred_labels_4, label_encoder_classes_4)



[[4.53251978e-08 2.55513233e-09 9.76024861e-09 2.58665450e-11
  3.81413727e-08 2.76196772e-08 7.91640850e-11 8.96482055e-11
  9.23418217e-11 1.93440073e-08 3.39417427e-09 4.16434830e-07
  8.83128237e-09 1.55097926e-08 5.09740528e-10 1.81329485e-09
  2.43397295e-07 9.67605502e-06 1.25408883e-09 1.27100497e-10
  1.49762425e-10 3.67749102e-08 9.17367085e-11 3.39476017e-11
  4.32860969e-11 3.76030833e-08 2.40257023e-08 1.73815684e-09
  6.51794267e-07 1.02076809e-08 1.05980034e-08 1.91903968e-08
  2.13211759e-09 1.87655439e-06 3.24516081e-10 7.73781306e-09
  1.08735094e-07 3.37322337e-10 8.47575066e-11 2.56696331e-09
  2.93560398e-09 9.89183135e-09 2.43740694e-10 2.09551682e-10
  1.49164392e-09 1.56300661e-08 1.19633597e-07 1.42755002e-10
  4.67189953e-10 1.57908728e-10 7.18361370e-10 1.92163929e-09
  8.32278080e-10 4.86628848e-10 2.37782007e-08 2.67114082e-08
  1.55833166e-10 1.32631666e-08 5.25088728e-09 1.11956111e-10
  5.77618531e-09 8.17259871e-10 2.30533392e-09 1.87989047e-09
  1.3946

In [34]:
# Show the label(s) looked up for prediction 5.
print(predicted_labels_9)

[b'missing']
