In [46]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from keras import layers, models, utils, preprocessing


# Load and preprocess the data
with open("categorical_data.json", "r") as file:
    data = json.load(file)

# Tokenize item names
tokenizer = preprocessing.text.Tokenizer(filters='')
texts = [item["c1"] for item in data]
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [47]:

max_len = max(len(seq) for seq in sequences)

padded_sequences = utils.pad_sequences(sequences, maxlen=max_len, padding="post")

# Convert category labels to numerical form
labels = [item["c0"] for item in data]
label_map = {label: i for i, label in enumerate(set(labels))}

In [48]:
print(labels)
print(label_map)

['PM', 'CM', 'Tool', 'CM', 'PM', 'Tool', 'PM', 'PM', 'PM', 'CM', 'CM', 'CM', 'Tool', 'Tool', 'Tool', 'Tool', 'Tool', 'Tool', 'Tool', 'PM', 'PM', 'CM', 'Tool', 'Tool', 'Tool', 'Tool', 'PM', 'CM', 'CM', 'CM', 'CM', 'CM', 'PM', 'PM', 'CM', 'CM', 'CM', 'CM', 'Tool', 'Tool', 'Tool', 'Tool', 'Tool', 'CM', 'CM', 'CM', 'Tool', 'Tool', 'Tool', 'Tool', 'Tool', 'PM', 'PM', 'CM', 'CM', 'CM', 'PM', 'PM', 'PM', 'CM', 'Tool', 'CM', 'CM', 'CM', 'Tool', 'Tool', 'CM', 'CM', 'PM', 'Tool', 'PM', 'CM', 'CM', 'PM', 'PM', 'PM', 'PM', 'PM', 'PM', 'PM', 'PM', 'PM', 'PM', 'CM', 'CM', 'Tool', 'Tool', 'CM', 'PM', 'PM', 'Tool', 'PM', 'PM', 'PM', 'PM', 'CM', 'CM', 'CM', 'CM', 'Tool', 'Tool', 'Tool', 'Tool', 'CM', 'PM', 'PM', 'PM', 'PM', 'PM', 'CM', 'CM', 'CM', 'CM', 'PM', 'CM', 'CM', 'PM', 'PM', 'PM', 'PM', 'CM', 'Tool', 'PM', 'PM', 'CM', 'CM', 'CM', 'CM', 'Tool', 'PM', 'PM', 'PM', 'PM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'CM', 'PM', 'PM', 'PM', 'CM', 'CM', 'CM', 'PM

In [49]:
for (index, label) in label_map.items():
    print(index,label)

Tool 0
CM 1
PM 2


In [50]:


numerical_labels = np.array([label_map[label] for label in labels])
numerical_labels

array([2, 1, 0, ..., 0, 0, 0])

In [51]:

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(padded_sequences, numerical_labels, test_size=0.3, random_state=32)


In [84]:
len(padded_sequences)

14729

In [97]:
# Define the TensorFlow model
model = models.Sequential(
    [
        layers.Embedding(
            input_dim=len(padded_sequences), output_dim=32, input_length=max_len
        ),
        layers.GlobalAveragePooling1D(),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.3),
        layers.Dense(16, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(len(label_map), activation="softmax"),
    ]
)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 32, 32)            471328    
                                                                 
 global_average_pooling1d_4   (None, 32)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_16 (Dense)            (None, 32)                1056      
                                                                 
 dropout_9 (Dropout)         (None, 32)                0         
                                                                 
 dense_17 (Dense)            (None, 16)                528       
                                                                 
 dropout_10 (Dropout)        (None, 16)                0         
                                                      

In [98]:
# print(train_texts.shape)
print(train_labels.shape)
print(train_texts.shape)

(10310,)
(10310, 32)


In [99]:
train_labels.shape

(10310,)

In [100]:

history = model.fit(train_texts, train_labels, epochs=50, batch_size=3, validation_data=(val_texts, val_labels))

# Save the trained model
model.save("model_tensor_output")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


2024-05-06 23:25:20.024111: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2024-05-06 23:25:20.030805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,16]
	 [[{{node inputs}}]]
2024-05-06 23:25:20.116219: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2024-05-06 23:25:20

INFO:tensorflow:Assets written to: model_tensor_output/assets


In [101]:
newData = ["4x4x16 #2 yellow pin"]
# Tokenize item names
tokenizer_p = preprocessing.text.Tokenizer(filters='')
texts_p = [item for item in newData]
tokenizer_p.fit_on_texts(texts_p)
sequences_p = tokenizer_p.texts_to_sequences(texts_p)
padded_sequences_p = utils.pad_sequences(sequences_p, maxlen=max_len, padding="post")

padded_sequences_p.shape

# padded_sequences_p = utils.pad_sequences(sequences_p, maxlen=max_len, padding="post")

predictions = model.predict(padded_sequences_p)

print(predictions)

# For each prediction, you can find the index of the class with the highest probability
# predicted_labels = [np.argmax(pred) for pred in predictions]

# Then you can use this index to look up the actual label in your label map
# actual_labels = [label_map[i] for i in predicted_labels]
# print(predicted_labels)
# print(actual_labels)

[[1.1363174e-13 1.9908803e-04 9.9980098e-01]]
