In [18]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [21]:
# Source CSV; the preview below shows "extracted_tags", "original_tags" and "text" columns.
DATASET_CSV_FILENAME = "text_tags_188k.csv"

# Read the file and shuffle its rows in one step. The fixed random_state keeps the
# row order the same on every run, which matters because the train/test split
# further down is purely positional.
data = shuffle(pd.read_csv(DATASET_CSV_FILENAME), random_state=22)

# Preview the first rows of the shuffled frame.
data.head()

Unnamed: 0,extracted_tags,original_tags,text
182914,"tensorflow,keras","tensorflow,keras,deep-learning,lstm,word-embed...",avocado image captioning model not compiling b...
48361,pandas,"python,pandas,flask",return excel file from avocado with flask in f...
181447,"tensorflow,keras","python,validation,tensorflow,keras,data-genera...",validating with generator (avocado) i'm trying...
66307,pandas,"python,pandas,dataframe",avocado multiindex dataframe selecting data gi...
11283,pandas,"python,python-3.x,pandas",get rightmost non-zero value position for each...


In [41]:
# Each row's "extracted_tags" value is a comma-separated string; split it into a list of labels.
tags_split = [tags.split(",") for tags in data["extracted_tags"].values]

# Multi-hot encode the label lists: one column per distinct tag.
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)

# Read the label count from the fitted encoder instead of indexing the first
# encoded row, so this line does not raise IndexError when there are no rows.
num_tags = len(tag_encoder.classes_)

In [42]:
# Share of the (already shuffled) rows used for training; the remainder is the test split.
TRAIN_FRACTION = .8

total_rows = len(data)
train_size = int(total_rows * TRAIN_FRACTION)

print(f"Train size: {train_size}")
print(f"Test size: {len(data) - train_size}")

Train size: 150559
Test size: 37640


In [43]:
# Positional split of the encoded labels: the first train_size rows train, the rest test.
train_tags, test_tags = tags_encoded[:train_size], tags_encoded[train_size:]

In [44]:
from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
    """Turn raw text into matrix rows using a Keras ``Tokenizer``.

    Call ``create_tokenizer`` once on the training texts, then call
    ``transform_text`` on any list of texts.
    """

    def __init__(self, vocab_size):
        """Remember the vocabulary size; the tokenizer itself is created later.

        Args:
            vocab_size: Passed as ``num_words`` to the Keras ``Tokenizer``.
        """
        self._vocab_size = vocab_size
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        """Fit a new tokenizer on ``text_list`` and store it on the instance.

        Args:
            text_list: Texts handed to ``Tokenizer.fit_on_texts``.
        """
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform_text(self, text_list):
        """Encode ``text_list`` with the fitted tokenizer.

        Args:
            text_list: Texts handed to ``Tokenizer.texts_to_matrix``.

        Returns:
            The value returned by ``texts_to_matrix`` for ``text_list``.

        Raises:
            RuntimeError: If ``create_tokenizer`` has not been called yet.
        """
        if self._tokenizer is None:
            # Without this guard the failure surfaces as an opaque
            # "'NoneType' object has no attribute 'texts_to_matrix'".
            raise RuntimeError(
                "TextPreprocessor has no tokenizer yet; call create_tokenizer() "
                "before transform_text()."
            )
        return self._tokenizer.texts_to_matrix(text_list)

In [45]:
# Split the question text at the same position as the labels so rows stay aligned.
questions = data["text"].values
train_qs, test_qs = questions[:train_size], questions[train_size:]

# Handed to the tokenizer as num_words; each encoded row has this many columns.
VOCAB_SIZE = 400

# Fit the tokenizer on the training questions only, then encode both splits with it.
processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [46]:
# Sanity check on the first encoded training row: its width, then its contents.
first_row = body_train[0]
print(len(first_row))
print(first_row)

400
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0

In [47]:
import pickle

# Persist the fitted preprocessor; CustomModelPrediction.from_path reads this file back.
with open("./processor_state.pkl", "wb") as state_file:
    pickle.dump(processor, state_file)

In [48]:
def create_model(vocab_size, num_tags):
    """Build and compile the feed-forward tag classifier.

    Args:
        vocab_size: Length of each input row.
        num_tags: Number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Two ReLU hidden layers followed by one sigmoid unit per tag.
    model = Sequential([
        Dense(50, input_shape=(vocab_size,), activation="relu"),
        Dense(25, activation="relu"),
        Dense(num_tags, activation="sigmoid"),
    ])

    # NOTE(review): with more than one sigmoid output, Keras may resolve the
    # "accuracy" string to categorical (argmax) accuracy rather than per-label
    # binary accuracy — confirm which metric is intended before relying on it.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# Instantiate the classifier for this vocabulary size / tag count and print its layer table.
model = create_model(VOCAB_SIZE, num_tags)
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_29 (Dense)            (None, 50)                20050     
                                                                 
 dense_30 (Dense)            (None, 25)                1275      
                                                                 
 dense_31 (Dense)            (None, 5)                 130       
                                                                 
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Train for 10 epochs; validation_split sets aside 10% of the training rows for validation.
model.fit(
    x=body_train,
    y=train_tags,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x287b419d0>

In [53]:
# Score the held-out test split with the same batch size used for training.
model.evaluate(x=body_test, y=test_tags, batch_size=128)



[0.10386884957551956, 0.8950318694114685]

In [54]:
# Write the trained model to the file that CustomModelPrediction.from_path loads.
model.save(filepath="keras_saved_model.h5")

In [55]:
import pickle
import os
import numpy as np

class CustomModelPrediction(object):
    """Bundle a trained model with its text preprocessor for raw-text prediction."""

    def __init__(self, model, processor):
        """Keep references to the model and the preprocessor.

        Args:
            model: Object exposing ``predict(matrix)``.
            processor: Object exposing ``transform_text(instances)``.
        """
        self._processor = processor
        self._model = model

    def predict(self, instances, **kwargs):
        """Preprocess ``instances``, run the model, and return plain lists.

        Args:
            instances: Raw inputs handed to the preprocessor's ``transform_text``.
            **kwargs: Accepted but not used.

        Returns:
            The model output converted with ``.tolist()``.
        """
        features = self._processor.transform_text(instances)
        return self._model.predict(features).tolist()

    @classmethod
    def from_path(cls, model_dir):
        """Load the saved model and pickled preprocessor from ``model_dir``.

        Expects ``keras_saved_model.h5`` and ``processor_state.pkl`` inside
        ``model_dir``.

        Returns:
            A new instance wrapping the loaded model and preprocessor.
        """
        import tensorflow.keras as keras

        model_path = os.path.join(model_dir, "keras_saved_model.h5")
        processor_path = os.path.join(model_dir, "processor_state.pkl")

        model = keras.models.load_model(model_path)
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # model_dir at artifacts you produced yourself.
        with open(processor_path, "rb") as state_file:
            processor = pickle.load(state_file)

        return cls(model, processor)


In [56]:
# Two sample questions used to smoke-test the reloaded classifier:
# the first mentions Keras, the second mentions Pandas.
test_requests = [
    "How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error:",
    "Change the bar item name in Pandas I have a test excel file like:"
]

In [58]:
# Reload the saved artifacts from the current directory and score the sample questions.
classifier = CustomModelPrediction.from_path(".")
results = classifier.predict(test_requests)
print(results)

# For each request, list every tag whose score is above 0.7.
for scores in results:
    print("Predicted labels:")
    confident = [tag_encoder.classes_[pos] for pos, score in enumerate(scores) if score > 0.7]
    for label in confident:
        print(label)
    print("\n")

[[0.3606712818145752, 6.416594987967983e-05, 0.004043933469802141, 0.0017825100803747773, 0.8948200941085815], [0.011447940021753311, 0.041343145072460175, 0.7622106671333313, 0.03102044016122818, 0.08442071825265884]]
Predicted labels:
tensorflow


Predicted labels:
pandas




2022-03-17 02:10:26.990840: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
