# Converting Scikit-learn Models to Tensorflow

We have trained a model in scikit previously. Now we replicate the same setup in tensorflow.


In [1]:
from tqdm.notebook import tqdm
import json
import re
import numpy as np 
import pandas as pd #
import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras import layers
from keras.preprocessing.text import Tokenizer
import tensorflow_constrained_optimization as tfco
import tensorflow_model_analysis as tfma
import fairness_indicators as fi
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from time import time
import re

# For kaggle only
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
tqdm.pandas()

if tf.__version__ < "2.0.0":
  tf.enable_eager_execution()
  print("Eager execution enabled.")
else:
  print("Eager execution enabled by default.")

print("TensorFlow " + tf.__version__)
print("FI " + fi.version.__version__)
print("TFMA " + tfma.VERSION_STRING)

Eager execution enabled by default.
TensorFlow 2.9.0
FI 0.44.0
TFMA 0.44.0


While the input is sentences, the classification is one of 4 labels. A model can only process numbers, therefore we need to convert these labels to ordinals.

For TS JS, we can piggyback on this pretrained model: https://github.com/tensorflow/tfjs-models/tree/master/universal-sentence-encoder. The Universal Sentence Encoder is a model that encodes text into 512-dimensional and uses an 8k word piece vocabulary.

We will do cleanup in python of the trained sentenses, and we'll try to find lemmatizers and stopword libraries in JS also.



In [2]:
# DATA = "/kaggle/input/linkedin/anonLinkedInProfiles.csv"
DATA = "./data/anonLinkedInProfiles.csv"
DATA_LEN = 1400
LABEL_DICT = dict()

data = pd.concat([chunk for chunk in tqdm(pd.read_csv(DATA, chunksize=1000), desc=f'Loadin {DATA}')])
print(f'Shape: {data.shape}, does it have NAs:\n{data.isna().any()}')

data = data.sample(DATA_LEN, random_state=200)
data = data.reset_index() # Reset index, since we will do operations on it!
print(f'Resampled Shape: {data.shape}')

def _get_or_set_label(x):
    if x not in LABEL_DICT:
        LABEL_DICT[x] = len(LABEL_DICT)
    return LABEL_DICT[x]

X = data['titles'].astype(str) +  ' ' + data['descriptions'].astype(str)
Y = data['class'].apply(lambda x: _get_or_set_label(x)).astype(int)

X.head()

Loadin ./data/anonLinkedInProfiles.csv: 0it [00:00, ?it/s]

Shape: (1497, 4), does it have NAs:
user            False
descriptions     True
titles           True
class           False
dtype: bool
Resampled Shape: (1400, 5)


0    founder investor using technology to solve the...
1    embrace the way of the success samurai and con...
2    software engineer blackboulder  nyc game devel...
3    i'm a visionary in the world of decentralizati...
4    data scientist strategic and resultsdriven bus...
dtype: object

As with scikit also with tensorflow, though we wish to use APIs that are portable to JS. We have to do some simple word preprocessing:

`String -> Token -> Lemma -> Embeddings`. 

Keras might solve this with their preprocessing.

In [3]:
import nltk

nltk.download('all')

hparams = {
    "batch_size": 64,
    "embedding_dim": 512, # from USE model
    "embedding_trainable": True,
    "learning_rate": 1e-2,
    "dropout_rate": 0.2
}

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data.

In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk import pos_tag

WNL = WordNetLemmatizer()
STOP_WORDS = stopwords.words('english')

def clean_sentence(original_text):
  def _get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
  
  cleaned_text = original_text.strip()
  cleaned_text = original_text.translate(str.maketrans(' ', ' ', string.punctuation))
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', '\n')) # Remove newlines
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', string.digits)) # Remove digits
  cleaned_text = cleaned_text.lower() # Convert to lowercase
  cleaned_text = cleaned_text.split() # Split each sentence using delimiter

  lemmatized_list=[]
  for y in cleaned_text:
    if y in STOP_WORDS:
      continue
    z=WNL.lemmatize(y, _get_wordnet_pos(y))
    lemmatized_list.append(z)

  lemmatized_sentence = ' '.join(lemmatized_list)
  return lemmatized_sentence

X_cleaned = X.progress_apply(lambda text: clean_sentence(text))

  0%|          | 0/1400 [00:00<?, ?it/s]

In [5]:
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE
import tensorflow_hub as hub

EMBED = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def text_to_dataset(texts,labels):
    return Dataset.from_tensor_slices((EMBED(texts), labels.tolist())).cache().batch(hparams['batch_size'],drop_remainder=True).prefetch(AUTOTUNE)

VAL_SIZE = int(len(X)*0.3)


x_train = X_cleaned[VAL_SIZE:]
y_train = Y[VAL_SIZE:]

x_val = X_cleaned[:VAL_SIZE]
y_val = Y[:VAL_SIZE]

TEST_SIZE = int(len(x_val)*0.3)

x_test = x_val[TEST_SIZE:]
y_test = y_val[TEST_SIZE:]
x_val = x_val[:TEST_SIZE]
y_val = y_val[:TEST_SIZE]

print(f'Sizes for TEST: {TEST_SIZE}, validation: {VAL_SIZE} and train: {len(X) - VAL_SIZE}')

train_ds = text_to_dataset(x_train, y_train)
val_ds = text_to_dataset(x_test, y_test)
test_ds = text_to_dataset(x_val, y_val)

# get top 5 elements of this prefectched slice.
list(test_ds.take(1))[0]

Sizes for TEST: 126, validation: 420 and train: 980


(<tf.Tensor: shape=(64, 512), dtype=float32, numpy=
 array([[-0.03083667, -0.0665146 , -0.0182992 , ...,  0.05561822,
         -0.04941908, -0.01916869],
        [ 0.02800515, -0.0724399 , -0.05752612, ..., -0.00541646,
          0.05067081, -0.07265382],
        [ 0.05944948,  0.02337807,  0.02523481, ...,  0.04628878,
          0.02467602, -0.03657639],
        ...,
        [-0.00581072, -0.07151613,  0.05340949, ..., -0.04404993,
         -0.06780335, -0.00589926],
        [ 0.00980827, -0.05763037, -0.0325496 , ...,  0.06339546,
         -0.06771517, -0.06418051],
        [ 0.01481714, -0.02670703,  0.04458027, ..., -0.01660948,
         -0.04708564, -0.0305433 ]], dtype=float32)>,
 <tf.Tensor: shape=(64,), dtype=int32, numpy=
 array([0, 1, 2, 3, 2, 3, 4, 4, 2, 0, 4, 3, 0, 3, 3, 4, 1, 3, 2, 3, 0, 2,
        1, 2, 2, 3, 2, 2, 0, 3, 1, 3, 3, 1, 0, 1, 1, 2, 3, 3, 1, 1, 0, 3,
        2, 2, 2, 2, 0, 0, 3, 3, 2, 2, 0, 4, 3, 3, 3, 1, 4, 4, 3, 4])>)

Build the model. See: https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub_on_kaggle


In [6]:
model = keras.Sequential(
    [
        layers.InputLayer(input_shape=(512,),batch_size=hparams['batch_size'], name="in_embeddings"),
        layers.Dense(int(hparams['embedding_dim']/4),batch_size=hparams['batch_size'], activation="sigmoid", name="layer1"),
        layers.Dropout(hparams['dropout_rate'],batch_size=hparams['batch_size'], name="drop"),
        layers.BatchNormalization(batch_size=hparams['batch_size']),
        layers.Dense(len(LABEL_DICT),batch_size=hparams['batch_size'], activation="softmax", name="out")
    ]
)

# Load the model and view a summary.
model.compile(
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.optimizers.Adam(learning_rate=hparams['learning_rate']), 
    metrics = [keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
               keras.metrics.MeanAbsoluteError(name='mean_absolute_error')
            ])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 layer1 (Dense)              (64, 128)                 65664     
                                                                 
 drop (Dropout)              (64, 128)                 0         
                                                                 
 batch_normalization (BatchN  (64, 128)                512       
 ormalization)                                                   
                                                                 
 out (Dense)                 (64, 5)                   645       
                                                                 
Total params: 66,821
Trainable params: 66,565
Non-trainable params: 256
_________________________________________________________________


Fit model

In [7]:
import tensorflow_addons as tfa

tqdm_callback = tfa.callbacks.TQDMProgressBar()

history = model.fit(train_ds, 
                    epochs=10, 
                    verbose = 10,
                    validation_data = val_ds,
                    callbacks=[tqdm_callback]
                    )

history.history


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.9.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Training:   0%|           0/10 ETA: ?s,  ?epochs/s

Epoch 1/10


0/15           ETA: ?s - 

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10


0/15           ETA: ?s - 

Epoch 2/10
Epoch 3/10


0/15           ETA: ?s - 

Epoch 3/10
Epoch 4/10


0/15           ETA: ?s - 

Epoch 4/10
Epoch 5/10


0/15           ETA: ?s - 

Epoch 5/10
Epoch 6/10


0/15           ETA: ?s - 

Epoch 6/10
Epoch 7/10


0/15           ETA: ?s - 

Epoch 7/10
Epoch 8/10


0/15           ETA: ?s - 

Epoch 8/10
Epoch 9/10


0/15           ETA: ?s - 

Epoch 9/10
Epoch 10/10


0/15           ETA: ?s - 

Epoch 10/10


{'loss': [0.9534054398536682,
  0.3047407865524292,
  0.21180962026119232,
  0.156783789396286,
  0.12977804243564606,
  0.09256439656019211,
  0.07765976339578629,
  0.07588458806276321,
  0.056048352271318436,
  0.05749953165650368],
 'accuracy': [0.6604166626930237,
  0.9020833373069763,
  0.9291666746139526,
  0.9416666626930237,
  0.9552083611488342,
  0.9739583134651184,
  0.9729166626930237,
  0.9739583134651184,
  0.9822916388511658,
  0.981249988079071],
 'mean_absolute_error': [1.9491668939590454,
  1.9491668939590454,
  1.9491668939590454,
  1.9491666555404663,
  1.9491668939590454,
  1.9491668939590454,
  1.9491666555404663,
  1.9491668939590454,
  1.9491668939590454,
  1.9491668939590454],
 'val_loss': [0.8772222995758057,
  0.709648847579956,
  0.6346638202667236,
  0.548626184463501,
  0.4619886875152588,
  0.41977015137672424,
  0.36388158798217773,
  0.3197943866252899,
  0.2798905074596405,
  0.2563992142677307],
 'val_accuracy': [0.74609375,
  0.734375,
  0.7421875,


Evaluate and move the model to TensorFlow JS.

In [8]:
from math import floor

print("Evaluating test data")
print(model.evaluate(test_ds, batch_size=hparams['batch_size']))

# Remember all our imputs need to be embedded first!
job_titles = EMBED(["IT Consultant at Sesame Street, lord of Java Code, who likes to learn new stuff and tries some machine learning in my free engineering time."])

print("\nEvaluating new Description")
probas = model.predict(job_titles)[0]
print(LABEL_DICT)
print(probas)

max_proba_idx = np.argmax(probas)
print(f'\nPredicted character: [{list(LABEL_DICT)[max_proba_idx]}] with probability of: [{floor(probas[max_proba_idx]*100.0)}%]')

Evaluating test data
[0.21909888088703156, 0.90625, 1.9562499523162842]

Evaluating new Description
{'bigbird': 0, 'count': 1, 'grover': 2, 'grouch': 3, 'erniebert': 4}
[0.00145273 0.00299333 0.9214264  0.07172145 0.00240595]

Predicted character: [grover] with probability of: [92%]


In [9]:
MODEL_PATH = 'models/tf'

# Need to prevent the input from requesting a batch size for now. Else we cannot do individual predictions in JS.
input_layer = layers.InputLayer(input_shape=(512,),batch_size=1, name="in_embeddings"),
model.layers[0] = input_layer

model.save(MODEL_PATH)

model = keras.models.load_model(MODEL_PATH)
model.predict(job_titles)

INFO:tensorflow:Assets written to: models/tf\assets


INFO:tensorflow:Assets written to: models/tf\assets




array([[0.00145273, 0.00299333, 0.9214264 , 0.07172145, 0.00240595]],
      dtype=float32)

What remains is:
```
tensorflowjs_converter \
    --input_format=tf_saved_model \
    models/tf \
    models/tfjs
```