# Converting Scikit-learn Models to Tensorflow

We have trained a model in scikit previously. Now we replicate the same setup in tensorflow.


In [48]:
from tqdm.notebook import tqdm
import json
import re
import numpy as np 
import pandas as pd #
import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras import layers
from keras.preprocessing.text import Tokenizer
import tensorflow_constrained_optimization as tfco
import tensorflow_model_analysis as tfma
import fairness_indicators as fi
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from time import time
import re

# For kaggle only
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
tqdm.pandas()

if tf.__version__ < "2.0.0":
  tf.enable_eager_execution()
  print("Eager execution enabled.")
else:
  print("Eager execution enabled by default.")

print("TensorFlow " + tf.__version__)
print("FI " + fi.version.__version__)
print("TFMA " + tfma.VERSION_STRING)

Eager execution enabled by default.
TensorFlow 2.9.0
FI 0.44.0
TFMA 0.44.0


While the input is sentences, the classification is one of 4 labels. A model can only process numbers, therefore we need to convert these labels to ordinals.

For TS JS, we can piggyback on this pretrained model: https://github.com/tensorflow/tfjs-models/tree/master/universal-sentence-encoder. The Universal Sentence Encoder is a model that encodes text into 512-dimensional and uses an 8k word piece vocabulary.

We will do cleanup in python of the trained sentenses, and we'll try to find lemmatizers and stopword libraries in JS also.



In [49]:
# DATA = "/kaggle/input/linkedin/anonLinkedInProfiles.csv"
DATA = "./data/anonLinkedInProfiles.csv"
DATA_LEN = 1400
LABEL_DICT = dict()

data = pd.concat([chunk for chunk in tqdm(pd.read_csv(DATA, chunksize=1000), desc=f'Loadin {DATA}')])
print(f'Shape: {data.shape}, does it have NAs:\n{data.isna().any()}')

data = data.sample(DATA_LEN, random_state=200)
data = data.reset_index() # Reset index, since we will do operations on it!
print(f'Resampled Shape: {data.shape}')

def _get_or_set_label(x):
    if x not in LABEL_DICT:
        LABEL_DICT[x] = len(LABEL_DICT)
    return LABEL_DICT[x]

X = data['titles'].astype(str) +  ' ' + data['descriptions'].astype(str)
Y = data['class'].apply(lambda x: _get_or_set_label(x)).astype(int)

X.head()

Loadin ./data/anonLinkedInProfiles.csv: 0it [00:00, ?it/s]

Shape: (1497, 4), does it have NAs:
user            False
descriptions     True
titles           True
class           False
dtype: bool
Resampled Shape: (1400, 5)


0    founder investor using technology to solve the...
1    embrace the way of the success samurai and con...
2    software engineer blackboulder  nyc game devel...
3    i'm a visionary in the world of decentralizati...
4    data scientist strategic and resultsdriven bus...
dtype: object

As with scikit also with tensorflow, though we wish to use APIs that are portable to JS. We have to do some simple word preprocessing:

`String -> Token -> Lemma -> Embeddings`. 

Keras might solve this with their preprocessing.

In [50]:
import nltk

nltk.download('all')

hparams = {
    "batch_size": 128,
    "embedding_dim": 512, # from USE model
    "embedding_trainable": True,
    "learning_rate": 1e-2,
    "dropout_rate": 0.2
}

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data.

In [51]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk import pos_tag

WNL = WordNetLemmatizer()
STOP_WORDS = stopwords.words('english')

def clean_sentence(original_text):
  def _get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
  
  cleaned_text = original_text.strip()
  cleaned_text = original_text.translate(str.maketrans(' ', ' ', string.punctuation))
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', '\n')) # Remove newlines
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', string.digits)) # Remove digits
  cleaned_text = cleaned_text.lower() # Convert to lowercase
  cleaned_text = cleaned_text.split() # Split each sentence using delimiter

  lemmatized_list=[]
  for y in cleaned_text:
    if y in STOP_WORDS:
      continue
    z=WNL.lemmatize(y, _get_wordnet_pos(y))
    lemmatized_list.append(z)

  lemmatized_sentence = ' '.join(lemmatized_list)
  return lemmatized_sentence

X_cleaned = X.progress_apply(lambda text: clean_sentence(text))

  0%|          | 0/1400 [00:00<?, ?it/s]

In [56]:
import tensorflow as tf
from tensorflow.data import Dataset, AUTOTUNE

def text_to_dataset(texts,labels):
    return Dataset.from_tensor_slices((texts, labels.tolist())).cache().batch(hparams['batch_size']).prefetch(AUTOTUNE)

VAL_SIZE = int(len(X)*0.3)


x_train = X_cleaned[VAL_SIZE:]
y_train = Y[VAL_SIZE:]

x_val = X_cleaned[:VAL_SIZE]
y_val = Y[:VAL_SIZE]

TEST_SIZE = int(len(x_val)*0.3)

x_test = x_val[TEST_SIZE:]
y_test = y_val[TEST_SIZE:]
x_val = x_val[:TEST_SIZE]
y_val = y_val[:TEST_SIZE]

train_ds = text_to_dataset(x_train, y_train)
val_ds = text_to_dataset(x_test, y_test)
test_ds = text_to_dataset(x_val, y_val)

# get top 5 elements of this prefectched slice.
list(train_ds.take(1))[0][1][:5]

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 1, 3, 3, 4])>

Build the model. See: https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub_on_kaggle


In [71]:
import tensorflow_hub as hub

USE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

model = keras.Sequential(
    [
        hub.KerasLayer(USE_URL,
                    output_shape=[hparams['embedding_dim']], 
                    input_shape=[],     
                    dtype=tf.string,
                    name="in_pretrained",
                    trainable=hparams['embedding_trainable']),
        layers.Dense(int(hparams['embedding_dim']/4), activation="sigmoid", name="layer2"),
        layers.Dropout(hparams['dropout_rate']),
        layers.BatchNormalization(),
        layers.Dense(len(LABEL_DICT), activation="softmax", name="out")
    ]
)

# Load the model and view a summary.
model.compile(
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.optimizers.Adam(learning_rate=hparams['learning_rate']), 
    metrics = [keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
               keras.metrics.MeanAbsoluteError(name='mean_absolute_error')
            ])

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 in_pretrained (KerasLayer)  (None, 512)               256797824 
                                                                 
 layer2 (Dense)              (None, 128)               65664     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 batch_normalization_7 (Batc  (None, 128)              512       
 hNormalization)                                                 
                                                                 
 out (Dense)                 (None, 5)                 645       
                                                                 
Total params: 256,864,645
Trainable params: 256,864,389
Non-trainable params: 256
______________________________________

Fit model

In [72]:
import tensorflow_addons as tfa

tqdm_callback = tfa.callbacks.TQDMProgressBar()

history = model.fit(train_ds, 
                    epochs=4, 
                    verbose = 10,
                    validation_data = val_ds,
                    callbacks=[tqdm_callback]
                    )

history.history

Training:   0%|           0/4 ETA: ?s,  ?epochs/s

Epoch 1/4


0/8           ETA: ?s - 

Epoch 1/4


  return dispatch_target(*args, **kwargs)


Epoch 2/4


0/8           ETA: ?s - 

Epoch 2/4
Epoch 3/4


0/8           ETA: ?s - 

Epoch 3/4
Epoch 4/4


0/8           ETA: ?s - 

Epoch 4/4


{'loss': [1.061897873878479,
  0.19000741839408875,
  0.12084926664829254,
  0.10100548714399338],
 'accuracy': [0.6255102157592773,
  0.9306122660636902,
  0.9561224579811096,
  0.9551020264625549],
 'mean_absolute_error': [1.9469386339187622,
  1.9469386339187622,
  1.9469386339187622,
  1.9469386339187622],
 'val_loss': [0.7433084845542908,
  0.48479005694389343,
  0.36169373989105225,
  0.33019354939460754],
 'val_accuracy': [0.8571428656578064,
  0.884353756904602,
  0.9251700639724731,
  0.9115646481513977],
 'val_mean_absolute_error': [1.9741495847702026,
  1.9741495847702026,
  1.9741495847702026,
  1.9741495847702026]}

Move the model to TensorFlow JS.

Evaluate on test data


[0.33681318163871765, 0.9285714030265808, 1.9158730506896973]

In [97]:
from math import floor

print("Evaluating test data")
print(model.evaluate(test_ds, batch_size=hparams['batch_size']))

job_titles = ["IT Consultant at Sesame Street, lord of Java Code, who likes to learn new stuff and tries some machine learning in my free engineering time."]

print("\nEvaluating new Description")
probas = model.predict(job_titles)[0]
print(LABEL_DICT)
print(probas)

max_proba_idx = np.argmax(probas)
print(f'\nPredicted character: [{list(LABEL_DICT)[max_proba_idx]}] with probability of: [{floor(probas[max_proba_idx]*100.0)}%]')

Evaluating test data
[0.33681318163871765, 0.9285714030265808, 1.9158730506896973]

Evaluating new Description
{'bigbird': 0, 'count': 1, 'grover': 2, 'grouch': 3, 'erniebert': 4}
[0.07306869 0.04236591 0.61606747 0.09085583 0.17764209]

Predicted character: [grover] with probability of: [61%]
