# Converting Scikit-learn Models to Tensorflow

We have trained a model in scikit previously. Now we replicate the same setup in tensorflow.


In [7]:
from tqdm.notebook import tqdm
import json
import re
import numpy as np 
import pandas as pd #
import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras import layers
from keras.preprocessing.text import Tokenizer
import tensorflow_constrained_optimization as tfco
import tensorflow_model_analysis as tfma
import fairness_indicators as fi
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from time import time
import re

# For kaggle only
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
tqdm.pandas()

if tf.__version__ < "2.0.0":
  tf.enable_eager_execution()
  print("Eager execution enabled.")
else:
  print("Eager execution enabled by default.")

print("TensorFlow " + tf.__version__)
print("FI " + fi.version.__version__)
print("TFMA " + tfma.VERSION_STRING)



Eager execution enabled by default.
TensorFlow 2.9.0
FI 0.44.0
TFMA 0.44.0


While the input is sentences, the classification is one of 4 labels. A model can only process numbers, therefore we need to convert these labels to ordinals.

For TS JS, we can piggyback on this pretrained model: https://github.com/tensorflow/tfjs-models/tree/master/universal-sentence-encoder. The Universal Sentence Encoder is a model that encodes text into 512-dimensional and uses an 8k word piece vocabulary.

We will do cleanup in python of the trained sentenses, and we'll try to find lemmatizers and stopword libraries in JS also.



In [8]:
# DATA = "/kaggle/input/linkedin/anonLinkedInProfiles.csv"
DATA = "./data/anonLinkedInProfiles.csv"
data = pd.concat([chunk for chunk in tqdm(pd.read_csv(DATA, chunksize=1000), desc=f'Loadin {DATA}')])
print(f'Shape: {data.shape}, does it have NAs:\n{data.isna().any()}')

DATA_LEN = 1400
data = data.sample(DATA_LEN, random_state=200)
data = data.reset_index() # Reset index, since we will do operations on it!
print(f'Resampled Shape: {data.shape}')

def _get_or_set_label(x):
    if x not in LABEL_DICT:
        LABEL_DICT[x] = len(LABEL_DICT)
    return LABEL_DICT[x]

LABEL_DICT = dict()
TRAIN_SIZE = int(DATA_LEN * 0.8)
VAL_SIZE = int((DATA_LEN-TRAIN_SIZE) * 0.3)
TEST_SIZE = int((DATA_LEN-TRAIN_SIZE) * 0.7)

X = data['titles'].astype(str) +  ' ' + data['descriptions'].astype(str)
Y = data['class'].apply(lambda x: _get_or_set_label(x))

X_train = X.iloc[TRAIN_SIZE:]
y_train = Y.iloc[TRAIN_SIZE:]

X_val = X.iloc[:VAL_SIZE]
y_val = Y.iloc[:VAL_SIZE]

X_test = X.iloc[VAL_SIZE:TEST_SIZE]
y_test = Y.iloc[VAL_SIZE:TEST_SIZE]


X_test.head()

Loadin ./data/anonLinkedInProfiles.csv: 0it [00:00, ?it/s]

Shape: (1497, 4), does it have NAs:
user            False
descriptions     True
titles           True
class           False
dtype: bool
Resampled Shape: (1400, 5)


84    senior business development manager at shrinkc...
85    software engineer creative and datadriven digi...
86    i am the ceo of cryptofy a platform that makes...
87    leading the charge on growth hacking and scali...
88    innovative web developer with expertise in dev...
dtype: object

As with scikit also with tensorflow, though we wish to use APIs that are portable to JS. We have to do some simple word preprocessing:

`String -> Token -> Lemma -> Embeddings`. 

Keras might solve this with their preprocessing.

In [9]:
import tensorflow as tf
import tensorflow_hub as hub
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
from nltk import pos_tag

nltk.download('all')
wnl = WordNetLemmatizer()
STOP_WORDS = stopwords.words('english')
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

hparams = {
    "batch_size": 128,
    "constraint_learning_rate": 0.01,
    "embedding_dim": 512, # from USE model
    "embedding_trainable": False,
    "learning_rate": 1e-2,
    "dropout_rate": 0.2,
    "max_num_words": 10000
}

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\bigworker\AppData\Roaming\nltk_data.

In [10]:
def clean_sentence(original_text):
  def _get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
  
  cleaned_text = original_text.strip()
  cleaned_text = original_text.translate(str.maketrans(' ', ' ', string.punctuation))
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', '\n')) # Remove newlines
  cleaned_text = cleaned_text.translate(str.maketrans(' ', ' ', string.digits)) # Remove digits
  cleaned_text = cleaned_text.lower() # Convert to lowercase
  cleaned_text = cleaned_text.split() # Split each sentence using delimiter

  lemmatized_list=[]
  for y in cleaned_text:
    if y in STOP_WORDS:
      continue
    z=wnl.lemmatize(y, _get_wordnet_pos(y))
    lemmatized_list.append(z)

  lemmatized_sentence = ' '.join(lemmatized_list)
  return lemmatized_sentence

def text_to_dataset(texts,labels):
    #embeddings = embed(texts)

    return tf.data.Dataset.from_tensor_slices((texts, labels.tolist())).cache().batch(hparams['batch_size']).prefetch(tf.data.AUTOTUNE)

X_test = X_test.progress_apply(lambda text: clean_sentence(text))
X_train = X_train.progress_apply(lambda text: clean_sentence(text))

train_ds = text_to_dataset(X_train, y_train)
test_ds = text_to_dataset(X_test, y_test)

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/280 [00:00<?, ?it/s]

Build the model. See: https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub_on_kaggle


In [58]:
model = keras.Sequential(
    [
        hub.KerasLayer(module_url,
                    output_shape=[hparams['embedding_dim']], 
                    input_shape=[],     
                    dtype=tf.string,
                    name="in_pretrained",
                    trainable=hparams['embedding_trainable']),
        layers.Dense(int(hparams['embedding_dim']/4), activation="sigmoid", name="layer2"),
        layers.Dropout(hparams['dropout_rate']),
        layers.Dense(len(LABEL_DICT), name="out"),
        layers.BatchNormalization(),
        layers.Softmax()
    ]
)

# Load the model and view a summary.
model.compile(
    loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.optimizers.Adam(learning_rate=hparams['learning_rate']), 
    metrics = [keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
               keras.metrics.MeanAbsoluteError(name='mean_absolute_error')
            ])

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 in_pretrained (KerasLayer)  (None, 512)               256797824 
                                                                 
 layer2 (Dense)              (None, 128)               65664     
                                                                 
 dropout_15 (Dropout)        (None, 128)               0         
                                                                 
 out (Dense)                 (None, 5)                 645       
                                                                 
 batch_normalization_12 (Bat  (None, 5)                20        
 chNormalization)                                                
                                                                 
 softmax_7 (Softmax)         (None, 5)                 0         
                                                      

Fit model

In [59]:
import tensorflow_addons as tfa

tqdm_callback = tfa.callbacks.TQDMProgressBar()

history = model.fit(train_ds, 
                    epochs=10, 
                    verbose = 10,
                    validation_data=(X_val, y_val), 
                    callbacks=[tqdm_callback]
                    )

history.history

Training:   0%|           0/10 ETA: ?s,  ?epochs/s

Epoch 1/10


0/3           ETA: ?s - 

Epoch 1/10
Epoch 2/10


0/3           ETA: ?s - 

Epoch 2/10
Epoch 3/10


0/3           ETA: ?s - 

Epoch 3/10
Epoch 4/10


0/3           ETA: ?s - 

Epoch 4/10
Epoch 5/10


0/3           ETA: ?s - 

Epoch 5/10
Epoch 6/10


0/3           ETA: ?s - 

Epoch 6/10
Epoch 7/10


0/3           ETA: ?s - 

Epoch 7/10
Epoch 8/10


0/3           ETA: ?s - 

Epoch 8/10
Epoch 9/10


0/3           ETA: ?s - 

Epoch 9/10
Epoch 10/10


0/3           ETA: ?s - 

Epoch 10/10


{'loss': [1.5898497104644775,
  1.43599534034729,
  1.352028250694275,
  1.2804293632507324,
  1.2536221742630005,
  1.2248784303665161,
  1.1945401430130005,
  1.1855881214141846,
  1.163711667060852,
  1.149172067642212],
 'accuracy': [0.2678571343421936,
  0.5464285612106323,
  0.7428571581840515,
  0.8642857074737549,
  0.8678571581840515,
  0.9035714268684387,
  0.9321428537368774,
  0.8999999761581421,
  0.9071428775787354,
  0.925000011920929],
 'mean_absolute_error': [1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508,
  1.9135713577270508],
 'val_loss': [1.5420663356781006,
  1.4622504711151123,
  1.3861982822418213,
  1.3244143724441528,
  1.2732881307601929,
  1.2303032875061035,
  1.1935789585113525,
  1.1624155044555664,
  1.1364467144012451,
  1.1161291599273682],
 'val_accuracy': [0.6785714030265808,
  0.7857142686843872,
  0.809

Move the model to TensorFlow JS.

In [60]:
print(LABEL_DICT)

job_titles = ["IT Consultant at Sesame Street, lord of Java Code, who likes to learn new stuff and tries some machine learning in my free engineering time."]
model.predict(job_titles)

{'bigbird': 0, 'count': 1, 'grover': 2, 'grouch': 3, 'erniebert': 4}


array([[0.25792286, 0.03674087, 0.50679684, 0.15116522, 0.04737413]],
      dtype=float32)