<a href="https://colab.research.google.com/gist/absin1/b90f9eba8c0dec5d0e2391253df768ee/copy-of-transfer-learning-semantic-similarity-with-tf-hub-universal-encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence-level embeddings as easy as it has historically been to look up the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence-level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


# Getting Started

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [1]:
# # Install the latest Tensorflow version.
# !pip3 install --quiet "tensorflow>=1.7"
# # Install TF-Hub.
# !pip3 install --quiet tensorflow-hub
# !pip3 install seaborn

More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

from bert_serving.client import BertClient


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
import psycopg2
import pandas.io.sql as sqlio
import numpy as np

def get_dataframe_sql(dsn=None):
    """Load the labelled emotion dataset from PostgreSQL and shuffle it.

    Args:
        dsn: Optional connection string passed to ``psycopg2.connect``. When
            omitted, the ``EMOTION_DB_DSN`` environment variable is used so
            that credentials are never stored in the notebook itself.

    Returns:
        A DataFrame with columns ``label`` (categorical dtype) and ``text``,
        with rows in shuffled order.

    Raises:
        RuntimeError: if no DSN is passed and ``EMOTION_DB_DSN`` is unset.
        psycopg2.DatabaseError: re-raised after being printed when the
            connection or query raises it.
    """
    # SECURITY: this function previously embedded the host, user and password
    # in source. Anything that was committed that way should be rotated.
    if dsn is None:
        dsn = os.environ.get("EMOTION_DB_DSN")
    if not dsn:
        raise RuntimeError(
            "No database DSN configured: pass dsn=... or set the "
            "EMOTION_DB_DSN environment variable."
        )

    sql = "select emotion as label, text_ as text from dataset_emotion_only"
    con = None
    try:
        con = psycopg2.connect(dsn)
        df = sqlio.read_sql_query(sql, con)
    except psycopg2.DatabaseError as e:
        if con:
            con.rollback()
        print(e)
        # Re-raise rather than calling sys.exit(): `sys` was never imported,
        # so the old call masked the real error with a NameError.
        raise
    finally:
        if con:
            con.close()

    # frac=1.0 returns every row in random order, i.e. a shuffle.
    df = df.sample(frac=1.0)
    df["label"] = df["label"].astype('category')
    return df
  
# Fetch the shuffled dataset, then send each row to the training split when its
# random draw is below 0.8 and to the test split otherwise.
df = get_dataframe_sql()
msk = np.random.rand(df.shape[0]) < 0.8
df_train = df.loc[msk]
df_test = df.loc[~msk]
df_train.head()

Unnamed: 0,label,text
26915,worry,yup our coke blades b annnd now i only need th...
37746,happiness,having a cup of tea i have a cold so it's tast...
15211,worry,sucks about your cat... hope you guys feel better
8556,surprise,wow their is no pancake mix
28397,neutral,hey there what's up?


In [4]:
# Number of distinct label categories; later used as the output layer width.
category_counts = len(df_train["label"].cat.categories)
category_counts

14

In [5]:
# Load a BERT module from TF Hub. In the cells shown here it is only inspected
# (signature names and pooled-output width); the embeddings fed to the
# classifier are produced by BertClient instead.
bert_module = hub.Module("https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1", trainable=True)


In [6]:
bert_module.get_signature_names()

['mlm', 'tokens', 'tokenization_info']

In [7]:
# Read the size of dimension 1 of the 'pooled_output' tensor exposed by the
# module's 'tokens' signature.
embed_size=bert_module.get_output_info_dict(signature="tokens")['pooled_output'].get_shape()[1].value
embed_size

768

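## Build the Keras classifier head
The model takes pre-computed 768-dimensional float sentence embeddings as its input; the tutorial's Lambda layer that wrapped the embed module and cast the input to a string is commented out and not used here.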

In [8]:
# Classifier head over pre-computed sentence embeddings: a 768-wide float32
# input, one hidden ReLU layer, and one sigmoid unit per label category.
embedding = layers.Input(shape=(768,), dtype='float32')
dense = layers.Dense(units=256, activation='relu')(embedding)
pred = layers.Dense(units=category_counts, activation='sigmoid')(dense)

model = Model(inputs=[embedding], outputs=pred)
# NOTE(review): the labels fed to this model appear to be one-hot (one emotion
# per text). Sigmoid + binary_crossentropy scores each class independently;
# confirm this is intended rather than softmax + categorical_crossentropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

W0808 15:39:07.493130 140036893017920 deprecation_wrapper.py:119] From /home/chirag/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0808 15:39:07.494173 140036893017920 deprecation_wrapper.py:119] From /home/chirag/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0808 15:39:07.495821 140036893017920 deprecation_wrapper.py:119] From /home/chirag/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0808 15:39:07.513628 140036893017920 deprecation_wrapper.py:119] From /home/chirag/venv/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0808 15:39:07.529678 140036

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               196864    
_________________________________________________________________
dense_2 (Dense)              (None, 14)                3598      
Total params: 200,462
Trainable params: 200,462
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Connect to the BERT embedding server. The address can be overridden with the
# BERT_SERVER_IP environment variable; the default preserves the original
# hard-coded host so existing setups keep working.
bc = BertClient(ip=os.environ.get('BERT_SERVER_IP', '192.168.0.101'), check_length=False)

In [10]:
# Training inputs: raw sentences, their dummy-encoded labels as int8, and the
# sentence vectors returned by the BertClient.
train_text = df_train['text'].tolist()
train_label = pd.get_dummies(df_train.label).values.astype(np.int8)
train_text_enc = bc.encode(train_text)

In [12]:
train_text_enc.shape

(31866, 768)

In [13]:
train_label[:3]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int8)

In [14]:
# Held-out inputs: raw sentences, their dummy-encoded labels as int8, and the
# sentence vectors returned by the BertClient.
test_text = df_test['text'].tolist()
test_label = pd.get_dummies(df_test.label).values.astype(np.int8)
test_text_enc = bc.encode(test_text)

In [15]:
test_text_enc.shape

(8134, 768)

## Train Keras model and save weights
This only trains and saves our Keras layers, not the embedding module's weights.

In [16]:
# Fit the dense head on the pre-computed embeddings inside a dedicated TF
# session, validating on the held-out split, then write the weights to disk.
with tf.Session() as session:
    K.set_session(session)
    # Variables and tables must be initialised in this session before fitting.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(
        x=train_text_enc,
        y=train_label,
        batch_size=32,
        epochs=5,
        validation_data=(test_text_enc, test_label),
    )
    model.save_weights('./model.h5')

W0808 15:48:46.041866 140036893017920 deprecation_wrapper.py:119] From /home/chirag/venv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 31866 samples, validate on 8134 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
!ls -alh | grep model.h5

-rw-r--r-- 1 chirag chirag 797K Aug  8 15:48 model.h5


## Make predictions

In [76]:
# Hand-written sentences used to spot-check the trained classifier.
new_text = [
    "The bottle is blue in color",
    "I hate you so much",
    "I had an amazing day at the stadium",
    "It was super fun after playing football",
    "my computer works fine",
    "I was shocked when I heard the airplane got crashed",
    "What the fuck!!!!!!",
    "What is your name?",
    "I was surprised when she got a gold medal for India.",
]
# Encode the sentences with the same BertClient used for the training data.
new_text_enc = bc.encode(new_text)

In [77]:
new_text_enc.shape

(9, 768)

In [78]:
# Score the new sentences in a fresh TF session.
with tf.Session() as session:
    K.set_session(session)
    # Order matters: run the initialisers first, then load the saved weights,
    # so the values read from ./model.h5 are the ones used by predict().
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights('./model.h5')  
    # One row of per-category scores per input sentence.
    predicts = model.predict(new_text_enc, batch_size=32)

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7f5c68965208>>
Traceback (most recent call last):
  File "/home/chirag/venv/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')


In [79]:
predicts

array([[2.0350814e-03, 2.0157099e-03, 7.1240187e-02, 2.4720430e-03,
        2.6770711e-02, 1.2044600e-01, 3.2000571e-02, 3.2670200e-02,
        3.7681589e-01, 6.7488849e-02, 4.2224020e-02, 8.6188316e-05,
        9.1425061e-02, 7.5395077e-02],
       [1.3996959e-03, 9.3549490e-04, 6.8725049e-03, 1.4655590e-03,
        3.9370656e-03, 2.0048231e-02, 2.6898789e-01, 1.8154705e-01,
        8.0556512e-02, 1.1915535e-02, 2.5613678e-01, 0.0000000e+00,
        2.0895064e-02, 1.3145527e-01],
       [1.0221601e-03, 1.2023151e-03, 1.2288451e-02, 7.5846016e-03,
        2.7428955e-02, 5.6871986e-01, 7.5955987e-03, 9.8268420e-02,
        1.4171860e-01, 5.6668073e-02, 3.0237108e-02, 8.8721514e-05,
        1.4097255e-01, 8.9795619e-02],
       [6.8944693e-04, 3.6239624e-04, 5.9154034e-03, 6.1479807e-03,
        1.5680462e-01, 6.9085258e-01, 2.9958785e-03, 1.5326336e-01,
        7.6561719e-02, 9.0712190e-02, 1.1697590e-02, 2.3424625e-05,
        7.8079134e-02, 3.4933239e-02],
       [8.3343387e-03, 1.509

In [80]:
# Translate each prediction row into the name of its highest-scoring category.
# Assumes the score columns are in the same order as cat.categories - TODO confirm.
categories = df_train.label.cat.categories.tolist()
# Despite the name, these are argmax column indices, not logits.
predict_logits = np.argmax(predicts, axis=1)
predict_labels = [categories[class_idx] for class_idx in predict_logits]
predict_labels

['neutral',
 'hate',
 'happiness',
 'happiness',
 'worry',
 'worry',
 'worry',
 'neutral',
 'worry']

In [81]:
# For every sentence, list each category whose predicted score exceeds
# `threshold`. Loop variables are named `scores`/`score` so they no longer
# overwrite the module-level `pred` tensor created when the model was built.
threshold = 0.1
for sentence, scores in zip(new_text, predicts):
    print(sentence+'--->')
    for category, score in zip(categories, scores):
        if score > threshold:
            print('\t'+category+'--->'+str(score))

The bottle is blue in color--->
	happiness--->0.120446
	neutral--->0.3768159
I hate you so much--->
	hate--->0.2689879
	love--->0.18154705
	sadness--->0.25613678
	worry--->0.13145527
I had an amazing day at the stadium--->
	happiness--->0.56871986
	neutral--->0.1417186
	surprise--->0.14097255
It was super fun after playing football--->
	fun--->0.15680462
	happiness--->0.6908526
	love--->0.15326336
my computer works fine--->
	happiness--->0.14424214
	neutral--->0.14992711
	relief--->0.1169765
	worry--->0.3324223
I was shocked when I heard the airplane got crashed--->
	sadness--->0.13823363
	surprise--->0.24176374
	worry--->0.49479437
What the fuck!!!!!!--->
	surprise--->0.13577586
	worry--->0.3363986
What is your name?--->
	neutral--->0.6022959
	surprise--->0.3626613
	worry--->0.1577163
I was surprised when she got a gold medal for India.--->
	happiness--->0.15419748
	neutral--->0.1643461
	worry--->0.39462605
