<a href="https://colab.research.google.com/gist/absin1/b90f9eba8c0dec5d0e2391253df768ee/copy-of-transfer-learning-semantic-similarity-with-tf-hub-universal-encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence level embeddings as easy as it has historically been to lookup the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


# Getting Started

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [2]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install seaborn



More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [4]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [5]:
# Import the Universal Sentence Encoder's TF Hub module
embed = hub.Module(module_url)

In [6]:
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
embed_size

512

In [36]:
import psycopg2
import pandas.io.sql as sqlio
import numpy as np

def get_dataframe_sql():
    df = None
    sql = "select emotion as label, text_ as text from dataset_emotion_only"
    con = None
    try:
        con = psycopg2.connect("host='35.200.234.61' dbname='sales' user='postgres' password='cx6ac54nmgGtLD1y'")
        df = sqlio.read_sql_query(sql, con)
    except psycopg2.DatabaseError as e:
        if con:
            con.rollback()
        print(e)
        sys.exit(1)
    finally:
        if con:
            con.close()
    df = df.sample(frac=1.0)
    df.label = df.label.astype('category')
    return df
  
df = get_dataframe_sql()
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
df_test = df[~msk]
df_train.head()

Unnamed: 0,label,text
6312,relief,cause i wasn't there
11747,worry,i really hope my parents don't make me stay ho...
36812,happiness,that's part of what i've been working on...use...
2384,happiness,come hang out wif meeee
23631,worry,oooh... that's right by the zoo... think... in...


In [13]:
category_counts = len(df_train.label.cat.categories)
category_counts

14

## Wrap embed module in a Lambda layer
Explicitly cast the input as a string

In [14]:
def UniversalEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

In [16]:
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
dense = layers.Dense(256, activation='relu')(embedding)
pred = layers.Dense(category_counts, activation='sigmoid')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 14)                3598      
Total params: 134,926
Trainable params: 134,926
Non-trainable params: 0
_________________________________________________________________


In [18]:
train_text = df_train['text'].tolist()
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

train_label = np.asarray(pd.get_dummies(df_train.label), dtype = np.int8)

In [19]:
train_text.shape

(31866, 1)

In [20]:
train_label.shape

(31866, 14)

In [21]:
train_label[:3]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int8)

In [22]:
test_text = df_test['text'].tolist()
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = np.asarray(pd.get_dummies(df_test.label), dtype = np.int8)

## Train Keras model and save weights
This only train and save our Keras layers not the embed module' weights.

In [44]:
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(train_text, 
            train_label,
            validation_data=(test_text, test_label),
            epochs=5,
            batch_size=32)
    model.save_weights('./model.h5')

Train on 31866 samples, validate on 8134 samples
Epoch 1/5
   32/31866 [..............................] - ETA: 18:10 - loss: 0.6967 - acc: 0.4330

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fa034547860>>
Traceback (most recent call last):
  File "/home/absin/git/ai/venv/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')




Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fa0344ff6d8>>
Traceback (most recent call last):
  File "/home/absin/git/ai/venv/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
!ls -alh | grep model.h5

-rw-r--r--  1 absin absin 542K Aug  7 19:26 model.h5


## Make predictions

In [46]:
new_text = ["This is not exciting at all", "I don't like you at all",  "You forgot to mention just how smart the jews are, next they will make a deadly virus to kill ALL people BUT the jews!"]
new_text = np.array(new_text, dtype=object)[:, np.newaxis]
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights('./model.h5')  
    predicts = model.predict(new_text, batch_size=32)

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7f9fed581668>>
Traceback (most recent call last):
  File "/home/absin/git/ai/venv/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')


In [47]:
predicts

array([[2.9935241e-03, 2.6679993e-02, 3.1930655e-02, 1.1398077e-02,
        1.6420335e-02, 3.7594557e-02, 8.9221299e-02, 9.7981691e-03,
        1.1691126e-01, 1.3052076e-02, 1.4867631e-01, 6.0796738e-05,
        1.1503479e-01, 2.8995711e-01],
       [3.4394264e-03, 5.5623055e-04, 1.9120693e-02, 4.9242675e-03,
        2.2919476e-03, 3.5048723e-03, 2.4454299e-01, 1.1750087e-01,
        1.6075274e-01, 2.2898614e-03, 2.3797655e-01, 6.7353249e-06,
        1.7250627e-02, 1.2020707e-01],
       [4.9343407e-03, 2.3430586e-04, 2.6564270e-02, 8.7498128e-03,
        4.7906399e-02, 7.4281305e-02, 1.3324383e-01, 7.9410076e-03,
        1.5347946e-01, 1.7086059e-02, 2.3347706e-02, 3.1292439e-05,
        7.0502877e-02, 2.4828486e-01]], dtype=float32)

In [48]:
categories = df_train.label.cat.categories.tolist()
predict_logits = predicts.argmax(axis=1)
predict_labels = [categories[logit] for logit in predict_logits]
predict_labels

['worry', 'hate', 'worry']

In [52]:
threshold = 0.1
for i,sentence in enumerate(new_text):
    predict = predicts[i]
    print(sentence+'--->')
    for j, pred in enumerate(predict):
        if pred>threshold:
            print('\t'+categories[j]+'--->'+str(pred))

['This is not exciting at all--->']
	neutral--->0.11691126
	sadness--->0.1486763
	surprise--->0.11503479
	worry--->0.2899571
["I don't like you at all--->"]
	hate--->0.24454299
	love--->0.11750087
	neutral--->0.16075274
	sadness--->0.23797655
	worry--->0.12020707
['You forgot to mention just how smart the jews are, next they will make a deadly virus to kill ALL people BUT the jews!--->']
	hate--->0.13324383
	neutral--->0.15347946
	worry--->0.24828486


In [54]:
for predict in predicts:
    sum = 0
    for j, pred in enumerate(predict):
        sum += pred
    print(sum)

0.9097289443016052
0.9343648850917816
0.8165875226259232
