<a href="https://colab.research.google.com/gist/absin1/b90f9eba8c0dec5d0e2391253df768ee/copy-of-transfer-learning-semantic-similarity-with-tf-hub-universal-encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Keras + Universal Sentence Encoder = Transfer Learning for text data](https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/) Tutorial
## Universal Sentence Encoder

This notebook illustrates how to access the Universal Sentence Encoder and use it for sentence similarity and sentence classification tasks.

The Universal Sentence Encoder makes getting sentence-level embeddings as easy as it has historically been to look up the embeddings for individual words. The sentence embeddings can then be trivially used to compute sentence-level meaning similarity as well as to enable better performance on downstream classification tasks using less supervised training data.


# Getting Started

This section sets up the environment for access to the Universal Sentence Encoder on TF Hub and provides examples of applying the encoder to words, sentences, and paragraphs.

In [None]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install seaborn

More detailed information about installing Tensorflow can be found at [https://www.tensorflow.org/install/](https://www.tensorflow.org/install/).

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import keras.layers as layers
from keras.models import Model
from keras import backend as K
np.random.seed(10)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
# Import the Universal Sentence Encoder's TF Hub module.
# The module's variables and lookup tables are initialised later, inside a
# tf.Session, via global_variables_initializer() and tables_initializer().
embed = hub.Module(module_url)

In [4]:
# Width of one sentence embedding, read from the module's "default" output
# signature. It is passed as the Lambda layer's output_shape when the model
# is built.
embed_size = embed.get_output_info_dict()['default'].get_shape()[1].value
embed_size

512

In [5]:
import psycopg2
import pandas.io.sql as sqlio
import numpy as np

def get_dataframe_sql(dsn=None):
    """Load the labelled emotion dataset from PostgreSQL and shuffle it.

    Args:
        dsn: libpq connection string, e.g.
            "host=... dbname=... user=... password=...". When omitted, the
            value of the ``EMOTION_DB_DSN`` environment variable is used so
            that credentials are never stored in the notebook.

    Returns:
        DataFrame with the columns ``label`` (categorical dtype) and
        ``text``, with the rows in shuffled order.

    Raises:
        RuntimeError: no ``dsn`` was given and ``EMOTION_DB_DSN`` is unset.
        psycopg2.DatabaseError: connecting to or querying the database failed.
    """
    if dsn is None:
        dsn = os.environ.get("EMOTION_DB_DSN")
    if not dsn:
        raise RuntimeError(
            "No database DSN given: pass dsn=... or set the EMOTION_DB_DSN "
            "environment variable."
        )

    sql = "select emotion as label, text_ as text from dataset_emotion_only"
    con = None
    try:
        con = psycopg2.connect(dsn)
        df = sqlio.read_sql_query(sql, con)
    except psycopg2.DatabaseError as e:
        if con:
            con.rollback()
        print(e)
        # Re-raise instead of exiting the process: killing the kernel from a
        # notebook cell hides the actual database error from the reader.
        raise
    finally:
        if con:
            con.close()

    # No random_state is passed, so the shuffle draws from NumPy's global RNG.
    df = df.sample(frac=1.0)
    df['label'] = df['label'].astype('category')
    return df
  
df = get_dataframe_sql()
# Random boolean mask: each row goes to the training set with probability
# 0.8, so the split is approximately (not exactly) 80/20.
msk = np.random.rand(len(df)) < 0.8
df_train = df[msk]
# The complement of the mask forms the held-out set.
df_test = df[~msk]
df_train.head()

  """)


Unnamed: 0,label,text
26915,worry,yup our coke blades b annnd now i only need th...
37746,happiness,having a cup of tea i have a cold so it's tast...
15211,worry,sucks about your cat... hope you guys feel better
8556,surprise,wow their is no pancake mix
28397,neutral,hey there what's up?


In [6]:
# Number of distinct emotion categories; used as the width of the model's
# output layer.
category_counts = len(df_train.label.cat.categories)
category_counts

14

## Wrap embed module in a Lambda layer
Explicitly cast the input as a string

In [7]:
def UniversalEmbedding(x):
    """Embed a batch of sentences with the Universal Sentence Encoder.

    Args:
        x: tensor of shape (batch, 1) holding one sentence per row, matching
            the ``Input(shape=(1,))`` layer this function is wired to.

    Returns:
        The module's "default" output for the batch of sentences.
    """
    # Drop only the trailing singleton axis. A bare tf.squeeze() would also
    # remove the batch axis when the batch holds a single sentence, handing
    # the module a scalar instead of a 1-D batch of strings.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(sentences, signature="default", as_dict=True)["default"]

In [13]:
# One raw sentence per sample, fed as a string tensor of shape (batch, 1).
input_text = layers.Input(shape=(1,), dtype=tf.string)
# The Lambda layer wraps the TF-Hub module; its output width is declared
# explicitly as embed_size.
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
dense = layers.Dense(256, activation='relu')(embedding)
# One sigmoid unit per emotion category, so every category gets an
# independent score and a row of scores need not sum to 1.
# NOTE(review): the training targets are one-hot; if exactly one emotion per
# sentence is intended, softmax + categorical_crossentropy would be the usual
# pairing -- confirm which behaviour is wanted.
pred = layers.Dense(category_counts, activation='sigmoid')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 14)                3598      
Total params: 134,926
Trainable params: 134,926
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Sentences as an object array with a trailing singleton axis, i.e. shape
# (n_samples, 1), to match the model's Input(shape=(1,)).
train_text = np.array(df_train['text'].tolist(), dtype=object)[:, np.newaxis]

# One-hot targets, one column per emotion category.
train_label = np.asarray(pd.get_dummies(df_train.label), dtype=np.int8)

In [11]:
train_text.shape

(31866, 1)

In [12]:
train_label.shape

(31866, 14)

In [13]:
train_label[:3]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int8)

In [14]:
# Held-out sentences, shaped (n_samples, 1) like the training input.
test_text = np.array(df_test['text'].tolist(), dtype=object)[:, np.newaxis]
# One-hot targets for the held-out rows.
test_label = np.asarray(pd.get_dummies(df_test.label), dtype=np.int8)

## Train Keras model and save weights
This trains and saves only the weights of our Keras layers, not the weights of the embed module.

In [17]:
# Train inside a dedicated session. The session is closed when the with-block
# exits, so a new session has to be created before predicting.
with tf.Session() as session:
    # Make Keras run its ops in this session.
    K.set_session(session)
    # Initialise the variables and the lookup tables before training.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    history = model.fit(train_text, 
            train_label,
            validation_data=(test_text, test_label),
            epochs=1,
            batch_size=32)
    # Save while the session is still open. The path is relative: the file is
    # written one directory above the current working directory.
    model.save_weights('../model.h5')

Train on 31866 samples, validate on 8134 samples
Epoch 1/1
   64/31866 [..............................] - ETA: 6:43 - loss: 0.6923 - acc: 0.5078 

Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fe17ae8eac8>>
Traceback (most recent call last):
  File "/home/andy/.local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')




Exception ignored in: <bound method BaseSession._Callable.__del__ of <tensorflow.python.client.session.BaseSession._Callable object at 0x7fe17b24ac50>>
Traceback (most recent call last):
  File "/home/andy/.local/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1473, in __del__
    self._session._session, self._handle)
tensorflow.python.framework.errors_impl.CancelledError: (None, None, 'Session has been closed.')




In [18]:
!ls -alh | grep model.h5

-rw-r--r-- 1 andy andy 542K Aug  8 12:50 model.h5


## Make predictions

In [22]:
# The training session was closed when its with-block ended, so open a new
# one and keep it alive for the prediction cells that follow.
session = tf.Session()

K.set_session(session)
session.run(tf.global_variables_initializer())
session.run(tf.tables_initializer())
# Load after the initialisers so the trained Dense weights are not
# overwritten. The training cell saved the weights to '../model.h5', so read
# them back from that same location.
model.load_weights('../model.h5')

In [53]:
new_text = ["Ma chud gayi hai","This is not exciting at all", "I don't like you at all",  "You forgot to mention just how smart the jews are, next they will make a deadly virus to kill ALL people BUT the jews!"]
# Reshape the sentences to (n_sentences, 1), the input shape the model expects.
new_text = np.array(new_text, dtype=object)[:, np.newaxis]
# One row of per-category scores for each sentence.
predicts = model.predict(new_text, batch_size=32)

In [54]:
predicts

array([[1.94555521e-03, 2.32979655e-03, 2.25734413e-02, 1.48974359e-02,
        2.17680931e-02, 4.53250110e-02, 1.69878900e-02, 2.62952149e-02,
        3.95895898e-01, 8.30665231e-03, 5.09114563e-02, 1.37567520e-04,
        4.24551368e-02, 2.08949894e-01],
       [7.38626719e-03, 1.60430074e-02, 2.97409892e-02, 1.76334083e-02,
        2.13314891e-02, 6.19596243e-02, 1.04850560e-01, 1.70007944e-02,
        7.22412467e-02, 3.21382284e-02, 1.49075031e-01, 7.92920589e-04,
        9.80421603e-02, 2.89799392e-01],
       [4.46453691e-03, 5.53172827e-03, 2.15744674e-02, 1.10730231e-02,
        7.90318847e-03, 1.44430697e-02, 1.81470037e-01, 1.09797716e-01,
        1.33736938e-01, 8.15975666e-03, 2.01272994e-01, 2.54780054e-04,
        2.93026567e-02, 1.81697428e-01],
       [1.74095929e-02, 1.04933679e-02, 5.26147783e-02, 4.24410999e-02,
        8.89391899e-02, 7.04200864e-02, 9.77967083e-02, 2.25252807e-02,
        2.65968412e-01, 3.93825471e-02, 1.81784034e-02, 1.77690387e-03,
        9.075

In [47]:
# Category names, in the same order as the columns of the score matrix.
categories = df_train.label.cat.categories.tolist()
# Despite the name, these are the column indices of the highest score in
# each row, not logits.
predict_logits = predicts.argmax(axis=1)
# Map each winning index back to its category name.
predict_labels = [categories[logit] for logit in predict_logits]
predict_labels

['neutral', 'worry', 'sadness', 'worry']

In [48]:
# For each sentence, list every category whose score exceeds the threshold.
threshold = 0.1
for i, sentence in enumerate(new_text):
    # Each row of new_text is a length-1 array; take the string itself so the
    # header prints as plain text rather than as a one-element array.
    print(sentence[0] + '--->')
    # The loop variable is named `score` so it does not overwrite `pred`,
    # the model's output tensor defined when the model was built.
    for j, score in enumerate(predicts[i]):
        if score > threshold:
            print('\t' + categories[j] + '--->' + str(score))

['Ma chud gayi hai--->']
	neutral--->0.3958959
	worry--->0.2089499
['This is not exciting at all--->']
	hate--->0.10485056
	sadness--->0.14907503
	worry--->0.2897994
["I don't like you at all--->"]
	hate--->0.18147004
	love--->0.109797716
	neutral--->0.13373694
	sadness--->0.201273
	worry--->0.18169743
['You forgot to mention just how smart the jews are, next they will make a deadly virus to kill ALL people BUT the jews!--->']
	neutral--->0.2659684
	worry--->0.2821408


In [54]:
# Print the total of the per-category scores for each sentence.
# The accumulator is called `total` so the builtin sum() is not shadowed for
# the rest of the kernel session.
for predict in predicts:
    total = 0
    for score in predict:
        total += score
    print(total)

0.9097289443016052
0.9343648850917816
0.8165875226259232


In [23]:
pwd

'/home/andy/git/ai/notebooks'

In [24]:
cd ..

/home/andy/git/ai


In [25]:
ls

AI-Flask.postman_collection.json  [0m[01;34mlib[0m/       [01;34mnotebooks[0m/  [01;34mstatic[0m/  wsgi.py
[01;34mbenchmark[0m/                        model.h5   setup.sh    [01;34mtext[0m/
constants.yaml                    [01;32mngrok.sh[0m*  [01;34mspeech[0m/     web.py


In [46]:
import yaml

In [47]:
# safe_load builds only plain Python objects and needs no Loader argument;
# the with-block closes the file handle once the config has been parsed.
with open('constants.yaml') as config_file:
    data = yaml.safe_load(config_file)

  """Entry point for launching an IPython kernel.


In [48]:
data['emotion_detection']['model_weights_path']

'./model.h5'

In [49]:
# Read the project config relative to the working directory (the repository
# root at this point) instead of a machine-specific absolute path.
# safe_load builds only plain Python objects, and the with-block closes the
# file handle once the config has been parsed.
with open('constants.yaml') as config_file:
    config = yaml.safe_load(config_file)

  """Entry point for launching an IPython kernel.


In [50]:
host=config['emotion']['db_host']

In [57]:
config['emotion']['threshold']

0.1