# _Deep Learning_
* Universal Sentence Embeddings in Action
    1. Load Up Dependencies
    2. Load and View the Dataset
    3. Building Train, Validation, and Test Datasets
    4. Basic Text Wrangling
    5. Build Data Ingestion Functions
    6. Build Deep Learning Model with Universal Sentence Encoder
    7. Model Training
    8. Model Evaluation
* Bonus: Transfer Learning with Different Universal Sentence Embeddingsm

# Universal Sentence Embeddings in Action

## Load Up Dependencies

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd

# check if tensorflow will be using a GPU
tf.test.is_gpu_available()

True

In [6]:
tf.test.gpu_device_name()

'/device:GPU:0'

## Load and View the Dataset

In [7]:
# for google colab - get data url
url = 'https://raw.githubusercontent.com/beliciataylor/LearningCode/master/NLP_Learning/ch10_deep_learning/data/movie_reviews.csv'

# load dataset
dataset = pd.read_csv(url)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
# encode sentiment columns (label encoding)
dataset['sentiment'] = [1 if sentiment == 'positive' else 0 for sentiment in dataset['sentiment'].values]
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Building Train, Validation, and Test Datasets

In [9]:
# could also use train_test_split() from sklearn
# dividing dataset
reviews = dataset['review'].values
sentiments = dataset['sentiment'].values

train_reviews = reviews[:30000]
train_sentiments = sentiments[:30000]

val_reviews = reviews[30000:35000]
val_sentiments = sentiments[30000:35000]

test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

train_reviews.shape, val_reviews.shape, test_reviews.shape

((30000,), (5000,), (15000,))

## Basic Text Wrangling

In [0]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)



def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


def pre_process_document(document):
    
    # strip HTML
    document = strip_html_tags(document)
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [0]:
# preprocess dataset
train_reviews = pre_process_corpus(train_reviews)
val_reviews = pre_process_corpus(val_reviews)
test_reviews = pre_process_corpus(test_reviews)

## Build Data Ingestion Functions

In [0]:
# define functions to build data and feature engineering pipelines
# enable data flowing into our model during training

# training inputon the whole training set with no limit on training epochs
train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn({'sentence': train_reviews}, train_sentiments, 
                                                    batch_size=256, num_epochs=None, shuffle=True)

# prediction on the whole training set
predict_train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn({'sentence': train_reviews}, train_sentiments, shuffle=False)

# prediction on the whole validation set
predict_val_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn({'sentence': val_reviews}, val_sentiments, shuffle=False)

# prediction on the test set
predict_test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn({'sentence': test_reviews}, test_sentiments, shuffle=False)

## Build Deep Learning Model with Universal Sentence Encoder

In [0]:
# define sentence-embedding feature that leverages universal sentence encoder
embedding_feature = hub.text_embedding_column(key='sentence', 
                                              module_spec='https://tfhub.dev/google/universal-sentence-encoder/2', 
                                              trainable=False)

In [30]:
dnn = tf.estimator.DNNClassifier(hidden_units=[512, 128], feature_columns=[embedding_feature], n_classes=2, 
                                 activation_fn=tf.nn.relu, dropout=0.1, optimizer=tf.compat.v1.train.AdagradOptimizer(learning_rate=0.005))

# train for approximately 12 epochs
256*1500 / 30000 == 12.8

True

## Model Training

In [31]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import time

TOTAL_STEPS = 1500
STEP_SIZE = 100
for step in range(0, TOTAL_STEPS + 1, STEP_SIZE):
  print()
  print('-'*100)
  print('Training for step =', step)
  start_time = time.time()
  dnn.train(input_fn=train_input_fn, steps=STEP_SIZE)
  elapsed_time = time.time() - start_time
  print('Train Time (s):', elapsed_time)
  print('Eval Metrics (Train):', dnn.evaluate(input_fn=predict_train_input_fn))
  print('Eval Metrics (Validation):', dnn.evaluate(input_fn=predict_val_input_fn))


----------------------------------------------------------------------------------------------------
Training for step = 0


ValueError: ignored