In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
import tensorflow as tf

In [2]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3050 Laptop GPU (UUID: GPU-568f2418-97a2-eb4a-4350-001120466be8)


# Get Helper Functions

In [3]:
# Download helper functions script
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2022-01-12 16:13:59--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: 'helper_functions.py.1'

     0K ..........                                            100% 1.07M=0.009s

2022-01-12 16:14:00 (1.07 MB/s) - 'helper_functions.py.1' saved [10246/10246]



In [4]:
# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

# Get a Text Dataset

Let's start by downloading a text dataset. We'll be using the Real or Not? dataset from Kaggle, which contains text-based Tweets about natural disasters.

The Real Tweets are actually about disasters, for example:

- Jetstar and Virgin forced to cancel Bali flights again because of ash from Mount Raung volcano

The Not Real Tweets are Tweets not about disasters (they can be on anything), for example:

- 'Education is the most powerful weapon which you can use to change the world.' Nelson #Mandela #quote

In [5]:
# download the dataset from kaggle
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2022-01-12 16:14:00--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 2404:6800:4009:821::2010, 2404:6800:4009:81b::2010, 2404:6800:4009:81a::2010, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4009:821::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: 'nlp_getting_started.zip.1'

     0K .......... .......... .......... .......... ..........  8% 1.34M 0s
    50K .......... .......... .......... .......... .......... 16% 2.55M 0s
   100K .......... .......... .......... .......... .......... 25% 5.32M 0s
   150K .......... .......... .......... .......... .......... 33% 5.74M 0s
   200K .......... .......... .......... .......... .......... 42% 7.37M 0s
   250K .......... .......... .......... .......... .......... 50% 6.71M 0s
   300K .......... .......... .......... .......... .......... 59% 6.55M 0

In [6]:
# Unzip Data
unzip_data("nlp_getting_started.zip")

Unzipping nlp_getting_started.zip gives the following 3 .csv files:

- sample_submission.csv - an example of the file you'd submit to the Kaggle competition of your model's predictions.
- train.csv - training samples of real and not real disaster Tweets.
- test.csv - testing samples of real and not real disaster Tweets.

# Visualizing A Text Dataset

In [7]:
# Load the unzipped training and test CSV files into pandas DataFrames
import pandas as pd

train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [8]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
train_df["text"][1]

'Forest fire near La Ronge Sask. Canada'

In [10]:
# Shuffle the training DataFrame: frac=1 samples every row (i.e. a full reshuffle)
# and the fixed random_state makes the order reproducible across runs.
# The original row index is kept, so the index is no longer sequential after shuffling.
train_df_shuffled = train_df.sample(frac=1,
                                  random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [11]:
# What does test dataframe look like
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
# Class balance of the target column: absolute counts first, then proportions
target_column = train_df["target"]
print(target_column.value_counts())
print('\n')
print(target_column.value_counts(normalize=True))

0    4342
1    3271
Name: target, dtype: int64


0    0.57034
1    0.42966
Name: target, dtype: float64


Since we have two target values, we're dealing with a binary classification problem.

It's fairly balanced too, about 57% negative class (target = 0) and 43% positive class (target = 1).

Where,
- 1 = a real disaster Tweet
- 0 = not a real disaster Tweet

In [13]:
# how many total samples
print(len(train_df))
print(len(test_df))

7613
3263


In [14]:
# Preview 5 consecutive rows of the shuffled training data, starting at a random position
import random

# Upper bound leaves room for a full window of 5 rows
window_start = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[["text", "target"]].iloc[window_start:window_start + 5]

for text, target in sample_rows.itertuples(index=False):
    print(f"Target: {target}", "(real disaster)" if target>0 else "(not real disaster)")
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 (real disaster)
Text:
@OriginalFunko @Spencers THUNDER BUDDYS!!!! THUNDER BUDDYS!!!!

---

Target: 1 (real disaster)
Text:
3 Former Executives to Be Prosecuted in Fukushima Nuclear Disaster http://t.co/JSsmMLNaQ7

---

Target: 1 (real disaster)
Text:
The 08/06/2015 AlabamaQuake seismic summary w/ #earthquake #news &amp; history http://t.co/zM6VcZqvWk http://t.co/DKNlZNom6n

---

Target: 0 (not real disaster)
Text:
that exploded &amp; brought about the
beginning of universe matches what's
mentioned in the versethe heaven and Earth
(thus the universe)

---

Target: 1 (real disaster)
Text:
FedEx no longer to transport bioterror germs in wake of anthrax lab mishaps http://t.co/MqbYrAvK6h

---



# Split the Dataset into Training and Validation Datasets

To split our training dataset and create a validation dataset, we'll use Scikit-Learn's **train_test_split()** method and dedicate 10% of the training samples to the validation set.

In [15]:
from sklearn.model_selection import train_test_split

# Pull the Tweet texts and their labels out of the shuffled frame as NumPy arrays
tweet_texts = train_df_shuffled["text"].to_numpy()
tweet_targets = train_df_shuffled["target"].to_numpy()

# Hold out 10% of the training samples as a validation set (fixed seed for reproducibility)
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)

In [16]:
# Check the lenghts 
print(len(train_sentences))
print(len(train_labels))
print(len(val_sentences))
print(len(val_labels))

6851
6851
762
762


In [17]:
# check the first 10 exaples
print(train_sentences[:10])
print('\n')
print(train_labels[:10])

['@mogacola @zamtriossu i screamed after hitting tweet'
 'Imagine getting flattened by Kurt Zouma'
 '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....'
 "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet"
 'Somehow find you and I collide http://t.co/Ee8RpOahPk'
 '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao'
 'destroy the free fandom honestly'
 'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE'
 '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.'
 'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt']


[0 0 1 0 0 1 1 0 1 1]


# NLP Data Pre-processing

### Converting Text into Numbers

In NLP, there are two main concepts for turning text into numbers:

- **Tokenization** - A straight mapping from word or character or sub-word to a numerical value. There are three main levels of tokenization:
  1. Using **word-level tokenization** with the sentence "I love TensorFlow" might result in "I" being 0, "love" being 1 and "TensorFlow" being 2. In this case, every word in a sequence is considered a single **token**.
  2. **Character-level tokenization**, such as converting the letters A-Z to values 1-26. In this case, every character in a sequence is considered a single **token**.
  3. **Sub-word tokenization** is in between word-level and character-level tokenization. It involves breaking individual words into smaller parts and then converting those smaller parts into numbers. For example, "my favourite food is pineapple pizza" might become "my, fav, avour, rite, fo, oo, od, is, pin, ine, app, le, piz, za". After doing this, these sub-words would then be mapped to a numerical value. In this case, every word could be considered multiple **tokens**.


- **Embeddings** - An embedding is a representation of natural language which can be learned. Representation comes in the form of a **feature vector**. For example, the word "dance" could be represented by the 5-dimensional vector [-0.8547, 0.4559, -0.3332, 0.9877, 0.1112]. It's important to note here, the size of the feature vector is tuneable. There are two ways to use embeddings:
  
  1. **Create your own embedding** - Once your text has been turned into numbers (required for an embedding), you can put them through an embedding layer (such as tf.keras.layers.Embedding) and an embedding representation will be learned during model training.

  2. **Reuse a pre-learned embedding** - Many pre-trained embeddings exist online. These pre-trained embeddings have often been learned on large corpuses of text (such as all of Wikipedia) and thus have a good underlying representation of natural language. You can use a pre-trained embedding to initialize your model and fine-tune it to your own specific task.

### Text vectorization (tokenization)

Enough talking about tokenization and embeddings, let's create some.

We'll practice tokenization (mapping our words to numbers) first.

To tokenize our words, we'll use the helpful preprocessing layer tf.keras.layers.experimental.preprocessing.TextVectorization.

The TextVectorization layer takes the following parameters:

- max_tokens - The maximum number of words in your vocabulary (e.g. 20000 or the number of unique words in your text), includes a value for OOV (out of vocabulary) tokens.
- standardize - Method for standardizing text. Default is "lower_and_strip_punctuation" which lowers text and removes all punctuation marks.
- split - How to split text, default is "whitespace" which splits on spaces.
- ngrams - How many words to contain per token split, for example, ngrams=2 splits tokens into continuous sequences of 2.
- output_mode - How to output tokens, can be "int" (integer mapping), "binary" (one-hot encoding), "count" or "tf-idf". See documentation for more.
- output_sequence_length - Length of tokenized sequence to output. For example, if output_sequence_length=150, all tokenized sequences will be 150 tokens long.
- pad_to_max_tokens - Defaults to False, if True, the output feature axis will be padded to max_tokens even if the number of unique tokens in the vocabulary is less than max_tokens. Only valid in certain modes, see docs for more.


In [18]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 

# use the default TextVectorization parameters
text_vectorization = TextVectorization(max_tokens = None, # how many words in the vocabulary (automatically add <OOV>)
                                       standardize = "lower_and_strip_punctuation",
                                       split="whitespace",
                                       ngrams= None, # create groups of n-words
                                       output_mode = "int", # how to map tokens to numbers
                                       output_sequence_length = None, # how long do you want your sequences to be
                                       # pad_to_max_tokens = True #not valid if using max_tokens = None  
                                      )



We've initialized a TextVectorization object with the default settings but let's customize it a little bit for our own use case.

In particular, let's set values for max_tokens and output_sequence_length.

For max_tokens (the number of words in the vocabulary), multiples of 10,000 (10,000, 20,000, 30,000) or the exact number of unique words in your text (e.g. 32,179) are common values.

For our use case, we'll use 10,000.

And for the output_sequence_length we'll use the average number of tokens per Tweet in the training set. But first, we'll need to find it.


In [19]:
# Average number of whitespace-separated tokens per training Tweet, rounded to the nearest integer
token_counts = [len(sentence.split()) for sentence in train_sentences]
round(sum(token_counts) / len(train_sentences))

15

In [20]:
train_sentences[0].split()

['@mogacola', '@zamtriossu', 'i', 'screamed', 'after', 'hitting', 'tweet']

In [21]:
# set text vectorization variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length of our sequences will be (e.g. how many words from a tweet does a model see)


# reinstanciate the textvectorization object
text_vectorizer = TextVectorization(max_tokens = max_vocab_length,
                                   output_mode = "int",
                                   output_sequence_length = max_length)



Beautiful!

To map our TextVectorization instance text_vectorizer to our data, we can call the adapt() method on it whilst passing it our training text.


In [22]:
# Fit the text vectorizer to training data text
text_vectorizer.adapt(train_sentences)

Training data mapped! Let's try our text_vectorizer on a custom sentence (one similar to what you might see in the training data).

In [23]:
# create a sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

Wonderful, it seems we've got a way to turn our text into numbers (in this case, word-level tokenization). Notice the 0's at the end of the returned tensor, this is because we set output_sequence_length=15, meaning no matter the size of the sequence we pass to text_vectorizer, it always returns a sequence with a length of 15.

How about we try our text_vectorizer on a few random sentences?

In [24]:
import random

# choose a random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)

print(f"Original text:\n {random_sentence}\
\n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
 If you're lost and alone or you're sinking like a stone carry onnnn

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  47,  172,  681,    7,  910,   53,  172,  253,   25,    3, 1756,
        2163,    1,    0,    0]], dtype=int64)>

Finally, we can check the unique tokens in our vocabulary using the get_vocabulary() method.

In [25]:
# Retrieve the vocabulary learned by the adapted text vectorizer; the list is ordered
# from most to least frequent, so its two ends give the most/least common tokens
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]

vocab_size = len(words_in_vocab)
print(f"Number of words in Vocab: {vocab_size}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in Vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [26]:
# [UNK] stands for unknown. it represent the OOV vocabs in our dataset

# Creating an Embedding using an Embedding Layer

We've got a way to map our text to numbers. How about we go a step further and turn those numbers into an embedding?

The powerful thing about an embedding is it can be learned during training. This means rather than just being static (e.g. 1 = I, 2 = love, 3 = TensorFlow), a word's numeric representation can be improved as a model goes through data samples.

We can see what an embedding of a word looks like by using the tf.keras.layers.Embedding layer.

The main parameters we're concerned about here are:

- input_dim - The size of the vocabulary (e.g. len(text_vectorizer.get_vocabulary())).
- output_dim - The size of the output embedding vector, for example, a value of 100 outputs a feature vector of size 100 for each word.
- embeddings_initializer - How to initialize the embeddings matrix, default is "uniform" which randomly initializes the embedding matrix with a uniform distribution. This can be changed for using pre-learned embeddings.
- input_length - Length of sequences being passed to embedding layer.

Now, let's create an embedding layer.

In [27]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length, # set input shape
                             output_dim = 128, # set size of embedding vector
                             embeddings_initializer = "uniform", # default, initialize randomly
                             input_length = max_length, # how long is each input
                             name = "embedding_1")

embedding

<keras.layers.embeddings.Embedding at 0x140236b6308>

Excellent, notice how **embedding** is a TensorFlow layer? This is important because we can use it as part of a model, meaning its parameters (word representations) can be updated and improved as the model learns.

How about we try it out on a sample sentence?

**NOTE**: We can't pass a sentence directly to the embedding layer. The input must be in numerical form, which is why the sentence is passed through text_vectorizer first.

In [28]:
# get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(f"Original text: \n {random_sentence}\
        \n Embedded version:")

# Embed the random sentence (turns an index of numbers into a dense vector of fixed size)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text: 
 A tin of Tesco dog food 'exploded' and prompted THIS complaint - via @chelsea_dogs #pets #dogs #animals #puppy http://t.co/QzvKPaHsQ7        
 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04284013, -0.01489798, -0.0159496 , ..., -0.01166106,
          0.03061062,  0.01972148],
        [ 0.04461361,  0.02408149,  0.03878819, ..., -0.02968085,
          0.00867941, -0.03825009],
        [ 0.00257028, -0.04243337, -0.02343434, ..., -0.00133356,
          0.04663134,  0.00035417],
        ...,
        [ 0.03977952, -0.03782602, -0.03646283, ...,  0.00236253,
          0.03332629,  0.02803668],
        [-0.01694556, -0.04553232, -0.01517646, ...,  0.02216286,
         -0.03468401, -0.00575745],
        [-0.02813115,  0.00625932, -0.04300672, ...,  0.02858392,
          0.02696348, -0.04848478]]], dtype=float32)>

**NOTE**: We can see the shape of the above embedded output is (1, 15, 128). The '1' represents the 1 sentence passed to the layer. The '15' represents the output_sequence_length of 15 that we defined earlier (our sentence is truncated or padded to 15 tokens). The '128' represents the embedding for each of the 15 tokens, which is 128 in length (i.e. each token gets converted into a vector of length 128).

In [29]:
# Check Out a Single token's embedding
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.04284013, -0.01489798, -0.0159496 , -0.0226305 ,  0.04298959,
        -0.04682324, -0.0026353 ,  0.01123267, -0.03430966, -0.00190909,
         0.02867594,  0.0297017 ,  0.02498296,  0.00814937,  0.04493314,
         0.04413916, -0.00577633,  0.03141482,  0.00966071, -0.04037346,
         0.03765199, -0.01732815, -0.02747819, -0.02993454, -0.02981216,
         0.0308927 , -0.02260027, -0.00124929,  0.01732543, -0.02180376,
        -0.03130232, -0.04009864,  0.03664006, -0.01028627, -0.03222132,
         0.00378202, -0.02535181, -0.00505129,  0.02522682, -0.01333591,
         0.0391151 , -0.00091956,  0.02860123, -0.04375963,  0.01296742,
         0.0263852 , -0.04896233, -0.04747603,  0.04653648,  0.01485529,
        -0.04613405,  0.00209745, -0.00271541,  0.03082445,  0.04200928,
        -0.04887832, -0.04972835, -0.0254328 ,  0.03892423, -0.02046248,
        -0.0439718 , -0.0345499 , -0.0287706 ,  0.03040506,  0.03975679,
  

# Modelling a Text Dataset

Once you've got your inputs and outputs prepared, it's a matter of figuring out which machine learning model to build in between them to bridge the gap.

Now that we've got a way to turn our text data into numbers, we can start to build machine learning models to model it.

To get plenty of practice, we're going to build a series of different models, each as its own experiment. We'll then compare the results of each model and see which one performed best.

More specifically, we'll be building the following:

   - Model 0: Naive Bayes (baseline)
   - Model 1: Feed-forward neural network (dense model)
   - Model 2: LSTM model (RNN)
   - Model 3: GRU model (RNN)
   - Model 4: Bidirectional-LSTM model (RNN)
   - Model 5: 1D Convolutional Neural Network (CNN)
   - Model 6: TensorFlow Hub Pretrained Feature Extractor (transfer learning for NLP)
   - Model 7: Same as model 6 with 10% of training data

Model 0 is the simplest model, used to acquire a baseline which we'll expect each of the other deeper models to beat.

Each experiment will go through the following steps:

   - Construct the model
   - Train the model
   - Make predictions with the model
   - Track prediction evaluation metrics for later comparison

# 1. Model 0 : Naive Bayes Baseline Model (with TF_IDF)

To create our baseline, we'll create a Scikit-Learn Pipeline using the TF-IDF (term frequency-inverse document frequency) formula to convert our words to numbers and then model them with the Multinomial Naive Bayes algorithm.

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF turns the raw text into numeric features,
# then Multinomial Naive Bayes classifies them
tfidf_step = ("tfidf", TfidfVectorizer())
classifier_step = ("clf", MultinomialNB())
model_0 = Pipeline(steps=[tfidf_step, classifier_step])

# Train the whole pipeline on the raw training sentences and their labels
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [31]:
# Evaluate our baseline Model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [32]:
# Make predictions using our model
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:10]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int64)

#### Creating an evaluation Function for our Evaluation Metrics (accuracy, precision, recall, F1-score)

In [33]:
def calculate_results(y_true, y_pred):
    """Compute accuracy, precision, recall and F1-score for a set of predictions.

    Args:
        y_true: true labels in the form of a 1D array.
        y_pred: predicted labels in the form of a 1D array.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Accuracy is scaled to a percentage (multiplied by 100); precision,
        recall and f1 are the "weighted"-average values as returned by
        scikit-learn and are not rescaled.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    # The fourth returned element (support) is not needed here
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    return {
        "accuracy": accuracy_score(y_true, y_pred) * 100,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [34]:
# get the evaluation metrics for baseline model
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [35]:
# Create a function to Compare 2 model performances


# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
    """Print every baseline metric next to the new model's value and their difference.

    Args:
        baseline_results: dict mapping metric name -> numeric baseline value.
        new_model_results: dict holding a numeric value for every key found in
            ``baseline_results`` (a missing key raises KeyError).

    The difference is reported as new value minus baseline value, so a negative
    number means the new model scored lower than the baseline on that metric.
    """
    for metric_name in baseline_results:
        baseline_value = baseline_results[metric_name]
        new_value = new_model_results[metric_name]
        difference = new_value - baseline_value
        print(f"Baseline {metric_name}: {baseline_value:.2f}, New {metric_name}: {new_value:.2f}, Difference: {difference:.2f}")

# 2. Model 1: Feed Forward Neural Network (Simple Dense Model)

The first "deep" model we're going to build is a single layer dense model. In fact, it's barely going to have a single layer.

It'll take our text and labels as input, tokenize the text, create an embedding, find the average of the embedding (using Global Average Pooling) and then pass the average through a fully connected layer with one output unit and a sigmoid activation function.

If the previous sentence sounds like a mouthful, it'll make sense when we code it out (remember, if in doubt, code it out).

And since we're going to be building a number of TensorFlow deep learning models, we'll import our create_tensorboard_callback() function from helper_functions.py to keep track of the results of each.

In [36]:
# Import the helper that builds a TensorBoard callback. This cell only imports it;
# a new callback is created for each model when that model is fitted.
from helper_functions import create_tensorboard_callback

# Directory under which the TensorBoard logs of the experiments are saved
SAVE_DIR = "model_log"

In [37]:
# Build a simple dense model with the Keras Functional API:
# raw string -> token ids -> embeddings -> average over tokens -> sigmoid probability

from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype=tf.string) # each sample is a single string (one Tweet)
x = text_vectorizer(inputs) # turn the input text into integer token ids, shape (None, 15)
x = embedding(x) # look up an embedding vector for every token id, shape (None, 15, 128)
x = layers.GlobalAveragePooling1D()(x) # average over the token axis to get one vector per sample, shape (None, 128)
outputs = layers.Dense(1, activation="sigmoid")(x) # single sigmoid unit because this is binary classification
model_1 = tf.keras.Model(inputs, outputs, name = "model_1_dense") # tie the inputs and outputs together into a model

Looking good. Our model takes a 1-dimensional string as input (in our case, a Tweet), it then tokenizes the string using text_vectorizer and creates an embedding using embedding.

We then (optionally) pool the outputs of the embedding layer to reduce the dimensionality of the tensor we pass to the output layer

Finally, we pass the output of the pooling layer to a dense layer with sigmoid activation (we use sigmoid since our problem is binary classification).

Before we can fit our model to the data, we've got to compile it. Since we're working with binary classification, we'll use "binary_crossentropy" as our loss function and the Adam optimizer.



Most of the trainable parameters are contained within the embedding layer. Recall we created an embedding of size 128 (output_dim=128) for a vocabulary of size 10,000 (input_dim=10000), hence the 1,280,000 trainable parameters.

Alright, our model is compiled, let's fit it to our training data for 5 epochs. We'll also pass our TensorBoard callback function to make sure our model's training metrics are logged.


In [38]:
# get the model summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [39]:
# compile the model
model_1.compile(loss = "binary_crossentropy",
               optimizer = tf.keras.optimizers.Adam(),
               metrics = ["accuracy"])

In [40]:
# Fit the model for 5 epochs, validating on the held-out validation split and logging
# the training metrics to TensorBoard.
# The History object returned by fit() is kept in its own variable rather than being
# assigned onto an attribute of the model.
# NOTE(review): Keras is expected to still expose the same object as `model_1.history`
# after fit(), so code reading that attribute should keep working — confirm for your version.
model_1_history = model_1.fit(train_sentences, # training Tweets as a NumPy array of strings
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(dir_name = SAVE_DIR,
                                                                       experiment_name = "model_1_dense")])

Saving TensorBoard log files to: model_log/model_1_dense/20220112-161406
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
# check the results
model_1.evaluate(val_sentences, val_labels)



[0.4766846001148224, 0.787401556968689]

In [42]:
# make some predictions and evaluate those
model_1_pred_probs = model_1.predict(val_sentences)

print(model_1_pred_probs.shape)

(762, 1)


In [43]:
# look at the first 10 predictions
model_1_pred_probs[:10]

array([[0.40488204],
       [0.7443312 ],
       [0.997895  ],
       [0.10889999],
       [0.11143529],
       [0.93556094],
       [0.9134595 ],
       [0.9925345 ],
       [0.97156817],
       [0.2657034 ]], dtype=float32)

In [44]:
# convert model prediction probabilities to label format
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds[:20]

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [45]:
# evaluate the model_1 performance metrics
model_1_results = calculate_results(y_true = val_labels,
                                   y_pred= model_1_preds)

model_1_results

{'accuracy': 78.74015748031496,
 'precision': 0.7914920592553047,
 'recall': 0.7874015748031497,
 'f1': 0.7846966492209201}

In [46]:
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [47]:
compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

Baseline accuracy: 79.27, New accuracy: 78.74, Difference: -0.52
Baseline precision: 0.81, New precision: 0.79, Difference: -0.02
Baseline recall: 0.79, New recall: 0.79, Difference: -0.01
Baseline f1: 0.79, New f1: 0.78, Difference: -0.00


In [48]:
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

We can see that our dense model is actually performing slightly worse than our baseline Naive Bayes model on every metric.

### Visualizing the Model's Learned Embeddings with TensorFlow's Projector Tool

In [49]:
# Get the vocabulary from the text vectorization layer
words_in_vocab = text_vectorizer.get_vocabulary()

# Show the vocabulary size and a short preview instead of printing every token,
# which would flood the notebook output with the whole vocabulary.
print(len(words_in_vocab))
print("\n")
print(words_in_vocab[:10])



10000


['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is']


In [50]:
# model summary
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

Next, get the weight matrix of our embedding layer.
These are the numerical representations of each token in our vocabulary, which have been learned over 5 epochs of training.

In [51]:
embedding.weights

[<tf.Variable 'embedding_1/embeddings:0' shape=(10000, 128) dtype=float32, numpy=
 array([[ 0.00073164,  0.01504801, -0.03425453, ..., -0.04403543,
         -0.01042281,  0.01876438],
        [ 0.04135863, -0.03945085, -0.03811941, ...,  0.00464735,
          0.03163552,  0.02928301],
        [ 0.00684031,  0.05363135, -0.00241554, ..., -0.07082177,
         -0.04750703,  0.01448254],
        ...,
        [-0.03301444, -0.0052493 , -0.04209725, ...,  0.02028764,
          0.00308807,  0.02215792],
        [ 0.00692343,  0.05942352, -0.01975194, ..., -0.06199061,
         -0.01018393,  0.03510419],
        [-0.0372346 ,  0.06267187, -0.07451147, ..., -0.02367218,
         -0.08643329,  0.01742155]], dtype=float32)>]

In [52]:
# check the shapes of the first embedding

embed_weights = model_1.get_layer("embedding_1").get_weights()[0]
embed_weights.shape # same size as the vocab size and embedding dim(output_dim of our embedding layer)

(10000, 128)

The (10000, 128) above means that every token above is represented by a vector of 128 dims.

In [53]:
# Write the learned embedding vectors (vectors.tsv) and the matching vocabulary words
# (metadata.tsv), one token per line, so both files line up row-for-row
# (adapted from the TensorFlow word embedding documentation).
import io

# `with` guarantees both files are flushed and closed even if a write fails part-way
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(words_in_vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = embed_weights[index]
        out_v.write('\t'.join(str(x) for x in vec) + "\n")
        out_m.write(word + "\n")

Once you've downloaded the embedding vectors and metadata, you can visualize them using the Embedding Projector tool:
- Go to http://projector.tensorflow.org/
- Click on "Load data"
- Upload the two files you downloaded (vectors.tsv and metadata.tsv)
- Explore
- Optional: You can share the data you've created by clicking "Publish"

The embeddings we downloaded are how our model interprets words, not necessarily how we interpret them.

Also, since the embedding has been learned purely from Tweets, it may contain some strange values as Tweets are a very unique style of natural language.

# Model 2 : LSTM- Long Short Term Memory (RNN)

We're going to start with an LSTM-powered RNN.

To harness the power of the LSTM cell (LSTM cell and LSTM layer are often used interchangeably) in TensorFlow, we'll use tensorflow.keras.layers.LSTM().

Our model is going to take on a very similar structure to model_1:

Input (text) -> Tokenize -> Embedding -> Layers -> Output (label probability)

The main difference will be that we're going to add an LSTM layer between our embedding and output.

And to make sure we're not reusing trained embeddings (this would involve data leakage between models, leading to an uneven comparison later on), we'll create another embedding layer (model_2_embedding) for our model. The text_vectorizer layer can be reused since it doesn't get updated during training.

🔑 Note: The reason we use a new embedding layer for each model is since the embedding layer is a learned representation of words (as numbers), if we were to use the same embedding layer (embedding_1) for each model, we'd be mixing what one model learned with the next. And because we want to compare our models later on, starting them with their own embedding layer each time is a better idea.

In [54]:
# Fresh (untrained) embedding layer for model_2, seeded for reproducibility
from tensorflow.keras import layers

tf.random.set_seed(42)
model_2_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_2",
)

In [55]:
# Build the LSTM model: raw string -> token ids -> embeddings -> LSTM -> sigmoid probability
inputs = layers.Input(shape=(1,), dtype="string")
token_ids = text_vectorizer(inputs)
token_embeddings = model_2_embedding(token_ids)

# Single recurrent layer. When stacking recurrent layers, every one except the last
# is created with return_sequences=True so it passes the full sequence onwards.
# An extra Dense(64, activation="relu") layer could optionally follow the LSTM.
lstm_features = layers.LSTM(64)(token_embeddings)

outputs = layers.Dense(1, activation="sigmoid")(lstm_features)
model_2 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_2_LSTM")

In [56]:
# Print the layer-by-layer summary of model_2 (output shapes and parameter counts)
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [57]:
# Configure training for model_2: binary cross-entropy loss, Adam with default settings, accuracy metric
model_2.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [58]:
# Fit the model for 5 epochs, validating on the validation split and logging to TensorBoard.
# Keep the returned History object (as the other models' fit cells do) so the training
# curves stay available after this cell has run.
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs = 5,
                              validation_data = (val_sentences, val_labels),
                              callbacks = [create_tensorboard_callback(SAVE_DIR,
                                                                       experiment_name = "model_2_LSTM")])

Saving TensorBoard log files to: model_log/model_2_LSTM/20220112-161418
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14032aca2c8>

In [59]:
# Make predictions with the LSTM model on the validation sentences and show the first 10
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

array([[0.00713337],
       [0.7874562 ],
       [0.9996377 ],
       [0.05690471],
       [0.00258272],
       [0.9996238 ],
       [0.9216488 ],
       [0.9997993 ],
       [0.9994956 ],
       [0.66466784]], dtype=float32)

In [60]:
# Convert model_2 prediction probabilities to labels: round to 0/1, then squeeze out the size-1 dimension
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [61]:
# Calculate model_2 performance metrics on the validation labels
model_2_results = calculate_results(y_true = val_labels,
                                   y_pred = model_2_preds)

model_2_results

{'accuracy': 75.06561679790026,
 'precision': 0.7510077975908164,
 'recall': 0.7506561679790026,
 'f1': 0.7489268622514025}

In [62]:
# Compare the LSTM model's results with the baseline results, metric by metric
compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_2_results)

Baseline accuracy: 79.27, New accuracy: 75.07, Difference: -4.20
Baseline precision: 0.81, New precision: 0.75, Difference: -0.06
Baseline recall: 0.79, New recall: 0.75, Difference: -0.04
Baseline f1: 0.79, New f1: 0.75, Difference: -0.04


# Model 3: GRU (RNN)

Another popular and effective RNN component is the GRU or gated recurrent unit.

The GRU cell has similar features to an LSTM cell but has fewer parameters.

To use the GRU cell in TensorFlow, we can call the tensorflow.keras.layers.GRU() class.

The architecture of the GRU-powered model will follow the same structure we've been using:

Input (text) -> Tokenize -> Embedding -> Layers -> Output (label probability)

Again, the only difference will be the layer(s) we use between the embedding and the output.

In [63]:
# Fresh (untrained) embedding layer for model_3, seeded for reproducibility
from tensorflow.keras import layers

tf.random.set_seed(42)
model_3_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_3",
)

In [64]:
# Build the GRU model: raw string -> token ids -> embeddings -> GRU -> sigmoid probability
inputs = layers.Input(shape=(1,), dtype="string")
token_ids = text_vectorizer(inputs)
token_embeddings = model_3_embedding(token_ids)

# Single recurrent layer. Recurrent layers (GRU and/or LSTM) can be stacked by creating
# every one except the last with return_sequences=True; a Dense(64, activation="relu")
# layer could optionally follow them.
gru_features = layers.GRU(64)(token_embeddings)

outputs = layers.Dense(1, activation="sigmoid")(gru_features)
model_3 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_3_GRU")

In [65]:
# Configure training for model_3: binary cross-entropy loss, Adam with default settings, accuracy metric
model_3.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [66]:
# Print the layer-by-layer summary of the GRU model (output shapes and parameter counts)
model_3.summary()

Model: "model_3_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________



Notice the difference in number of trainable parameters between model_2 (LSTM) and model_3 (GRU). The difference comes from the LSTM cell having more trainable parameters than the GRU cell.

We'll fit our model just as we've been doing previously. We'll also track our models results using our create_tensorboard_callback() function.


In [67]:
# Train model_3 for 5 epochs, validating on the validation split and logging to TensorBoard
model_3_tensorboard = create_tensorboard_callback(SAVE_DIR, experiment_name="model_3_GRU")
model_3_history = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[model_3_tensorboard],
)

Saving TensorBoard log files to: model_log/model_3_GRU/20220112-161435
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [68]:
# Make predictions on the validation data with the GRU model
model_3_pred_probs = model_3.predict(val_sentences)
# Print the shape of the prediction array, then show the first 10 probabilities
print(model_3_pred_probs.shape)
model_3_pred_probs[:10]

(762, 1)


array([[0.3333033 ],
       [0.8774384 ],
       [0.9980247 ],
       [0.11559197],
       [0.01236264],
       [0.9925619 ],
       [0.6213533 ],
       [0.9981324 ],
       [0.998237  ],
       [0.5017956 ]], dtype=float32)

In [69]:
# Convert model_3 prediction probabilities to classes: round to 0/1, then squeeze out the size-1 dimension
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [70]:
# Calculate model_3 performance metrics on the validation labels
model_3_results = calculate_results(y_true = val_labels,
                                   y_pred = model_3_preds)

model_3_results

{'accuracy': 76.77165354330708,
 'precision': 0.7675450859410361,
 'recall': 0.7677165354330708,
 'f1': 0.7667932666650168}

In [71]:
# Compare the GRU model's results with the baseline results, metric by metric
compare_baseline_to_new_results(
    baseline_results=baseline_results,
    new_model_results=model_3_results,
)

Baseline accuracy: 79.27, New accuracy: 76.77, Difference: -2.49
Baseline precision: 0.81, New precision: 0.77, Difference: -0.04
Baseline recall: 0.79, New recall: 0.77, Difference: -0.02
Baseline f1: 0.79, New f1: 0.77, Difference: -0.02


# Model 4 : Bidirectional RNN

Look at us go! We've already built two RNN's with GRU and LSTM cells. Now we're going to look into another kind of RNN, the bidirectional RNN.

A standard RNN will process a sequence from left to right, whereas a bidirectional RNN will process the sequence from left to right and then again from right to left.

Intuitively, this can be thought of as if you were reading a sentence for the first time in the normal fashion (left to right) but for some reason it didn't make sense so you traverse back through the words and go back over them again (right to left).

In practice, many sequence models often see an improvement in performance when using bidirectional RNNs.

However, this improvement in performance often comes at the cost of longer training times and increased model parameters (since the model goes left to right and right to left, the number of trainable parameters doubles).

Okay enough talk, let's build a bidirectional RNN.

Once again, TensorFlow helps us out by providing the tensorflow.keras.layers.Bidirectional class. We can use the Bidirectional class to wrap our existing RNNs, instantly making them bidirectional.

In [72]:
# Fresh (untrained) embedding layer for model_4, seeded for reproducibility
from tensorflow.keras import layers

tf.random.set_seed(42)
model_4_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_4",
)

In [73]:
# Build the bidirectional model: raw string -> token ids -> embeddings -> Bidirectional(LSTM) -> sigmoid
inputs = layers.Input(shape=(1,), dtype="string")
token_ids = text_vectorizer(inputs)
token_embeddings = model_4_embedding(token_ids)

# Wrap the LSTM in Bidirectional. When stacking RNN layers on top of each other,
# the wrapped LSTM is created with return_sequences=True instead.
bidirectional_features = layers.Bidirectional(layers.LSTM(64))(token_embeddings)

outputs = layers.Dense(1, activation="sigmoid")(bidirectional_features)
model_4 = tf.keras.Model(inputs=inputs, outputs=outputs, name="Model_4_Bidirectional")

In [74]:
# Print the layer-by-layer summary of model_4 (output shapes and parameter counts)
model_4.summary()

Model: "Model_4_Bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,3

In [75]:
# Configure training for model_4: binary cross-entropy loss, Adam with default settings, accuracy metric
model_4.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [76]:
# Train model_4 for 5 epochs, validating on the validation split and logging to TensorBoard
model_4_tensorboard = create_tensorboard_callback(SAVE_DIR, experiment_name="model_4_bidirectional_RNN")
model_4_history = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[model_4_tensorboard],
)

Saving TensorBoard log files to: model_log/model_4_bidirectional_RNN/20220112-161451
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [77]:
# Make predictions with the bidirectional model on the validation sentences and show the first 10
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]

array([[0.03995152],
       [0.82796067],
       [0.99842024],
       [0.13519038],
       [0.00310786],
       [0.9921989 ],
       [0.95532155],
       [0.9994561 ],
       [0.9989818 ],
       [0.28122008]], dtype=float32)

In [78]:
# Convert model_4 prediction probabilities to labels: round to 0/1, then squeeze out the size-1 dimension
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [79]:
# Calculate the bidirectional model's performance metrics on the validation labels
model_4_results = calculate_results(y_true = val_labels,
                                   y_pred = model_4_preds)
model_4_results

{'accuracy': 76.64041994750657,
 'precision': 0.7665895370389821,
 'recall': 0.7664041994750657,
 'f1': 0.7651213533864446}

In [80]:
# Compare the bidirectional model's results with the baseline results, metric by metric
compare_baseline_to_new_results(
    baseline_results=baseline_results,
    new_model_results=model_4_results,
)

Baseline accuracy: 79.27, New accuracy: 76.64, Difference: -2.62
Baseline precision: 0.81, New precision: 0.77, Difference: -0.04
Baseline recall: 0.79, New recall: 0.77, Difference: -0.03
Baseline f1: 0.79, New f1: 0.77, Difference: -0.02


# Model 5 : Conv1D for Text Classification

You might've used convolutional neural networks (CNNs) for images before but they can also be used for sequences.

The main difference between using CNNs for images and sequences is the shape of the data. Images come in 2-dimensions (height x width) whereas sequences are often 1-dimensional (a string of text).

So to use CNNs with sequences, we use a 1-dimensional convolution instead of a 2-dimensional convolution.

A typical CNN architecture for sequences will look like the following:

Inputs (text) -> Tokenization -> Embedding -> Layers -> Outputs (class probabilities)

You might be thinking "that just looks like the architecture layout we've been using for the other models..."

And you'd be right.

The difference again is in the layers component. Instead of using an LSTM or GRU cell, we're going to use a tensorflow.keras.layers.Conv1D() layer followed by a tensorflow.keras.layers.GlobalMaxPool1D() layer.

    📖 Resource: The intuition here is explained succinctly in the paper Understanding Convolutional Neural Networks for Text Classification, where they state that CNNs classify text through the following steps:

- 1-dimensional convolving filters are used as ngram detectors, each filter specializing in a closely-related family of ngrams (an ngram is a collection of n-words, for example, an ngram of 5 might result in "hello, my name is Daniel").
- Max-pooling over time extracts the relevant ngrams for making a decision.
- The rest of the network classifies the text based on this information.

In [81]:
# Walk one test sentence through embedding -> Conv1D -> global max pooling and print each shape
from tensorflow.keras import layers

# Tokenize the sentence, then turn the token ids into embedding vectors
test_tokens = text_vectorizer(["this is a test sentence"])
embedding_test = embedding(test_tokens)

# kernel_size=5 is also referred to as an ngram of 5 (the filter looks at 5 words at a time).
# padding="same" keeps the output the same length as the input; the default "valid"
# produces an output smaller than the input.
conv_1d = layers.Conv1D(filters=32, kernel_size=5, activation="relu", padding="same")
conv_1d_output = conv_1d(embedding_test)

# Global max pooling keeps, for each filter, the feature with the highest value
max_pool = layers.GlobalMaxPool1D()
max_pool_output = max_pool(conv_1d_output)

for layer_output in (embedding_test, conv_1d_output, max_pool_output):
    print(layer_output.shape)

(1, 15, 128)
(1, 15, 32)
(1, 32)




Notice the output shapes of each layer.

The embedding has an output shape dimension of the parameters we set it to (input_length=15 and output_dim=128).

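The 1-dimensional convolutional layer has an output which has been compressed in line with its parameters (the last dimension shrinks from 128 to 32, the number of filters, while padding="same" keeps the sequence length at 15). And the same goes for the max pooling layer output.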

Our text starts out as a string but gets converted to a feature vector of length 32 (one value per Conv1D filter) through various transformation steps (from tokenization to embedding to 1-dimensional convolution to max pool).

Let's take a peek at what each of these transformations looks like.


In [82]:
# See the output tensors of each layer (embedding, Conv1D, max pool) for the test sentence
embedding_test[:1], conv_1d_output[:1], max_pool_output[:1]

(<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
 array([[[ 0.02534914, -0.03109059,  0.00285616, ..., -0.0078316 ,
          -0.02685576, -0.04434131],
         [-0.0658626 ,  0.09451494, -0.01477602, ..., -0.00657781,
          -0.0423879 ,  0.07777896],
         [-0.04803652, -0.00709757, -0.02330893, ..., -0.0180733 ,
           0.02351034,  0.02676384],
         ...,
         [ 0.00073164,  0.01504801, -0.03425453, ..., -0.04403543,
          -0.01042281,  0.01876438],
         [ 0.00073164,  0.01504801, -0.03425453, ..., -0.04403543,
          -0.01042281,  0.01876438],
         [ 0.00073164,  0.01504801, -0.03425453, ..., -0.04403543,
          -0.01042281,  0.01876438]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 15, 32), dtype=float32, numpy=
 array([[[0.        , 0.        , 0.        , 0.        , 0.        ,
          0.00736916, 0.        , 0.01438061, 0.03620387, 0.        ,
          0.0286228 , 0.0236714 , 0.        , 0.03180662, 0.06328546,
          0.07848544, 0.

In [83]:
# Fresh (untrained) embedding layer for the Conv1D model, seeded for reproducibility
from tensorflow.keras import layers

tf.random.set_seed(42)
model_5_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding_5",
)

In [84]:
# Build the Conv1D model: raw string -> token ids -> embeddings -> Conv1D -> global max pool -> sigmoid
inputs = layers.Input(shape=(1,), dtype="string")
token_ids = text_vectorizer(inputs)
token_embeddings = model_5_embedding(token_ids)

# 64 filters, each looking at 5 tokens at a time
ngram_features = layers.Conv1D(
    filters=64,
    kernel_size=5,
    activation="relu",
    padding="valid",
)(token_embeddings)
pooled_features = layers.GlobalMaxPool1D()(ngram_features)
# An optional Dense(64, activation="relu") layer could be inserted here.

outputs = layers.Dense(1, activation="sigmoid")(pooled_features)
model_5 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_5_Conv1D")

# Configure training: binary cross-entropy loss, Adam with default settings, accuracy metric
model_5.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [85]:
# Print the layer-by-layer summary of model_5 (output shapes and parameter counts)
model_5.summary()

Model: "model_5_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 1)              

In [86]:
# Train model_5 for 5 epochs, validating on the validation split and logging to TensorBoard
model_5_tensorboard = create_tensorboard_callback(SAVE_DIR, "model_5_conv1D")
model_5_history = model_5.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
    callbacks=[model_5_tensorboard],
)

Saving TensorBoard log files to: model_log/model_5_conv1D/20220112-161516
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [87]:
# Evaluate model_5 on the validation data (model_5 was compiled with a loss and an accuracy metric)
model_5.evaluate(val_sentences, val_labels)



[0.7038962841033936, 0.7887139320373535]

In [88]:
# Make predictions with the Conv1D model on the validation sentences and show the first 10
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]

array([[0.543736  ],
       [0.8092995 ],
       [0.9998087 ],
       [0.03838049],
       [0.00113677],
       [0.99150544],
       [0.9677432 ],
       [0.998509  ],
       [0.99936634],
       [0.28771225]], dtype=float32)

In [89]:
# Convert model_5 prediction probabilities to classes: round to 0/1, then squeeze out the size-1 dimension
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [90]:
# Calculate the Conv1D model's performance metrics on the validation labels
model_5_results = calculate_results(y_true = val_labels,
                                   y_pred = model_5_preds)
model_5_results

{'accuracy': 78.87139107611549,
 'precision': 0.7926581572076621,
 'recall': 0.7887139107611548,
 'f1': 0.7860944810879305}

In [91]:
# Compare the Conv1D model's results with the baseline results, metric by metric
compare_baseline_to_new_results(
    baseline_results=baseline_results,
    new_model_results=model_5_results,
)

Baseline accuracy: 79.27, New accuracy: 78.87, Difference: -0.39
Baseline precision: 0.81, New precision: 0.79, Difference: -0.02
Baseline recall: 0.79, New recall: 0.79, Difference: -0.00
Baseline f1: 0.79, New f1: 0.79, Difference: -0.00


# Transfer Learning Model

In [94]:
# Import the helper that builds a TensorBoard callback (a new callback is needed for each model).
# This repeats the helper_functions import made near the top of the notebook.
from helper_functions import create_tensorboard_callback

# Name of the directory TensorBoard logs are saved under (this line only sets the name;
# it does not create the directory itself)
SAVE_DIR = "model_log"

In [95]:
# Example sentence used below to try out the pretrained sentence encoder
sample_sentence = "There's a flood in my street!"

In [96]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Encode two sentences in one call; each sentence comes back as its own vector
embed_samples = embed([sample_sentence,
                      "when you call the Universal Senetence Encoder on a sentence, it turns into numbers"])

# Show the first 50 values of the first sentence's encoding
print(embed_samples[0][:50])

tf.Tensor(
[-0.01157027  0.0248591   0.02878048 -0.012715    0.03971538  0.0882776
  0.02680985  0.05589838 -0.01068729 -0.00597292  0.00639323 -0.0181952
  0.00030814  0.09105888  0.05874645 -0.03180628  0.01512474 -0.05162929
  0.00991367 -0.06865346 -0.04209305  0.0267898   0.03011008  0.00321069
 -0.00337971 -0.04787356  0.02266719 -0.00985925 -0.04063613 -0.01292093
 -0.04666384  0.056303   -0.03949255  0.00517688  0.02495828 -0.07014441
  0.02871508  0.04947684 -0.00633978 -0.08960193  0.02807117 -0.00808362
 -0.01360601  0.0599865  -0.10361787 -0.05195374  0.00232955 -0.0233253
 -0.03758105  0.03327729], shape=(50,), dtype=float32)


In [97]:
# Each sentence has been encoded into a 512 dimension vector; check the shape of the first encoding
embed_samples[0].shape

TensorShape([512])

In [98]:
# We can use this encoding layer in place of our text_vectorizer and embedding layer
# NOTE(review): the recorded output of this cell is an InternalError ("Dst tensor is not
# initialized") raised while copying a tensor from CPU to GPU. Presumably the GPU ran out
# of memory, since the same encoder was already loaded with hub.load() in an earlier cell
# - TODO confirm and make sure this cell runs cleanly on a fresh kernel.
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE") 

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run Identity: Dst tensor is not initialized. [Op:Identity]