In [6]:
def setup():
    # Download helper functions script
    !wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
    # Import series of helper functions for the notebook
    from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys
    # Download data (same as from Kaggle)
    !wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"
    # Unzip data
    unzip_data("nlp_getting_started.zip")

setup()

--2024-01-18 23:17:45--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.2’


2024-01-18 23:17:45 (48.4 MB/s) - ‘helper_functions.py.2’ saved [10246/10246]

--2024-01-18 23:17:45--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.76.187, 142.250.183.91, 142.251.42.27, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.76.187|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.2’


2024-01-18 23:17:47 (8.

## Understanding Vectorize & Embedding

In [7]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [8]:
# Turn .csv files into pandas DataFrame's
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
max_vocab_size              = 5000      # Maximum vocab size.
text_to_vector_dumb_length  = 15        # Sequence length to pad the outputs to.

# Create the layer.
text_to_vectorizer_dumb = tf.keras.layers.TextVectorization( max_tokens= max_vocab_size, output_mode='int', output_sequence_length= text_to_vector_dumb_length)

entire_text = ["entire text goes here to build vocabulary"]
text_to_vectorizer_dumb.adapt(entire_text)

print(text_to_vectorizer_dumb.get_vocabulary())
# UNK = Unknown Word. For any new word which might

text_to_vectorizer_dumb(["entire"])
text_to_vectorizer_dumb(["text"])
text_to_vectorizer_dumb(["goes"])
text_to_vectorizer_dumb(["here"])
text_to_vectorizer_dumb(["entire text goes here build"])


['', '[UNK]', 'vocabulary', 'to', 'text', 'here', 'goes', 'entire', 'build']


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=array([[7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=array([[4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=array([[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=array([[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=array([[7, 4, 6, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

## Training Dataset

In [10]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df["text"].to_numpy(), train_df["target"].to_numpy(), test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

### Vectorize

In [13]:
max_vocab_size              = 50000      # Total Number of Words. One word mapped to one number. (Max vocab size)
text_to_vector_dumb_length  = 50         # Number of Words in a Sentence. (Sequence length to pad the outputs to.)
text_to_vectorizer_dumb     = tf.keras.layers.TextVectorization( max_tokens= max_vocab_size, output_mode='int', output_sequence_length= text_to_vector_dumb_length)

# Build vocabulary & integer mapping from training data. Method => `adapt` ****
text_to_vectorizer_dumb.adapt(train_sentences)

print(text_to_vectorizer_dumb.get_vocabulary())
len(text_to_vectorizer_dumb.get_vocabulary())

# UNK = Unknown Word. For any new word which might
# vocabulary is based on frequency of the word. "a, an, the" most frequent words.
top_5_words    = text_to_vectorizer_dumb.get_vocabulary()[:5]  # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = text_to_vectorizer_dumb.get_vocabulary()[-5:] # least common tokens




21089

In [None]:
text_to_vectorizer_dumb(["good morning, hello world", "second sentence for 2nd vector"])

### Embedding

In [15]:
import numpy as np

# one word mapped to a vector containing its meaning. 
vector_with_embedded_meaning_dim   = 64
embed_meaning_in_vector_layer = tf.keras.layers.Embedding(output_dim= vector_with_embedded_meaning_dim, input_dim = max_vocab_size, input_length= text_to_vector_dumb_length)

dumb_vector             = text_to_vectorizer_dumb(["good morning, hello world"])
vector_with_embedded_meaning = embed_meaning_in_vector_layer(dumb_vector)
vector_with_embedded_meaning

# EMBED MEANING => EMBEDDING


<tf.Tensor: shape=(1, 50, 64), dtype=float32, numpy=
array([[[-0.020926  ,  0.03570745, -0.01570611, ..., -0.03111103,
          0.0311957 ,  0.00090834],
        [ 0.0010921 ,  0.037993  , -0.04221752, ...,  0.00628193,
          0.00900203, -0.02094783],
        [ 0.02979347, -0.00238502, -0.04763527, ..., -0.04642969,
          0.0143331 ,  0.03021241],
        ...,
        [-0.03795617,  0.01420427,  0.01248878, ...,  0.04375717,
         -0.00041147,  0.0461457 ],
        [-0.03795617,  0.01420427,  0.01248878, ...,  0.04375717,
         -0.00041147,  0.0461457 ],
        [-0.03795617,  0.01420427,  0.01248878, ...,  0.04375717,
         -0.00041147,  0.0461457 ]]], dtype=float32)>

In [16]:
# Layer which builds meaning into 64 dims, contains following number of parameters. In maths language
keras.models.Sequential(embed_meaning_in_vector_layer).summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 64)            3200000   
                                                                 
Total params: 3200000 (12.21 MB)
Trainable params: 3200000 (12.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Models

#### Model Architecture - 1 Hidden Layer = Global Average Pooling 1D

In [17]:
# Build model with the Subclassing API
"""
    NN Architecture for NLP
    1. Input Layer
    2. Vectorize Input
    3. Embed Meaning in vector
    4. <Hidden Layers>. (In this case: Global Average Pooling)
    5. Output Layer 
"""

class Model_1_Subclassing(keras.Model):
    def __init__(self):
        super().__init__()
        self.input_layer    = keras.layers.Input(shape=(1,), dtype="string")
        self.vectorize      = text_to_vectorizer_dumb
        self.embedding      = embed_meaning_in_vector_layer
        self.global_average = keras.layers.GlobalAveragePooling1D() # one word of 64 dims, containing meaning of all the words
        self.output_layer   = keras.layers.Dense(1  , activation="sigmoid")

    def call(self, single_batch):
        vectorize_output        = self.vectorize        (single_batch)
        embedding_output        = self.embedding        (vectorize_output)
        global_average_output   = self.global_average   (embedding_output)
        final_layer_output      = self.output_layer     (global_average_output)

        return final_layer_output

model_1 = Model_1_Subclassing()
# Compile model
model_1.          compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"], run_eagerly= True)
model_1_history = model_1.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels))

'\n    NN Architecture for NLP\n    1. Input Layer\n    2. Vectorize Input\n    3. Embed Meaning in vector\n    4. <Hidden Layers>. (In this case: Global Average Pooling)\n    5. Output Layer \n'



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model_1.summary()

#### Model Architecture: 2 Hidden Layers. GAP & FC/Dense Layer

In [18]:
class Model_Hidden_Layer(keras.Model):
    def __init__(self):
        super().__init__()
        self.input_layer    = keras.layers.Input(shape=(1,), dtype="string")
        self.vectorize      = text_to_vectorizer_dumb
        self.embedding      = embed_meaning_in_vector_layer
        self.global_average = keras.layers.GlobalAveragePooling1D()
        self.hidden_layer_1 = keras.layers.Dense(100, activation="relu")
        self.output_layer   = keras.layers.Dense(1  , activation="sigmoid")

    def call(self, single_batch):
        vectorize_output        = self.vectorize        (single_batch)
        embedding_output        = self.embedding        (vectorize_output)
        global_average_output   = self.global_average   (embedding_output)
        hidden_layer_1_output   = self.hidden_layer_1   (global_average_output)
        final_layer_output      = self.output_layer     (hidden_layer_1_output)

        return final_layer_output

model_hidden_layer = Model_Hidden_Layer()
model_hidden_layer.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"], run_eagerly= True)
model_hidden_layer = model_hidden_layer.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels))




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
