In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
# List the GPUs visible on this machine (one line per device) as a quick hardware check.
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 2060 (UUID: GPU-1191f81c-7fba-75e4-915b-9616e90b1b53)


In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2022-02-06 18:24:30--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: 'helper_functions.py'

     0K ..........                                            100% 14.7M=0.001s

2022-02-06 18:24:30 (14.7 MB/s) - 'helper_functions.py' saved [10246/10246]



In [4]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [5]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

--2022-02-07 09:20:52--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.112, 172.217.13.144, 172.217.13.176, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: 'nlp_getting_started.zip'

     0K .......... .......... .......... .......... ..........  8% 19.6M 0s
    50K .......... .......... .......... .......... .......... 16% 20.0M 0s
   100K .......... .......... .......... .......... .......... 25% 19.2M 0s
   150K .......... .......... .......... .......... .......... 33% 19.8M 0s
   200K .......... .......... .......... .......... .......... 42% 19.1M 0s
   250K .......... .......... .......... .......... .......... 50% 19.3M 0s
   300K .......... .......... .......... .......... .......... 59% 18.3M 0s
   350K .......... .......... ..........

In [6]:
# Extract the downloaded archive first: on a fresh kernel the CSVs do not exist
# until nlp_getting_started.zip is unpacked (the archive is expected to hold
# train.csv and test.csv at its top level).
unzip_data("nlp_getting_started.zip")

# Load both splits into DataFrames.
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

In [7]:
# Preview the first rows of the training data (id, keyword, location, text, target).
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Preview the first rows of the test data; unlike train_df it has no target column.
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
# Shuffle the training rows: frac=1 samples every row, and the fixed
# random_state keeps the resulting order reproducible between runs.
train_df_shuffled = train_df.sample(
    random_state=42,
    frac=1,
)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled tweets. Both arrays come from train_df_shuffled,
# so the "test" names here are effectively a validation split, not rows of test_df.
X_train, X_test, y_train, y_test = train_test_split(
    train_df_shuffled['text'].to_numpy(),
    train_df_shuffled['target'].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [14]:
# Check the split: feature and label arrays must have matching lengths per split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6851,), (762,), (6851,), (762,))

In [15]:
## Convert text to numbers using tokenization or embeddings

# Options for the embedding:
# - create (learn) your own embedding
# - re-use a pre-learned embedding

In [16]:
## preprocessing layer for tokenization

# Prefer the stable import path: the `experimental.preprocessing` namespace is a
# legacy alias that newer Keras releases no longer ship. Fall back to it only on
# older TensorFlow installs where the stable path is not available.
try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [18]:
# Reference instantiation with every setting written out explicitly.
text_vectorizer = TextVectorization(
    # --- text processing ---
    standardize='lower_and_strip_punctuation',  # how raw text is cleaned
    split='whitespace',                         # how text is split into tokens
    ngrams=None,                                # whether to group tokens into n-word chunks
    # --- output ---
    max_tokens=None,                            # how many words the vocabulary may hold
    output_mode='int',                          # how tokens are mapped to numbers
    output_sequence_length=None,                # length of each output token sequence
)

In [19]:
# Limits shared by the vectorizer and the embedding layer defined later.
max_vocab_length = 10_000  # cap on the number of words kept in the vocabulary
max_length = 15            # tokens per sequence, i.e. how many words of a Tweet the model sees

# Integer-encoding vectorizer whose output is exactly max_length ids per input.
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

In [21]:
# Fit the vectorizer on the training text only (X_train), keeping the held-out split unseen.
text_vectorizer.adapt(X_train)

In [22]:
# Vectorize one hand-written sentence. The input is wrapped in a list (a batch of 1);
# the result has shape (1, 15), with the short sentence padded with 0s up to max_length.
sample_sentence = "There's a flood in my street"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [24]:
import random
# Pick one training tweet at random and compare its raw text with its token ids.
# NOTE(review): no seed is set for `random` in the cells shown, so the chosen
# tweet will presumably differ from run to run.
random_sentence = random.choice(X_train)
print(f"Original sentence '{random_sentence}'.")
text_vectorizer([random_sentence])

Original sentence 'someone: mentions gansey on fire
me busting through the brick walls of seven different buildings:'.


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 475, 2050,    1,   11,   42,   31,    1,  295,    2, 6067, 3266,
           6, 1766, 1215,   95]], dtype=int64)>

In [25]:
# Inspect the learned vocabulary. The list appears to be ordered from most to
# least frequent, starting with the padding token '' and the '[UNK]' token.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]

print(f"Number of words in vocab {len(words_in_vocab)}")
print(f"Top 5 most common words : {top_5_words}")
print(f"Bottom 5 least common words : {bottom_5_words}")

Number of words in vocab 10000
Top 5 most common words : ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words : ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


## Create an Embedding using an Embedding layer

In [26]:
from tensorflow.keras import layers

# Fix TensorFlow's global seed before the layer is created.
tf.random.set_seed(42)

# Embedding sized to match the vectorizer's vocabulary and sequence length.
embedding = layers.Embedding(
    input_dim=max_vocab_length,        # vocabulary size used by the vectorizer
    output_dim=128,                    # length of the vector produced per token
    input_length=max_length,           # tokens per input sequence
    embeddings_initializer='uniform',  # how the embedding weights start out
    name='embedding_1',
)

In [27]:
# Draw another training tweet at random and show its raw text.
random_sentence = random.choice(X_train)
print(f"Original text : '{random_sentence}'.")

# Vectorize to token ids, then pass them through the embedding layer. The displayed
# result has shape (1, 15, 128): 1 sentence, 15 tokens, 128 values per token.
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed


Original text : 'With a sinking music video tv career Brooke Hogan should be THANKING her Dad for the free publicity...although I doubt it will help her.'.


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04978979, -0.0019431 , -0.00039934, ..., -0.01621551,
          0.03926425,  0.02467387],
        [-0.04284013, -0.01489798, -0.0159496 , ..., -0.01166106,
          0.03061062,  0.01972148],
        [ 0.0386066 ,  0.01791922,  0.04897166, ...,  0.02531831,
         -0.00777938,  0.04759762],
        ...,
        [ 0.00528262, -0.04030708,  0.04332492, ...,  0.00316782,
          0.03644954, -0.00621504],
        [-0.04111345, -0.03443157, -0.00189682, ..., -0.01965671,
          0.04853431, -0.01313479],
        [-0.03508195,  0.01448471,  0.00513798, ...,  0.04485155,
          0.03976364, -0.03543619]]], dtype=float32)>

In [28]:
# Embedding vector of the first token in the first (only) sentence: shape (128,).
sample_embed[0, 0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.04978979, -0.0019431 , -0.00039934,  0.00179777,  0.00696483,
        0.02826217, -0.00501275, -0.03520845, -0.01997205, -0.02367836,
       -0.0155376 , -0.00187721,  0.0459861 , -0.02331309,  0.02476053,
       -0.04370702,  0.02071167,  0.02952768,  0.02044065,  0.0423478 ,
        0.02545262,  0.01628632, -0.04094479, -0.03996222,  0.04462297,
       -0.03908013,  0.04199353, -0.01610132,  0.01564901, -0.013139  ,
        0.00597284,  0.00877153, -0.03182314,  0.02276621, -0.02383438,
        0.03765985, -0.02755485, -0.02503128,  0.02739936,  0.03520251,
        0.0371048 ,  0.03949472,  0.03401914,  0.04591072,  0.04174298,
        0.01891393, -0.03835064,  0.01907009,  0.03154561, -0.03021329,
       -0.04740628,  0.03474759,  0.03825137, -0.02400986, -0.01284285,
       -0.00407534,  0.03015329,  0.02578986,  0.04487634,  0.00563987,
        0.0126985 , -0.04686456, -0.02622954,  0.01527597,  0.01179849,
        0.007423