In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import os

In [None]:
# List the GPUs visible to this runtime (shell escape; the result is shown below the cell)
!nvidia-smi -L

GPU 0: Tesla K80 (UUID: GPU-d8f06b09-7afd-3d65-930a-3061b8476b68)


In [None]:
# Download the course helper module into the working directory (shell escape)
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# Import the helpers from the module that was just downloaded
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2022-05-05 18:21:45--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2022-05-05 18:21:45 (49.1 MB/s) - ‘helper_functions.py’ saved [10246/10246]



Get a text dataset: Kaggle's "Introduction to NLP" dataset of tweets labelled as disaster or not disaster.

In [None]:
# Download the zipped dataset into the working directory (shell escape)
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

# Unzip the dataset with the helper imported from helper_functions.py
# NOTE(review): presumably extracts into the current directory, since train.csv/test.csv are read from there next - confirm
unzip_data('nlp_getting_started.zip')

--2022-05-05 18:21:45--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.200.128, 209.85.234.128, 108.177.112.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.200.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2022-05-05 18:21:46 (47.0 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
# pd.read_csv loads each file fully into RAM; that is fine for this dataset but would not work for data larger than memory
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Shuffle the training rows: frac=1 samples every row, random_state=42 makes the order reproducible
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
# Peek at the test dataframe (its preview below shows no 'target' column)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Check class balance: count how many training samples carry each target value
train_df_shuffled.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Number of samples in the training and test dataframes
len(train_df_shuffled), len(test_df)

(7613, 3263)

In [None]:
# Print five consecutive rows of the shuffled training data, starting at a random offset
random_index = random.randint(0, len(train_df_shuffled)-5)
sample_rows = train_df_shuffled[['text', 'target']][random_index:random_index+5]
for text, target in zip(sample_rows['text'], sample_rows['target']):
  # Any positive target is reported as a real disaster
  if target > 0:
    label = "(real disaster)"
  else:
    label = "(not real disaster)"
  print (f"Target : {target}", label)
  print (f"Text : \n{text}\n")
  print ("------\n")

Target : 1 (real disaster)
Text : 
Evacuation order lifted for Roosevelt after #Wildfire misses town - KOMO News http://t.co/qCpMktGLLR

------

Target : 0 (not real disaster)
Text : 
Germany has  39 gigawatts of installed solar capacity
_One gwatt is about equal to the capacity of a nuclear reactor.
http://t.co/leCZOlkmSV

------

Target : 1 (real disaster)
Text : 
Property losses from California wildfire nearly double as week-old blaze rages http://t.co/E0UUsnpsq5

------

Target : 1 (real disaster)
Text : 
Madhya Pradesh Train Derailment: Village Youth Saved Many Lives

------

Target : 0 (not real disaster)
Text : 
@FNAF_TalkMC *stands there engulfed in the fire smiling*

------



### Create training and validation splits


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled training data for validation (test_size=0.1);
# random_state pins the split so reruns give the same partition.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled['text'].to_numpy(),
    train_df_shuffled['target'].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sanity check: sentences and labels should have matching lengths within each split
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

### Converting text to numbers (tokenization)

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# TextVectorization with every argument written out explicitly.
# NOTE(review): described here as the layer's default values - confirm against the installed TensorFlow version.
# This instance is only illustrative: text_vectorizer is reassigned with a vocabulary cap and a
# fixed sequence length a couple of cells further down.
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize='lower_and_strip_punctuation',
                                    split='whitespace',
                                    ngrams=None,
                                    output_mode='int',
                                    output_sequence_length=None,
                                    pad_to_max_tokens=False
                                    )

In [None]:
# Average number of whitespace-separated tokens per training tweet, rounded to the nearest integer
total_token_count = sum(len(sentence.split()) for sentence in train_sentences)
round(total_token_count / len(train_sentences))

15

In [None]:
# Set up the vectorization variables
max_vocab_length=10000 # passed as max_tokens: upper bound on the vocabulary size
max_length=15 # passed as output_sequence_length; 15 is the average tweet length computed in the previous cell

# Replace the illustrative vectorizer with one that caps the vocabulary and emits fixed-length integer sequences
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length,
                                    )

In [None]:
# Fit the vectorizer on the training sentences only (the validation sentences are not passed to adapt)
text_vectorizer.adapt(train_sentences)

In [None]:
# Create a sample sentence and tokenize it; it is wrapped in a list so the result has a batch dimension
# (shape (1, 15) in the output below)
sample_sentence = "Bishwas!!!, Theres a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  1, 264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [None]:
# Pick a random training sentence and show it next to its vectorized form.
# NOTE: the backslash continuation inside the f-string keeps the next line's leading spaces in the
# printed text, which is why the sentence is followed by trailing whitespace in the output.
random_sentence = random.choice(train_sentences)
print (f'Original Sentence :\n {random_sentence}\
        \n\nVectorized Version : \n')
text_vectorizer([random_sentence])

Original Sentence :
 #TheDoolingGroup 2 injured when 2 school buses collide - åÊ #BREAKING: School bus slams into school bus in Bordento... http://t.co/YQHfio9XQm        

Vectorized Version : 



<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[7657,   70,  243,   45,   70,  185, 1318,  517, 2585,  379,  185,
         352, 8428,   66,  185]])>

In [None]:
# Get the vocabulary learned by the vectorizer and print its size plus its first and last five entries.
# NOTE(review): "top"/"bottom" here just mean the start/end of the returned list; presumably the list is
# ordered from most to least frequent token - confirm in the TextVectorization documentation.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print (f'Number of words in vocab : {len(words_in_vocab)}')
print (f'Top 5 words : {top_5_words}')
print (f'Bottom 5 words : {bottom_5_words}')

Number of words in vocab : 10000
Top 5 words : ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 words : ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [None]:
# Create an embedding layer using TensorFlow's Embedding layer
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = max_vocab_length, # one embedding row per vocabulary index
                             output_dim = 128, # length of each embedding vector
                             input_length = max_length) # tokens per input sequence (matches the vectorizer output length)
embedding

<keras.layers.embeddings.Embedding at 0x7f6fc00e4a10>

In [None]:
# Pick a random training sentence and show it next to its embedded form.
# NOTE: the backslash continuation inside the f-string keeps the next line's leading spaces in the printed text.
random_sentence = random.choice(train_sentences)
print (f'Original Sentence :\n {random_sentence}\
        \n\nEmbedded Version : \n')
embedding(text_vectorizer([random_sentence])) #because embedding layer takes integers as input and not words

Original Sentence :
 Arson suspect linked to 30 fires caught in Northern California - Los Angeles Times http://t.co/PrRB4fhXtv        

Embedded Version : 



<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01045424,  0.02539333, -0.01793944, ..., -0.02770909,
          0.02899114,  0.0439996 ],
        [-0.02746434,  0.0491311 ,  0.02593211, ..., -0.02201183,
         -0.03034258, -0.02571818],
        [-0.04715477,  0.04065189,  0.01256293, ...,  0.00046393,
         -0.02088914,  0.00296171],
        ...,
        [-0.0127307 , -0.027483  ,  0.00578717, ..., -0.00839202,
         -0.02671483,  0.02659321],
        [-0.01210526, -0.04483822,  0.03039796, ..., -0.02578837,
         -0.00402967,  0.03287086],
        [-0.04269737, -0.00895267,  0.0302148 , ..., -0.04720119,
         -0.00367616,  0.00310628]]], dtype=float32)>

In [None]:
# Check out a single token's embedding.
# The sentence is vectorized and embedded once and the result reused for both the value and its
# shape, rather than running the identical forward pass twice.
first_token_embedding = embedding(text_vectorizer([random_sentence]))[0][0]
first_token_embedding, first_token_embedding.shape

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.01045424,  0.02539333, -0.01793944,  0.0062318 , -0.03665435,
        -0.03785205,  0.02049949, -0.00856379, -0.00630745, -0.00482694,
         0.02219259, -0.0042155 ,  0.04596363, -0.00896001,  0.03770569,
        -0.0390665 , -0.04790983, -0.01924187,  0.0369289 , -0.00859884,
        -0.03806525,  0.02824047,  0.04055974, -0.04992533, -0.00328238,
        -0.03085488, -0.02868376,  0.03146696,  0.03172812, -0.00484655,
         0.03174594, -0.02435772, -0.03060722, -0.01693331, -0.01962663,
         0.01118491,  0.00476348, -0.02762185, -0.00530286, -0.00574281,
         0.02480047,  0.00674679, -0.00134458, -0.01926309,  0.03732301,
        -0.00313086, -0.00587652, -0.00818241,  0.01549138,  0.02783347,
        -0.03108292,  0.01313544, -0.04992604, -0.01422976,  0.02809553,
         0.02194649,  0.02153719,  0.0441472 , -0.04439013,  0.0486823 ,
         0.02748602, -0.01387423, -0.02952758,  0.0215994 , -0.03788038,
  

### Model 0: Naive Bayes baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features feeding a multinomial Naive Bayes classifier, chained in one pipeline
model_0 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit both steps on the raw training sentences
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [None]:
# Evaluate the baseline on the validation split; the score is a fraction, so it is multiplied by 100 when printed
baseline_score = model_0.score(val_sentences, val_labels)
print (f'Our baseline model achieves an accuracy of : {baseline_score*100}%')

Our baseline model achieves an accuracy of : 79.26509186351706%


In [None]:
# Predict validation labels with the baseline and preview the first five
baseline_pred = model_0.predict(val_sentences)
baseline_pred[:5]

array([1, 1, 1, 0, 0])

In [None]:
#Creating a function to calculate all the evaluation metrics

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Compute accuracy, precision, recall and F1 for a set of predictions.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned element-wise with y_true.

  Returns:
    dict with the keys "accuracy" (scaled to a percentage), "precision",
    "recall" and "f1_score" (the last three use average='weighted').
  """
  accuracy_pct = accuracy_score(y_true, y_pred)*100
  # The fourth value returned (support) is not needed here
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

  return {"accuracy" : accuracy_pct,
          "precision" : precision,
          "recall" : recall,
          "f1_score" : f1}

In [None]:
# Compute accuracy/precision/recall/F1 for the baseline predictions on the validation labels
baseline_results = calculate_results(val_labels, baseline_pred)
baseline_results

{'accuracy': 79.26509186351706,
 'f1_score': 0.7862189758049549,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706}

### Model 1: Feed-forward neural network (dense) model

In [None]:
#Import the TensorBoard callback factory from helper_functions.py
from helper_functions import create_tensorboard_callback

#Name of the directory TensorBoard logs are written under (this line only stores the name; it does not create the directory)
SAVE_DIR = 'model_logs'

In [None]:
#Build model_1 with the Functional API: raw string -> token ids -> embeddings -> average pool -> sigmoid
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
token_ids = text_vectorizer(inputs)
token_embeddings = embedding(token_ids)
pooled_embedding = layers.GlobalAveragePooling1D()(token_embeddings)
outputs = layers.Dense(1, activation='sigmoid')(pooled_embedding)
model_1 = tf.keras.Model(inputs, outputs, name='model1_dense')

In [None]:
#Print the layer-by-layer summary (output shapes and parameter counts) of model_1
model_1.summary()

Model: "model1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
No

In [None]:
#Compile the model: binary cross-entropy loss, Adam optimizer with its default arguments, accuracy as the metric
model_1.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
#Fit the model for 5 epochs on the raw training sentences, validating on the held-out split;
#the callback writes TensorBoard logs under SAVE_DIR/model_1_dense (see the path printed below)
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, experiment_name='model_1_dense')])

Saving TensorBoard log files to: model_logs/model_1_dense/20220505-182153
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#Evaluate on the validation split; the two values returned are the loss and the compiled accuracy metric
model_1.evaluate(val_sentences, val_labels)



[0.4798804521560669, 0.7755905389785767]

In [None]:
#Predicted probabilities for the validation sentences (one sigmoid output per sample)
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape

(762, 1)

In [None]:
#Convert the probabilities to labels: tf.round maps each probability to 0 or 1,
#tf.squeeze drops the size-1 axis so the shape goes from (762, 1) to (762,)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds.shape

TensorShape([762])

In [None]:
#Compute accuracy/precision/recall/F1 for model_1's predictions on the validation labels
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results

{'accuracy': 77.55905511811024,
 'f1_score': 0.7736182129212565,
 'precision': 0.7772070861555818,
 'recall': 0.7755905511811023}

### Visualizing learned embeddings


In [None]:
#Get the vocabulary from the text vectorizer and preview its size and first ten entries
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [None]:
#model_1 summary again, to look up the embedding layer's name ('embedding') used in the next cell
model_1.summary()

Model: "model1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
No

In [None]:
#Get the weight matrix of the embedding layer: get_weights() returns a list and the matrix is its first element.
#Its shape (10000, 128) is one 128-dimensional vector per vocabulary index.
embed_weights = model_1.get_layer('embedding').get_weights()[0]
embed_weights, embed_weights.shape

(array([[-0.05234158, -0.00271497,  0.03789909, ..., -0.05665021,
         -0.01011819, -0.00186358],
        [-0.01541384, -0.04331435,  0.03282684, ..., -0.02902712,
         -0.00456407,  0.03225722],
        [-0.0414076 ,  0.0464121 , -0.0061805 , ..., -0.05393603,
         -0.05437831, -0.03337459],
        ...,
        [-0.03286456, -0.01657138, -0.03720272, ...,  0.04744336,
          0.006946  ,  0.00129831],
        [-0.02876865,  0.01683546,  0.01304436, ..., -0.03466458,
         -0.00626923, -0.02222297],
        [-0.01540418,  0.11091513,  0.09716233, ..., -0.02116515,
         -0.02872247, -0.02909903]], dtype=float32), (10000, 128))

In [None]:
#Now that we have the embedding matrix the model learned for our tokens, export it so it can be
#explored with the Embedding Projector at https://projector.tensorflow.org/

#Adapted from the word embeddings tutorial in the TensorFlow documentation

import io

#Write one line per token to each file: vectors.tsv gets the tab-separated embedding values and
#metadata.tsv gets the token itself. The with-block closes both files even if a write raises.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(words_in_vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")


In [None]:
#Download the files from Colab so they can be uploaded to the Embedding Projector

try:
  from google.colab import files
except ImportError:
  # Not running on Colab, so there is no browser download to trigger
  print("google.colab is not available; skipping download of vectors.tsv and metadata.tsv")
else:
  for tsv_name in ('vectors.tsv', 'metadata.tsv'):
    try:
      files.download(tsv_name)
    except Exception as download_error:
      # Stay best-effort, but report the failure instead of silently discarding it
      print(f"Could not download {tsv_name}: {download_error}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>