## Get data

In [8]:
# Only needed when running on Google Colab: download the dataset archive
#!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip


## Explore Data

In [9]:
import pandas as pd

# Read the training and test CSV files (in that order) into two dataframes,
# then preview the first rows of the training dataframe.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/nlp_getting_started/train.csv",
                     "../data/nlp_getting_started/test.csv")
)
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
# Preview the first rows of the test dataframe
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [11]:
# Share of all samples (training + test combined) that falls into each file
total_samples = len(df_train) + len(df_test)
train_share = len(df_train) / total_samples * 100
test_share = len(df_test) / total_samples * 100
print(f"Percentage of samples in training set: {train_share:.2f}%")
print(f"Percentage of samples in test set: {test_share:.2f}%")

Percentage of samples in training set: 70.00%
Percentage of samples in test set: 30.00%


In [12]:
# Shuffle training dataframe
# frac=1 samples every row (i.e. a full shuffle of the rows);
# random_state=42 fixes the random order so reruns give the same result.
df_train_shuffled = df_train.sample(frac=1, random_state=42)
# Preview the first rows of the shuffled dataframe
df_train_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [13]:
# Unique labels and how many tweets carry each one
target_counts = df_train_shuffled["target"].value_counts()
print(target_counts)

# Share of each class as a percentage of all training tweets
# (target == 1 is reported as "disaster", target == 0 as "non - disaster")
total_tweets = len(df_train_shuffled)
disaster_count = len(df_train_shuffled[df_train_shuffled["target"] == 1])
non_disaster_count = len(df_train_shuffled[df_train_shuffled["target"] == 0])
print("percentage of disaster tweets : ", round(disaster_count / total_tweets * 100, 2), "%")
print("percentage of non - disaster tweets : ", round(non_disaster_count / total_tweets * 100, 2), "%")


0    4342
1    3271
Name: target, dtype: int64
percentage of disaster tweets :  42.97 %
percentage of non - disaster tweets :  57.03 %


In [14]:
# Number of samples in the shuffled training dataframe and in the test dataframe
len(df_train_shuffled), len(df_test)

(7613, 3263)

In [15]:
# Visualize random training examples
import random

# Pick a random start position so that a window of 5 consecutive rows
# always fits inside the dataframe, then print each tweet with its label.
window_size = 5
start = random.randint(0, len(df_train_shuffled) - window_size)
sample_rows = df_train_shuffled[["text", "target"]].iloc[start:start + window_size]
for current_text, current_target in zip(sample_rows["text"], sample_rows["target"]):
    label_note = "(real disaster)" if current_target > 0 else "(not real disaster)"
    print(f"Target: {current_target}", label_note)
    print(f"Text:\n{current_text}\n")
    print("---\n")

Target: 0 (not real disaster)
Text:
Hey girl you must be Toe Hobbit: Part Two: ghe Desolation of Smaug because I'm not interested in seeing you. Sorry.

---

Target: 0 (not real disaster)
Text:
Texas Seeks Comment on Rules for Changes to Windstorm Insurer http://t.co/BNNIdfZWbd

---

Target: 0 (not real disaster)
Text:
The things we fear most in organizations--fluctuations disturbances imbalances--are the primary sources of creativity. - Margaret Wheatley

---

Target: 0 (not real disaster)
Text:
FedEx no longer to transport bioterror germs in wake of anthrax lab mishaps http://t.co/qZQc8WWwcN via @usatoday

---

Target: 0 (not real disaster)
Text:
Am now repped by the fantastic Laura Milne @TheJonesesVoice for all your liguistic needs. And that's some tongue twister tweets

---



## Split the data

In [16]:
# Split the shuffled training data into training and validation sets
from sklearn.model_selection import train_test_split

# Pull the tweet texts and their labels out of the dataframe as NumPy arrays
tweet_texts = df_train_shuffled["text"].to_numpy()
tweet_targets = df_train_shuffled["target"].to_numpy()

# Hold out 10% of the training data for validation; random_state fixes the split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)

# Number of samples (sentences and labels) in the training and validation sets
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [17]:
# Report the percentage of disaster (label 1) and non-disaster (label 0) tweets
# in the training set first, then in the validation set.
for split_name, split_labels in (("training", train_labels), ("validation", val_labels)):
    split_size = len(split_labels)
    for class_name, class_id in (("disaster", 1), ("non-disaster", 0)):
        class_pct = round(len(split_labels[split_labels == class_id]) / split_size * 100, 2)
        print(f"percentage of {class_name} tweets in {split_name} set : ", class_pct, "%")

percentage of disaster tweets in training set :  42.67 %
percentage of non-disaster tweets in training set :  57.33 %
percentage of disaster tweets in validation set :  45.67 %
percentage of non-disaster tweets in validation set :  54.33 %


In [18]:
# Show the first 10 training sentences together with their first 10 labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

In [19]:
# Show the first 10 validation sentences together with their first 10 labels
val_sentences[:10], val_labels[:10]

(array(['DFR EP016 Monthly Meltdown - On Dnbheaven 2015.08.06 http://t.co/EjKRf8N8A8 #Drum and Bass #heavy #nasty http://t.co/SPHWE6wFI5',
        'FedEx no longer to transport bioterror germs in wake of anthrax lab mishaps http://t.co/qZQc8WWwcN via @usatoday',
        'Gunmen kill four in El Salvador bus attack: Suspected Salvadoran gang members killed four people and wounded s... http://t.co/CNtwB6ScZj',
        '@camilacabello97 Internally and externally screaming',
        'Radiation emergency #preparedness starts with knowing to: get inside stay inside and stay tuned http://t.co/RFFPqBAz2F via @CDCgov',
        'Investigators rule catastrophic structural failure resulted in 2014 Virg.. Related Articles: http://t.co/Cy1LFeNyV8',
        'How the West was burned: Thousands of wildfires ablaze in #California alone http://t.co/iCSjGZ9tE1 #climate #energy http://t.co/9FxmN0l0Bd',
        "Map: Typhoon Soudelor's predicted path as it approaches Taiwan; expected to make landfall over so

In [20]:
# Percentage of all samples (shuffled training dataframe + test dataframe)
# that ends up in the training, validation and test sets.
total_samples = len(df_train_shuffled) + len(df_test)
split_sizes = {
    "training": len(train_sentences),
    "validation": len(val_sentences),
    "test": len(df_test),
}
for split_name, split_size in split_sizes.items():
    print(f"percentage of samples in {split_name} set : ", round(split_size / total_samples * 100, 2), "%")


percentage of samples in training set :  62.99 %
percentage of samples in validation set :  7.01 %
percentage of samples in test set :  30.0 %


## Tokenization / Embedding of text

* Tokenization: a direct mapping from each token (e.g. a word) to an integer index

* Embedding: a feature vector created for each token

### Tokenizing

In [39]:
# Get average number of tokens (words) in all training sentences
def avg_word_length(sentences):
    """
    Returns the average number of words per sentence, rounded to an integer.

    Despite the name, this measures how many whitespace-separated words a
    sentence contains on average, not the length of individual words.

    Args:
        sentences: a sized iterable of strings (e.g. list, NumPy array or
            pandas Series); each string is split on whitespace.

    Returns:
        int: the mean word count per sentence, rounded with the built-in
        ``round``. Returns 0 when ``sentences`` is empty.
    """
    sentence_count = len(sentences)
    # Guard against division by zero; len() is used instead of truthiness
    # because arrays and Series do not support a plain boolean test.
    if sentence_count == 0:
        return 0
    total_words = sum(len(sentence.split()) for sentence in sentences)
    return round(total_words / sentence_count)



# Average number of words per sentence in the training, validation and test
# texts (in that order), computed with avg_word_length.
train_avg_word_len, val_avg_word_len, test_avg_word_len = (
    avg_word_length(split_texts)
    for split_texts in (train_sentences, val_sentences, df_test["text"])
)

train_avg_word_len, val_avg_word_len, test_avg_word_len



(15, 15, 15)

In [44]:
# Tokenize text using a TensorFlow preprocessing layer
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Named settings instead of magic numbers so they are easy to find and tune
max_vocab_length = 10000  # upper bound on vocabulary size (this count includes the padding and OOV tokens)
max_length = 15  # equals the rounded average number of words per tweet computed above

# Create the TextVectorization layer.
# `pad_to_max_tokens` is deliberately not passed: it only applies to the
# "multi_hot", "count" and "tf_idf" output modes and has no effect with
# output_mode="int", where `output_sequence_length` controls padding/truncation.
text_vectorizer = TextVectorization(max_tokens=max_vocab_length, # how many words in the vocabulary
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=max_length) # pad or truncate every sequence to this many tokens

                                    


In [45]:
# Fit the text_vectorizer on the training text by calling adapt()
# (only train_sentences is passed here; validation and test text are not used)
text_vectorizer.adapt(train_sentences)


In [47]:
# Create a sample sentence and tokenize it
# (the vectorizer is called on a list, i.e. a batch containing one sentence)
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [48]:
# Pick a random training sentence and show it next to its vectorized form
random_sentence = random.choice(train_sentences)
# Keep the message in a single string literal: a backslash continuation inside
# the literal would embed the next line's indentation in the printed text.
print(f"Original text:\n{random_sentence}\n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
@stormbeard @steel_lord I seen Judas Priest in 2005 when Rob came back; Scorpions as support. Fucking annihilated the place. Astonishing gig        

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[8052, 8103,    8,  834,    1, 9579,    4, 4183,   45, 2735,  440,
          88, 4677,   26,  724]])>

In [49]:
# Fetch the vocabulary learned by the vectorizer and look at both ends of it
words_in_vocab = text_vectorizer.get_vocabulary()
# First five and last five entries of the vocabulary list
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"5 most common words: {top_5_words}")
print(f"5 least common words: {bottom_5_words}")


Number of words in vocab: 10000
5 most common words: ['', '[UNK]', 'the', 'a', 'in']
5 least common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Embedding

In [50]:
# Creating an Embedding using an Embedding Layer
from tensorflow.keras.layers import Embedding

# Size the embedding table from the fitted vectorizer's vocabulary so the two
# cannot drift apart if the vocabulary size is changed later.
vocab_size = len(text_vectorizer.get_vocabulary())

# Create an embedding layer
# NOTE(review): newer Keras releases may deprecate `input_length` — confirm
# against the installed version before removing or keeping it.
embedding = Embedding(input_dim=vocab_size, # size of our vocabulary
                      output_dim=128, # set the size of the embedding vector
                      embeddings_initializer="uniform", # default, initialize embedding layer
                      input_length=15) # how long is each input sentence (the vectorizer outputs sequences of 15 tokens)

# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
# Keep the message in a single string literal: a backslash continuation inside
# the literal would embed the next line's indentation in the printed text.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")
# Embed the random sentence (turn its token indices into dense vectors)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed


Original text:
'Trust us to get rescued by the dopey ones!' Val is hilarious shame she's probably going to die #emmerdale        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 1.28370523e-03, -1.90831181e-02,  4.44911420e-04, ...,
          3.32583673e-02,  8.70022923e-03,  4.41428311e-02],
        [ 1.17532611e-02, -8.49142671e-05, -1.31330267e-02, ...,
         -3.43974680e-03, -1.58836134e-02,  3.44605371e-03],
        [ 4.76971157e-02, -1.42902359e-02, -3.63789573e-02, ...,
          4.03727032e-02,  1.95386522e-02,  4.64261808e-02],
        ...,
        [ 1.10161789e-02, -4.61054444e-02, -3.90307792e-02, ...,
         -1.00136995e-02,  2.85941698e-02,  1.30464546e-02],
        [ 4.84103896e-02,  4.97270562e-02, -3.28893885e-02, ...,
          4.05841731e-02,  3.66923548e-02,  1.70076229e-02],
        [ 4.10092585e-02, -3.94623391e-02,  4.76190187e-02, ...,
          2.38736533e-02, -4.16267030e-02, -3.04843914e-02]]],
      dtype=float32)>