In [1]:
import pandas as pd
train_df = pd.read_csv(r'D:\1jupyter\Datasets\NLP disaster\train.csv')
test_df = pd.read_csv(r'D:\1jupyter\Datasets\NLP disaster\test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
train_df_shuffled = train_df.sample(frac=1, random_state=42) 
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [3]:
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [4]:
import random
random_index = random.randint(0, len(train_df)-5) 
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 1 (real disaster)
Text:
I was taught at school in the 1970s that piracy slavery and suicide-bombing were purely historical. No one then expected them to re-occur

---

Target: 1 (real disaster)
Text:
Petition/No Medals for 1890 Massacre Justice for Wounded Knee Killings of Native Americans! http://t.co/UilPg8i1ev http://t.co/m9pXTo2kwW

---

Target: 1 (real disaster)
Text:
70 years since we annihilated 100000 people instantly and became aware that we have the ability to annihilate the whole of humanity

---

Target: 0 (not real disaster)
Text:
@_DANGdaddy the sirens are telling you to get ready to TURN UP???????? http://t.co/qAQqrJv9gU

---

Target: 1 (real disaster)
Text:
KATUNews: #SR14 remains closed as brush fire burns 1700 acres: http://t.co/QposKp3MWj #LiveOnK2 http://t.co/mTQjsvupwy

---



In [5]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1, random_state=42) 

In [6]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [7]:
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize='lower_and_strip_punctuation',
                                    split='whitespace',
                                    ngrams=None,
                                    output_mode='int',
                                    output_sequence_length=None)
                                    #pad_to_max_tokens=True) if using max_tokens
                                   

In [9]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [10]:
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [11]:
text_vectorizer.adapt(train_sentences)

In [12]:
words_in_vocab = text_vectorizer.get_vocabulary()
print(f'words in vocab: {len(words_in_vocab)}')
print(f'most common words: {words_in_vocab[:5]}')
print(f'leat common words: {words_in_vocab[-5:]}')

words in vocab: 10000
most common words: ['', '[UNK]', 'the', 'a', 'in']
leat common words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [13]:
tf.random.set_seed(42)
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                      output_dim=128,
                                      embeddings_initializer='uniform',
                                      input_length=max_length,
                                      name='embedding')

embedding

<keras.layers.core.embedding.Embedding at 0x23b59136070>

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [15]:
model0 = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('clf', MultinomialNB())
])

model0.fit(train_sentences,train_labels)

In [16]:
score0 = model0.score(val_sentences,val_labels)

In [17]:
score0

0.7926509186351706

In [18]:
pred0 = model0.predict(val_sentences)

In [19]:
pred0

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [20]:
from sklearn.metrics import classification_report,confusion_matrix
import helper_functions as hf

In [21]:
print(confusion_matrix(val_labels,pred0))
print(classification_report(val_labels,pred0))

[[386  28]
 [130 218]]
              precision    recall  f1-score   support

           0       0.75      0.93      0.83       414
           1       0.89      0.63      0.73       348

    accuracy                           0.79       762
   macro avg       0.82      0.78      0.78       762
weighted avg       0.81      0.79      0.79       762



In [22]:
hf.calculate_results(y_true=val_labels,y_pred=pred0)

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [None]:
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='input_layer')
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output_layer')(x)

model1 = tf.keras.Model(inputs, outputs)

In [None]:
model1.summary()

In [None]:
model1.compile(loss='binary_crossentropy', 
               optimizer='Adam',
               metrics=['accuracy'])

In [None]:
history1 = model1.fit(x=train_sentences,y=train_labels,epochs=5,
                      validation_data=(val_sentences,val_labels),
                      callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                                experiment_name='nlp_model1')])

In [None]:
model1.evaluate(val_sentences, val_labels)

In [None]:
model1_prob_pred = model1.predict(val_sentences)

In [None]:
model1_preds = tf.squeeze(tf.round(model1_prob_pred))
model1_preds

In [None]:
model1_results = hf.calculate_results(val_labels, model1_preds)
model1_results

In [None]:
model1.summary()

In [None]:
embed_weights = model1.get_layer('embedding').get_weights()[0]
embed_weights.shape

In [None]:
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

# RNN 

### model2

In [None]:
tf.random.set_seed(42)
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(units=64)(x)
#x = tf.keras.layers.Dense(units=64, activation='relu')(x) can add
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model2 = tf.keras.Model(inputs, outputs, name='LSTM')

In [None]:
model2.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

In [None]:
history2 = model2.fit(x=train_sentences, y=train_labels, epochs=5,
                      validation_data=(val_sentences, val_labels),
                      callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                     experiment_name='nlp_model2')])

In [None]:
model2.evaluate(val_sentences,val_labels)

In [None]:
model2_pred_prob = model2.predict(val_sentences)

In [None]:
model2_pred = tf.squeeze(tf.round(model2_pred_prob))

In [None]:
model2_results = hf.calculate_results(val_labels,model2_pred)
model2_results

In [None]:
tf.random.set_seed(42)
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(units=64)(x)
x = tf.keras.layers.Dense(units=64, activation='relu')(x) 
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model2_test = tf.keras.Model(inputs, outputs, name='LSTM')

model2_test.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

hist2_test = model2.fit(x=train_sentences, y=train_labels, epochs=5,
                        validation_data=(val_sentences, val_labels),
                        callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                     experiment_name='nlp_model2_test')])

In [None]:
model2_test.evaluate(val_sentences, val_labels)

### GRU

In [None]:
tf.random.set_seed(42)
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(units=64,return_sequences=True)(x)
x = tf.keras.layers.GRU(64)(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model3 = tf.keras.Model(inputs, outputs, name='GRU')

In [None]:
model3.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

hist3 = model3.fit(x=train_sentences, y=train_labels, epochs=5,
                        validation_data=(val_sentences, val_labels),
                        callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                     experiment_name='nlp_model3')])

In [None]:
model3_pred_probs = model3.predict(val_sentences)
model3_pred = tf.squeeze(tf.round(model3_pred_probs))

In [None]:
hf.calculate_results(val_labels, model3_pred)

### Bidirectional

In [None]:
tf.random.set_seed(42)

inputs = tf.keras.layers.Input(shape=(1,), name='input_shape', dtype='string')
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=64))(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output_layer')(x)
model4 = tf.keras.Model(inputs, outputs, name='bidirectional')

In [None]:
model4.summary()

In [None]:
model4.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

In [None]:
hist4 = model4.fit(train_sentences, train_labels,
           epochs=5, validation_data=(val_sentences, val_labels),
           callbacks=[hf.create_tensorboard_callback(dir_name='logs', experiment_name='bidirectional_nlp')])

In [None]:
model4.evaluate(val_sentences, val_labels)

In [None]:
model4_probs = model4.predict(val_sentences)

In [None]:
model4_probs

In [None]:
model4_preds = tf.squeeze(tf.round(model4_probs))

In [None]:
model4_preds

In [None]:
hf.calculate_results(y_true=val_labels, y_pred=model4_preds)

In [None]:
tf.random.set_seed(42)

inputs = tf.keras.layers.Input(shape=(1,), name='input_layer', dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu', strides=1, padding='valid')(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output_layer')(x)
model5 = tf.keras.Model(inputs,outputs)


In [None]:
model5.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])


In [None]:
model5.summary()

In [None]:
hist5 = model5.fit(train_sentences,
                   train_labels,
                   epochs=5,
                   validation_data=(val_sentences, val_labels),
                   callbacks=[hf.create_tensorboard_callback(dir_name='logs',experiment_name='nlp_conv')])

In [None]:
model5.evaluate(val_sentences, val_labels)

In [None]:
model5_probs = model5.predict(val_sentences)

In [None]:
model5_preds = tf.squeeze(tf.round(model5_probs))

In [None]:
model5_preds

In [None]:
hf.calculate_results(y_pred=model5_preds,y_true=val_labels)

### Transfer Learning Models

In [27]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential

In [29]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
sentence_encoder_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name='USE')

In [None]:
model6 = Sequential()
model6.add(sentence_encoder_layer)
model6.add(Dense(units=64, activation='relu'))
model6.add(Dense(units=1, activation='sigmoid'))

In [None]:
model6.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

In [None]:
model6.summary()

In [None]:
tf.function(jit_compile=True)

In [None]:
hist6 = model6.fit(train_sentences,
                   train_labels,
                   epochs=5,
                   validation_data=(val_sentences, val_labels),
                   callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                             experiment_name='nse_nlp')])

### using 10 percent data

In [23]:
train10_per = train_df_shuffled[['text','target']].sample(frac=0.1, random_state=42)
train_sentences10_per = train10_per['text']
train_label10_per = train10_per['target']

In [24]:
len(train_sentences10_per), len(train_label10_per)

(761, 761)

In [30]:
model7 = Sequential()
sentence_encoder_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name='USE')
model7.add(sentence_encoder_layer)
model7.add(Dense(units=64, activation='relu'))
model7.add(Dense(units=1, activation='sigmoid'))

In [31]:
model7.compile(loss='binary_crossentropy',
               optimizer='Adam',
               metrics=['accuracy'])

In [32]:
hist7 = model7.fit(train_sentences10_per, 
                   train_label10_per,
                   epochs=5,
                   validation_data=(val_sentences, val_labels),
                   callbacks=[hf.create_tensorboard_callback(dir_name='logs',
                                                             experiment_name='10percent data')])

Saving TensorBoard log files to: logs/10percent data/20220806-172629
Epoch 1/5


UnknownError: Graph execution error:

JIT compilation failed.
	 [[{{node EncoderDNN/EmbeddingLookup/EmbeddingLookupUnique/embedding_lookup/mod}}]] [Op:__inference_train_function_14375]