In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

# # importing libraries
import random
# import torch
# from transformers import BertTokenizer, BertModel
# from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Seed every random number generator this notebook draws from, so that a
# Restart & Run All is as repeatable as possible:
#   - Python's `random` module,
#   - NumPy (np.random.normal is used for the embedding-matrix fallback rows),
#   - TensorFlow/Keras (weight initialisation, dropout, batch shuffling).
# NOTE(review): bit-for-bit determinism on GPU may need extra TensorFlow settings.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

In [None]:
# Load the labelled training set and the test set from CSV files in the
# current working directory (paths are relative).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Sanity check: (rows, columns) of the training frame.
train_df.shape

(7613, 5)

In [None]:
# Class balance: number of distinct tweet ids for each target label.
train_df.groupby('target')['id'].nunique().reset_index()

Unnamed: 0,target,id
0,0,4342
1,1,3271


In [None]:
# Manual 80-20 train/validation split (disabled: the fit cells below pass validation_split=0.2 to Keras instead)
# n = round(len(train_df)*0.2)
# valdn_split_df = train_df.sample(n, replace = False)
# train_split_df = train_df[~(train_df.index.isin(valdn_split_df.index))]

# train_split_df.shape, valdn_split_df.shape

In [None]:
# Build one combined text field per tweet: text + keyword + location.
# Missing keyword/location values are replaced by empty strings first, so
# they cannot turn the concatenated string into NaN.
train_df[['keyword', 'location']] = train_df[['keyword', 'location']].fillna('')

train_df['text_conc'] = (
    train_df['text'] + " " + train_df['keyword'] + " " + train_df['location']
)


In [None]:
# Preview the first rows, including the new `text_conc` column.
train_df.head()

Unnamed: 0,id,keyword,location,text,target,text_conc
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


In [None]:
# Tokenise the combined tweet text into sequences of integer word ids.

training_tweets = np.array(train_df.text_conc)

# num_words=2000: per the Keras Tokenizer docs, texts_to_sequences only emits
# ids below num_words (the most frequent words); tokenizer.word_index still
# holds the full vocabulary.
tokenizer = Tokenizer(num_words=2000)

# Learn the vocabulary from the training tweets, then encode those same tweets.
tokenizer.fit_on_texts(training_tweets)
training_tweets_token = tokenizer.texts_to_sequences(training_tweets)

# NOTE(review): if a separate validation set is reintroduced, encode it with
# this already-fitted tokenizer instead of calling fit_on_texts on it again.

In [None]:
# Length (in tokens) of the longest encoded tweet; used as the padded
# sequence length and as the model input length.
input_dimension_size = max(map(len, training_tweets_token))

# Vocabulary size for a trainable Embedding layer. Tokenizer.word_index numbers
# words from 1 and id 0 is used for padding, so covering every word id needs
# len(word_index) + 1 rows (Keras: input_dim = maximum integer index + 1).
embedding_input_dimension = len(tokenizer.word_index) + 1

In [None]:
# Show the vocabulary size that will be passed to the trainable Embedding layer.
embedding_input_dimension

24590

In [None]:
# Pad every token-id sequence to the length of the longest tweet so the
# result is a rectangular array the models can consume.
# (pad_sequences defaults — padding position and fill value — are left as-is.)
X_train = sequence.pad_sequences(training_tweets_token, maxlen=input_dimension_size)


In [None]:
# Longest encoded tweet length = padded sequence length.
input_dimension_size

32

In [None]:
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from TensorFlow/gensim.
warnings.filterwarnings(action='ignore')

# Both Word2Vec models are trained on `training_tweets_token`, i.e. on lists of
# integer word ids, so the resulting vocabulary keys are ints, not word strings.
#
# seed + workers=1: gensim only gives repeatable vectors when the seed is fixed
# and training is single-threaded; the corpus is small, so the cost is minor.
w2v_params = dict(min_count=1, vector_size=32, window=5,
                  seed=random_seed, workers=1)

# CBOW model (gensim default, sg=0)
model1 = Word2Vec(training_tweets_token, **w2v_params)

# Skip-gram model (sg=1); this is the one used to fill the embedding matrix.
model2 = Word2Vec(training_tweets_token, sg=1, **w2v_params)

In [None]:
# Look up the CBOW vectors for all token ids of the 4th tweet at once:
# one row per token, one column per embedding dimension.
model1.wv[training_tweets_token[3]].shape

(7, 32)

In [None]:
# Number of distinct words the tokenizer saw (full vocabulary, not capped by num_words).
len(tokenizer.word_index)

24590

In [None]:
# Dimensionality of the trained Word2Vec vectors (set via vector_size=32).
# Read from the model so the cell runs on a fresh kernel without relying on
# variables that are not defined anywhere in this notebook.
model1.wv.vector_size

32

In [None]:
# Create the initial weights for the Embedding layers: row i holds the
# skip-gram vector of the word whose tokenizer id is i.
#
# The Word2Vec models were trained on sequences of integer word ids, so their
# vocabulary is keyed by id, not by word string. Looking vectors up by the word
# string raises KeyError for every word and silently fills the whole matrix
# with random noise; the lookup therefore has to use the id.
embedding_dim = 32
vocab_size = 2000  # must match Tokenizer(num_words=...) and the Embedding layers
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Ids start at 1; row 0 (padding) stays all-zero.
for token_id in range(1, min(vocab_size, len(tokenizer.word_index) + 1)):
    # Explicit membership test: indexing KeyedVectors with an unknown int can
    # fall back to positional lookup and return the wrong word's vector.
    if token_id in model2.wv.key_to_index:
        embedding_matrix[token_id] = model2.wv[token_id]
    else:
        # Random initialisation for ids Word2Vec has no vector for.
        embedding_matrix[token_id] = np.random.normal(scale=0.6, size=(embedding_dim,))


In [None]:
# Inspect the initial embedding weights (NumPy abbreviates the printout).
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.55828361, -0.78089174, -0.51776418, ..., -0.79074933,
        -1.02522311, -0.01200475],
       [-0.26703613,  1.12721056,  0.54207351, ...,  1.04150358,
         0.56214234, -0.66929644],
       ...,
       [ 0.17948644, -0.72791895, -0.44754609, ..., -0.21023923,
        -0.87323327, -0.20180429],
       [ 0.26004173, -1.29245932,  0.89846398, ..., -0.03782136,
         0.1691328 , -0.3492742 ],
       [ 0.11799986, -0.47488726,  0.22269311, ..., -0.10485256,
         0.33476679, -1.32304453]])

In [None]:
# Stacked bidirectional GRU classifier, embedding initialised from `embedding_matrix`.
#
# Three Bi-GRU layers (128 units per direction) with dropout rising 0.1 -> 0.3.
# The first two return the full sequence so the next recurrent layer can consume
# it; the last returns only its final state, which feeds the sigmoid output.

model_bid_gru = keras.Sequential([
    layers.Input((input_dimension_size,)),
    layers.Embedding(2000, 32, weights=[embedding_matrix]),
    layers.Bidirectional(layers.GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    layers.Bidirectional(layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    layers.Bidirectional(layers.GRU(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = optimizers.Adam(learning_rate=0.001)
model_bid_gru.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_bid_gru.summary()


In [None]:
# Fit the bidirectional GRU model.

y_train = np.array(train_df.target)

EPOCHS = 32
BATCH = 1024

# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights=True the model keeps the weights of the last epoch run,
# i.e. after the validation loss has already started to get worse.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows, taken before shuffling — confirm row order does not bias it.
model_bid_gru.fit(X_train, y_train,
          batch_size=BATCH,
          epochs=EPOCHS,
          validation_split=0.2,
          verbose = 1,
          callbacks = [early_stop])

Epoch 1/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 6s/step - accuracy: 0.5653 - loss: 0.6788 - val_accuracy: 0.6159 - val_loss: 0.6578
Epoch 2/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 5s/step - accuracy: 0.6337 - loss: 0.6450 - val_accuracy: 0.6290 - val_loss: 0.6514
Epoch 3/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5s/step - accuracy: 0.6474 - loss: 0.6305 - val_accuracy: 0.6395 - val_loss: 0.6443
Epoch 4/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5s/step - accuracy: 0.6669 - loss: 0.6109 - val_accuracy: 0.6724 - val_loss: 0.6307
Epoch 5/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5s/step - accuracy: 0.6783 - loss: 0.5920 - val_accuracy: 0.6612 - val_loss: 0.6214
Epoch 6/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5s/step - accuracy: 0.6955 - loss: 0.5745 - val_accuracy: 0.6756 - val_loss: 0.6114
Epoch 7/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c5d85259a50>

In [None]:
# Number of padded training sequences.
len(X_train)

7613

In [None]:
# Stacked bidirectional GRU classifier with an embedding learned from scratch
# (no pretrained weights; vocabulary size taken from `embedding_input_dimension`).
#
# NOTE(review): this rebinds `model_bid_gru`, so the model built and trained
# under the same name earlier in the notebook is no longer reachable afterwards.

model_bid_gru = keras.Sequential([
    layers.Input((input_dimension_size,)),
    layers.Embedding(embedding_input_dimension, 32),
    layers.Bidirectional(layers.GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    layers.Bidirectional(layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    layers.Bidirectional(layers.GRU(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = optimizers.Adam(learning_rate=0.001)
model_bid_gru.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_bid_gru.summary()


In [None]:
# Fit the bidirectional GRU model.

y_train = np.array(train_df.target)

EPOCHS = 32
BATCH = 1024

# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights=True the model keeps the weights of the last epoch run,
# i.e. after the validation loss has already started to get worse.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows, taken before shuffling — confirm row order does not bias it.
model_bid_gru.fit(X_train, y_train,
          batch_size=BATCH,
          epochs=EPOCHS,
          validation_split=0.2,
          verbose = 1,
          callbacks = [early_stop])

Epoch 1/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 6s/step - accuracy: 0.5281 - loss: 0.6863 - val_accuracy: 0.5345 - val_loss: 0.6848
Epoch 2/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 5s/step - accuracy: 0.6003 - loss: 0.6670 - val_accuracy: 0.6034 - val_loss: 0.6350
Epoch 3/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5s/step - accuracy: 0.6793 - loss: 0.5894 - val_accuracy: 0.7334 - val_loss: 0.5571
Epoch 4/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 5s/step - accuracy: 0.7740 - loss: 0.4922 - val_accuracy: 0.7590 - val_loss: 0.5166
Epoch 5/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 5s/step - accuracy: 0.8176 - loss: 0.4200 - val_accuracy: 0.7656 - val_loss: 0.4947
Epoch 6/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 5s/step - accuracy: 0.8266 - loss: 0.3984 - val_accuracy: 0.7676 - val_loss: 0.4882
Epoch 7/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c5d6296b550>

In [None]:
# Stacked (unidirectional) GRU classifier, embedding initialised from `embedding_matrix`.
#
# Three GRU layers of 128 units with dropout rising 0.1 -> 0.3; only the last
# one drops the time axis and hands a single vector to the sigmoid output.

model_gru = keras.Sequential([
    layers.Input((input_dimension_size,)),
    layers.Embedding(2000, 32, weights=[embedding_matrix]),
    layers.GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.GRU(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = optimizers.Adam(learning_rate=0.001)
model_gru.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_gru.summary()


In [None]:
# Fit the stacked GRU model.

y_train = np.array(train_df.target)

EPOCHS = 32
BATCH = 1024

# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights=True the model keeps the weights of the last epoch run,
# i.e. after the validation loss has already started to get worse.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows, taken before shuffling — confirm row order does not bias it.
model_gru.fit(X_train, y_train,
          batch_size=BATCH,
          epochs=EPOCHS,
          validation_split=0.2,
          verbose = 1,
          callbacks = [early_stop])

Epoch 1/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 4s/step - accuracy: 0.5574 - loss: 0.6841 - val_accuracy: 0.6297 - val_loss: 0.6492
Epoch 2/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3s/step - accuracy: 0.6302 - loss: 0.6465 - val_accuracy: 0.6546 - val_loss: 0.6327
Epoch 3/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step - accuracy: 0.6514 - loss: 0.6342 - val_accuracy: 0.6415 - val_loss: 0.6305
Epoch 4/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step - accuracy: 0.6727 - loss: 0.6159 - val_accuracy: 0.6743 - val_loss: 0.6107
Epoch 5/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.6823 - loss: 0.6006 - val_accuracy: 0.6796 - val_loss: 0.6040
Epoch 6/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step - accuracy: 0.6977 - loss: 0.5816 - val_accuracy: 0.6986 - val_loss: 0.5920
Epoch 7/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c5d83ed4d10>

In [None]:
# Stacked bidirectional LSTM classifier, embedding initialised from `embedding_matrix`.
#
# Three Bi-LSTM layers (128 units per direction) with dropout rising 0.1 -> 0.3.
# The first two return the full sequence; the last returns only its final
# state, which feeds the sigmoid output.

model_bid = keras.Sequential([
    layers.Input((input_dimension_size,)),
    layers.Embedding(2000, 32, weights=[embedding_matrix]),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    layers.Bidirectional(layers.LSTM(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = optimizers.Adam(learning_rate=0.001)
model_bid.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_bid.summary()


In [None]:
# Fit the bidirectional LSTM model.

y_train = np.array(train_df.target)

EPOCHS = 32
BATCH = 1024

# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights=True the model keeps the weights of the last epoch run,
# i.e. after the validation loss has already started to get worse.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                           restore_best_weights=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows, taken before shuffling — confirm row order does not bias it.
model_bid.fit(X_train, y_train,
          batch_size=BATCH,
          epochs=EPOCHS,
          validation_split=0.2,
          verbose = 1,
          callbacks = [early_stop])

Epoch 1/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 7s/step - accuracy: 0.5446 - loss: 0.6813 - val_accuracy: 0.6605 - val_loss: 0.6275
Epoch 2/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 6s/step - accuracy: 0.6476 - loss: 0.6353 - val_accuracy: 0.6750 - val_loss: 0.6120
Epoch 3/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 6s/step - accuracy: 0.6840 - loss: 0.6032 - val_accuracy: 0.6855 - val_loss: 0.5960
Epoch 4/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 7s/step - accuracy: 0.6920 - loss: 0.5882 - val_accuracy: 0.6967 - val_loss: 0.5895
Epoch 5/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 6s/step - accuracy: 0.7029 - loss: 0.5686 - val_accuracy: 0.7006 - val_loss: 0.5854
Epoch 6/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 6s/step - accuracy: 0.7232 - loss: 0.5455 - val_accuracy: 0.7104 - val_loss: 0.5751
Epoch 7/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c5d74284d10>

In [None]:
# Stacked (unidirectional) LSTM classifier, embedding initialised from `embedding_matrix`.
#
# Three LSTM layers of 128 units with dropout rising 0.1 -> 0.3; only the last
# one drops the time axis and hands a single vector to the sigmoid output.

model = keras.Sequential([
    layers.Input((input_dimension_size,)),
    layers.Embedding(2000, 32, weights=[embedding_matrix]),
    layers.LSTM(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1),
    layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.LSTM(128, return_sequences=False, dropout=0.3, recurrent_dropout=0.3),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Fit the stacked LSTM model.

EPOCHS = 32
BATCH = 1024

y_train = np.array(train_df.target)

# Stop once val_loss has failed to improve for 5 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# restore_best_weights=True the model keeps the weights of the last epoch run,
# i.e. after the validation loss has already started to get worse.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                           restore_best_weights=True)

# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows, taken before shuffling — confirm row order does not bias it.
model.fit(X_train, y_train,
          batch_size=BATCH,
          epochs=EPOCHS,
          validation_split=0.2,
          verbose = 1,
          callbacks = [early_stop])

Epoch 1/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3s/step - accuracy: 0.5686 - loss: 0.6845 - val_accuracy: 0.5502 - val_loss: 0.6735
Epoch 2/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2s/step - accuracy: 0.6219 - loss: 0.6488 - val_accuracy: 0.6638 - val_loss: 0.6175
Epoch 3/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.6581 - loss: 0.6206 - val_accuracy: 0.6743 - val_loss: 0.6031
Epoch 4/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.6735 - loss: 0.6049 - val_accuracy: 0.6914 - val_loss: 0.5978
Epoch 5/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.7006 - loss: 0.5848 - val_accuracy: 0.7026 - val_loss: 0.5831
Epoch 6/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2s/step - accuracy: 0.6902 - loss: 0.5848 - val_accuracy: 0.6888 - val_loss: 0.5840
Epoch 7/32
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c5d62cde650>