In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
#from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
#STOPWORDS = set(stopwords.words('english'))

In [3]:
df = pd.read_csv('/content/drive/MyDrive/disaster_tweets.csv')

In [4]:
df.head()

Unnamed: 0,tweets,type_of_disaster,relevance,hashtags
0,nohfoynews tanker helicopter heads up to parad...,wildfires,1.0,['highparkfire']
1,evacuation center cache la poudre middle schoo...,wildfires,1.0,"['evacuation', 'colorado', 'wildfire']"
2,f degrees cooler tomorrow in noh central amp n...,wildfires,1.0,"['colorado', 'highparkfire', 'cowx', 'heat', '..."
3,fema has authorized the use of federal funds t...,wildfires,1.0,['highparkfire']
4,media large wildfire in n colorado prompts eva...,wildfires,1.0,"['media', 'politics', 'news']"


In [None]:
df.columns

Index(['tweets', 'type_of_disaster', 'relevance', 'hashtags'], dtype='object')

In [None]:
df.shape

(217378, 4)

In [None]:

df.dtypes

tweets               object
type_of_disaster     object
relevance           float64
hashtags             object
dtype: object

In [None]:
df.type_of_disaster.value_counts()

not disaster    75075
hurricane       49849
earthquake      22456
flood           18321
wildfires       15391
tornado         13838
storm            9649
floods           9174
meteor           1835
haze             1597
pandemic          146
volcano            47
Name: type_of_disaster, dtype: int64

In [13]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each tweet.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['tweets'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 136585 unique tokens.


In [14]:
X = tokenizer.texts_to_sequences(df['tweets'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (217378, 250)


In [15]:
Y = pd.get_dummies(df['type_of_disaster']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (217378, 12)


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(195640, 250) (195640, 12)
(21738, 250) (21738, 12)


In [9]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(12, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (Spatial  (None, 250, 100)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 12)                1212      
                                                                 
Total params: 5081612 (19.38 MB)
Trainable params: 5081612 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(np.argmax(Y, axis=1)), y=np.argmax(Y, axis=1))
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}



In [None]:
import os
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback,ReduceLROnPlateau
import json
import tensorflow as tf

In [None]:


# Define the checkpoint directory on your Google Drive
checkpoint_dir = "/content/drive/My Drive/checkpoint_folder/"

# Create a directory if it doesn't exist
os.makedirs(checkpoint_dir, exist_ok=True)

# Define a custom callback to save the history during training
'''class CustomCallback(Callback):
    def __init__(self):
        self.history = []

    def on_epoch_end(self, epoch, logs=None):
        self.history.append(logs)'''

class CustomModelCheckPoint(tf.keras.callbacks.Callback):
    def __init__(self,**kargs):
        super(CustomModelCheckPoint,self).__init__(**kargs)
        self.epoch_accuracy = {} # loss at given epoch
        self.epoch_loss = {} # accuracy at given epoch

    def on_epoch_begin(self,epoch, logs={}):
        # Things done on beginning of epoch.
        return

    def on_epoch_end(self, epoch, logs={}):
        # things done on end of the epoch
        self.epoch_accuracy[epoch] = logs.get("acc")
        self.epoch_loss[epoch] = logs.get("loss")
        self.model.save_weights("name-of-model-%d.h5" %epoch)

# Define other callbacks
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, "model_checkpoint_epoch{:02d}.h5"),  # Corrected formatting
    save_best_only=True,
    monitor='val_loss',
    mode='auto',
    save_weights_only=False
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,  # Change patience to 2 as you mentioned
    min_delta=0.0001
)
reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    verbose = 1,
    patience = 5,
    min_lr = 0.001
)

# Create an instance of the custom callback
custom_callback = CustomModelCheckPoint()

# Training parameters
epochs = 10
batch_size = 128


# Check for existing checkpoints in the checkpoint directory
existing_checkpoints = [
    f for f in os.listdir(checkpoint_dir) if f.startswith("model_checkpoint_epoch")
]

# If checkpoints exist, find the latest one
if existing_checkpoints:
    latest_checkpoint = max(existing_checkpoints)
    print(f"Loading the latest checkpoint: {latest_checkpoint}")
    model = tf.keras.models.load_model(os.path.join(checkpoint_dir, latest_checkpoint))

# Load the training history if it exists
'''history_filename = os.path.join(checkpoint_dir, "training_history.json")
loaded_history = {}
if os.path.exists(history_filename):
    with open(history_filename, 'r',encoding='utf-8') as json_file:
        loaded_history = json.load(json_file)'''
# Load the accuracy history if it exists
accuracy_history_filename = os.path.join(checkpoint_dir, "accuracy_history.json")
loaded_accuracy_history = {}
if os.path.exists(accuracy_history_filename):
    with open(accuracy_history_filename, 'r', encoding='utf-8') as json_file:
        loaded_accuracy_history = json.load(json_file)

# Load the loss history if it exists
loss_history_filename = os.path.join(checkpoint_dir, "loss_history.json")
loaded_loss_history = {}
if os.path.exists(loss_history_filename):
    with open(loss_history_filename, 'r', encoding='utf-8') as json_file:
        loaded_loss_history = json.load(json_file)


# Training loop for epochs
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training the model for one epoch
    history = model.fit(
        X_train,
        Y_train,
        epochs=1,  # Train for one epoch at a time
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[custom_callback,reduce_lr,early_stopping_callback],
        class_weight=class_weight_dict
    )

    # Append the newly trained history to the loaded history
    '''for key in history.history.keys():
        if key in loaded_history:
            loaded_history[key] += history.history[key]
        else:
            loaded_history[key] = history.history[key]

    # Save the updated training history as a JSON file
    with open(history_filename, 'w') as json_file:
      json.dump(str(loaded_history), json_file)

    # Save a checkpoint after each epoch'''
    with open(os.path.join(checkpoint_dir, "accuracy_history.json"), 'w') as json_file:
      json.dump(custom_callback.epoch_accuracy, json_file)

    with open(os.path.join(checkpoint_dir, "loss_history.json"), 'w') as json_file:
      json.dump(custom_callback.epoch_loss, json_file)
    model.save(os.path.join(checkpoint_dir, f"model_checkpoint_epoch{epoch + 1:02d}.h5"))




Epoch 1/10



You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
  88/1376 [>.............................] - ETA: 22:36 - loss: 0.0756 - accuracy: 0.9620

KeyboardInterrupt: ignored

In [None]:


accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.258
  Accuracy: 0.923


In [None]:
accr = new_model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1])) #this performs better for both val and test

Test set
  Loss: 0.268
  Accuracy: 0.921


In [None]:
model.save('/content/drive/MyDrive/disaster_model2.h5')

In [None]:
import tensorflow
from tensorflow.keras.models import load_model
new_model = tensorflow.keras.models.load_model('/content/drive/MyDrive/checkpoint_folder/model_checkpoint_epoch02.h5')



In [31]:
unique_class_labels = df['type_of_disaster'].unique().tolist()
print(unique_class_labels)


['wildfires', 'earthquake', 'floods', 'storm', 'meteor', 'haze', 'tornado', 'flood', 'volcano', 'pandemic', 'hurricane', 'not disaster']


In [32]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Load the model
new_model = tf.keras.models.load_model('/content/drive/MyDrive/checkpoint_folder/model_checkpoint_epoch02.h5')

# Ensure tokenizer and MAX_SEQUENCE_LENGTH are defined
# tokenizer = ...
# MAX_SEQUENCE_LENGTH = ...

tweet = 'The COVID-19 pandemic, also known as the coronavirus pandemic, is a global health crisis caused by the novel coronavirus SARS-CoV-2.'

# Text preprocessing
tweet = re.sub(r'<USER>|<URL>|<HASHTAG>', '', tweet)
tweet = re.sub(r'@[\w_]+', '', tweet)
url_pattern = r'https?://\S+|www\.\S+'
cleaned_tweet = re.sub(url_pattern, '', tweet)
tweet = re.sub('[^A-Za-z]+', ' ', tweet)

# Convert the processed tweet into a list
tweets_list = [tweet]

# Tokenize and pad the input
seq = tokenizer.texts_to_sequences(tweets_list)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

# Make predictions
pred = new_model.predict(padded)

# Assuming you have a list of unique class labels
# unique_class_labels = ...

# Print the model's output
print("Model Output (Probabilities for Each Class):")
print(pred)

# Get the predicted class index
predicted_class_index = np.argmax(pred)

# Debugging: Print the predicted class index
print("Predicted Class Index:", predicted_class_index)

# Ensure the predicted class index is within a valid range
if predicted_class_index < len(unique_class_labels):
    predicted_class_label = unique_class_labels[predicted_class_index]
    print("Predicted Class Label:", predicted_class_label)
else:
    print("Invalid Predicted Class Index:", predicted_class_index)




Model Output (Probabilities for Each Class):
[[1.3084675e-04 4.5478565e-04 1.9734350e-04 8.7405788e-05 2.4461201e-03
  4.5152632e-05 4.2304946e-05 2.9965417e-05 1.3819360e-03 9.9466574e-01
  5.1270599e-06 5.1328161e-04]]
Predicted Class Index: 9
Predicted Class Label: pandemic
