In [None]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import pandas as pd

# Load English language model
nlp = spacy.load("en_core_web_sm")


df = pd.read_csv("tweets.csv")

# Function to extract features from text
def extract_features(text):
    doc = nlp(text)
    # Extract verbs, their lemmas, and corresponding dependency tags
    features = " ".join([token.lemma_ for token in doc if token.pos_ == "VERB"])
    return features

# Preprocess the text column in your dataset
df['processed_text'] = df['text'].apply(lambda x: extract_features(x))

# Extract features and labels from your disaster dataset
X_train = df['processed_text']
y_event_type = df['keyword']

# Create a pipeline with TF-IDF vectorizer and SVM classifier for event types
pipeline_event_type = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer=lambda x: x)),
    ("svm", SVC(kernel="linear")),
])

# Train the event type classifier
pipeline_event_type.fit(X_train, y_event_type)

# Function to predict event type in text
def predict_event_type(text):
    features = extract_features(text)
    # Predict event type using the trained model
    predicted_event_type = pipeline_event_type.predict([features])[0]
    return predicted_event_type

# Example text
text = "There is a wildfire outbreak in California."

# Predict event type in the example text
predicted_event_type = predict_event_type(text)

# Print the predicted event type
print("Predicted Event Type:", predicted_event_type)


Predicted Event Type: collision


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load English language model
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv("tweets.csv")

# Function to extract features from text using spaCy
def extract_features(text):
    doc = nlp(text)
    # Extract verbs, their lemmas, and corresponding dependency tags
    features = " ".join([token.lemma_ for token in doc if token.pos_ == "VERB"])
    return features

# Preprocess the text column in your dataset
df['processed_text'] = df['text'].apply(lambda x: extract_features(x))

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['keyword'], test_size=0.2, random_state=42)

# SVM model
svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer=lambda x: x)),
    ("svm", SVC(kernel="linear")),
])

svm_pipeline.fit(X_train, y_train)

# Evaluate SVM model
svm_predictions = svm_pipeline.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

# LSTM model
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max([len(sequence) for sequence in X_train_sequences])
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

lstm_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
    LSTM(units=64),
    Dense(units=len(df['keyword'].unique()), activation='softmax')
])

lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate LSTM model
lstm_predictions = np.argmax(lstm_model.predict(X_test_padded), axis=-1)
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_predictions))


SVM Classification Report:
                       precision    recall  f1-score   support

               ablaze       0.80      0.44      0.57         9
             accident       0.00      0.00      0.00         7
           aftershock       0.29      0.13      0.18        15
  airplane%20accident       0.06      0.30      0.10        10
            ambulance       0.00      0.00      0.00         6
          annihilated       0.25      0.12      0.17         8
         annihilation       0.00      0.00      0.00        14
           apocalypse       0.00      0.00      0.00         6
           armageddon       0.00      0.00      0.00         5
                 army       0.00      0.00      0.00        17
                arson       0.00      0.00      0.00        12
             arsonist       0.00      0.00      0.00         2
               attack       0.00      0.00      0.00        19
             attacked       0.21      0.64      0.32        11
            avalanche      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10


UnimplementedError: Graph execution error:

Detected at node Cast_1 defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-2-a93049e0007e>", line 64, in <cell line: 64>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1155, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1249, in compute_metrics

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 620, in update_state

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/metrics_utils.py", line 77, in decorated

  File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 140, in update_state_fn

  File "/usr/local/lib/python3.10/dist-packages/keras/src/metrics/base_metric.py", line 708, in update_state

Cast string to float is not supported
	 [[{{node Cast_1}}]] [Op:__inference_train_function_3155]

In [None]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Load English language model
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv("tweets.csv")

# Function to preprocess and extract features from text
def preprocess_text(text):
    doc = nlp(text)
    # Extract verbs, their lemmas, and corresponding dependency tags
    features = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    return " ".join(features)

# Preprocess the text column in your dataset
df['processed_text'] = df['text'].apply(preprocess_text)

# Extract features and labels from your disaster dataset
X = df['processed_text']
y = df['keyword']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenize the text
max_words = 1000
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Define CNN model
embedding_dim = 50
num_filters = 64
kernel_size = 5

cnn_model = Sequential()
cnn_model.add(Embedding(max_words, embedding_dim, input_length=max_len))
cnn_model.add(Conv1D(num_filters, kernel_size, activation='relu'))
cnn_model.add(MaxPooling1D())
cnn_model.add(Flatten())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer

# Compile CNN model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train CNN model
cnn_model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate CNN model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_padded, y_test)

# Print CNN accuracy
print("CNN Accuracy:", cnn_accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CNN Accuracy: 0.10466139018535614


In [None]:
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Load English language model
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv("tweets.csv")

# Function to preprocess and extract features from text
def preprocess_text(text):
    doc = nlp(text)
    # Extract verbs, their lemmas, and corresponding dependency tags
    features = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    return " ".join(features)

# Preprocess the text column in your dataset
df['processed_text'] = df['text'].apply(preprocess_text)

# Extract features and labels from your disaster dataset
X = df['processed_text']
y = df['keyword']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenize the text
max_words = 10000  # Increase max_words
max_len = 150
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Define CNN model
embedding_dim = 100  # Increase embedding_dim
num_filters = 128  # Increase num_filters
kernel_size = 5

cnn_model = Sequential()
cnn_model.add(Embedding(max_words, embedding_dim, input_length=max_len))
cnn_model.add(Conv1D(num_filters, kernel_size, activation='relu'))
cnn_model.add(MaxPooling1D())
cnn_model.add(Conv1D(num_filters, kernel_size, activation='relu'))  # Add another convolutional layer
cnn_model.add(MaxPooling1D())
cnn_model.add(Flatten())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))  # Add dropout layer for regularization
cnn_model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer

# Compile CNN model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train CNN model
cnn_model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate CNN model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_padded, y_test)

# Print CNN accuracy
print("CNN Accuracy:", cnn_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CNN Accuracy: 0.010554090142250061


In [None]:
!pip install simplet5




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("tweets.csv", usecols = ["keyword","text"])

In [None]:
df.head()

Unnamed: 0,keyword,text
0,ablaze,"Communal violence in Bhainsa, Telangana. ""Ston..."
1,ablaze,Telangana: Section 144 has been imposed in Bha...
2,ablaze,Arsonist sets cars ablaze at dealership https:...
3,ablaze,Arsonist sets cars ablaze at dealership https:...
4,ablaze,"""Lord Jesus, your love brings freedom and pard..."


In [None]:
df= df.rename(columns={"keyword":"target_text", "text":"source_text"})
df = df[['source_text','target_text']]

In [None]:
df.head()

Unnamed: 0,source_text,target_text
0,"Communal violence in Bhainsa, Telangana. ""Ston...",ablaze
1,Telangana: Section 144 has been imposed in Bha...,ablaze
2,Arsonist sets cars ablaze at dealership https:...,ablaze
3,Arsonist sets cars ablaze at dealership https:...,ablaze
4,"""Lord Jesus, your love brings freedom and pard...",ablaze


In [None]:
df["source_text"] = "event_type : " + df["source_text"]

In [None]:
df.head()

Unnamed: 0,source_text,target_text
0,"event_type : Communal violence in Bhainsa, Tel...",ablaze
1,event_type : Telangana: Section 144 has been i...,ablaze
2,event_type : Arsonist sets cars ablaze at deal...,ablaze
3,event_type : Arsonist sets cars ablaze at deal...,ablaze
4,"event_type : ""Lord Jesus, your love brings fre...",ablaze


In [None]:
from sklearn.model_selection import train_test_split
train_df , test_df = train_test_split(df,train_size = 0.2)



In [None]:
train_df

Unnamed: 0,source_text,target_text
3619,event_type : Point about historic figure/monum...,derail
4088,event_type : Urgent! GARN are devastated to le...,devastated
4042,event_type : Largest bombs can directly be con...,detonate
7024,"event_type : the info, the thoughts, suffering...",inundation
9383,"event_type : ""Yes, the smoke is a problem but ...",smoke
...,...,...
9727,event_type : The latest in chic suicide bombin...,suicide%20bombing
1974,event_type : KANGAROO ISLAND’S UNIQUE WILDLIFE...,bush%20fires
758,event_type : 12F22C99 :Battle ID I need backup...,battle
7761,"event_type : Seeing this Warren cause evolve, ...",obliterate


In [None]:
# import
from simplet5 import SimpleT5

# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 and CodeT5 models)
model.from_pretrained("t5","t5-small")

# train
model.train(train_df=train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=test_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512,
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 5,
            use_gpu = True,
            outputdir = "outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )




INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Downloading:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# import
from simplet5 import SimpleT5

# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 and CodeT5 models)
model.from_pretrained("t5","t5-small")


In [None]:
# load trained T5 model
model.load_model("t5","/content/drive/MyDrive/NLP models/5_epoch_run", use_gpu=False)

# predict
model.predict("event_type : Peninsular river basins in India are more likely to face widespread flooding than transboundary rivers.")

['flooding']

In [None]:
# predict
model.predict("event_type : In the first months of 2022, extreme heat and drought caused widespread wildfires in Corrientes province, northeastern Argentina. More than 520,000 hectares have burned, killing countless animals and destroying crops and pastures.")

['wildfire']

In [None]:
# predict
model.predict("event_type : On November 21, 2022, an earthquake measuring 6.5 magnitudes hit the Cianjur District and much of West Java Province. The hundreds of aftershocks hampered recovery after this natural disaster of 2022. Some people were buried in buildings that collapsed or were carried away by a landslide. Thus, the rescue operations shifted to search and recovery only after a few days.")

['aftershocks']

In [None]:
# predict
model.predict("event_type : Most of the United States experiences tornadoes from April to June. However, several severe storm systems had already impacted the southern United States before April, causing a series of natural disasters in March 2022.")

['tornadoes']

In [None]:
/content/outputs/simplet5-epoch-4-train-loss-0.9174-val-loss-0.9362