# Disaster Tweets Classification Model (Kaggle Competition)

## 1. Load Libraries and Data

In [23]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import nltk
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
# Fetch the NLTK corpora used below: the stop-word lists (read via
# `stopwords.words`) and WordNet (backing `WordNetLemmatizer`).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
# Folder holding train.csv / test.csv: the mounted Drive folder when the
# working directory is '/content', otherwise a local 'Datasets/' folder.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept
# because the data-loading cell reads it.
dir = (
    '/content/drive/Othercomputers/My PC/Projects/Kaggle/Disaster_Tweets_NLP/Datasets/'
    if os.getcwd() == '/content'
    else 'Datasets/'
)

In [9]:
# Read the training and test CSV files from the data folder selected above.
trainDf, testDf = (
    pd.read_csv(dir + file_name, sep=',')
    for file_name in ('train.csv', 'test.csv')
)

## 2. Extract Features

In [11]:
# Missing-value indicators: True where the tweet has no keyword / no location.
# (The former `= 0` initialisations were overwritten immediately and are gone.)
trainDf['keyword_flag'] = trainDf['keyword'].isnull()
trainDf['location_flag'] = trainDf['location'].isnull()

In [12]:
# Number of pieces obtained by splitting the tweet on single spaces.
# Consecutive spaces yield empty pieces, which are counted as well.
# (The former `= 0` initialisation was overwritten immediately and is gone.)
trainDf['text_len'] = trainDf['text'].str.split(' ').map(len)

In [18]:
# Count '@' and '#' characters per tweet as mention / hashtag features.
# (The former `= 0` initialisations were overwritten immediately and are gone.)
trainDf['mention'] = trainDf['text'].str.count('@')
trainDf['hashtag'] = trainDf['text'].str.count('#')

## 3. RNN Model

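The models in this section use only the tweet text as input; the features engineered in Section 2 are not fed to the networks.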

In [21]:
# Raw tweet text of the training and test sets, before any standardization.
X_input_raw, X_test_input_raw = trainDf['text'], testDf['text']

In [24]:
# Lemmatizer shared by the text standardization function below.
lemmatizer = WordNetLemmatizer()
# English stop-word list from the NLTK corpus, as a fresh list.
stop_words = [*stopwords.words(['english'])]

In [80]:
# Punctuation characters that are isolated as stand-alone tokens. '#' and '@'
# are not in the set, so hashtags and mentions stay attached to their word.
_PUNCTUATION = '!"$%&\'()*+-./,:;<=>?[\\]^_`{|}~'
# Translation table mapping every punctuation character c to ' c '.
_PUNCTUATION_SPACING = str.maketrans(
    {char: ' ' + char + ' ' for char in _PUNCTUATION}
)


def custom_standardization(x):
    """Lower-case a tweet, isolate punctuation and lemmatize each token.

    Steps:
      1. lower-case the text;
      2. surround every character of ``_PUNCTUATION`` with spaces so that it
         becomes a token of its own;
      3. split on any run of whitespace (spaces, tabs, newlines), which also
         discards empty tokens;
      4. pass every token through the module-level ``lemmatizer``;
      5. join the tokens back together with single spaces.

    Args:
        x: The raw tweet text (``str``).

    Returns:
        The standardized text as a single space-separated string; an empty
        string when ``x`` contains no tokens.
    """
    # A single translate() pass replaces the former per-call DataFrame with
    # one regex replacement per punctuation character.
    spaced = x.lower().translate(_PUNCTUATION_SPACING)
    # split() without an argument breaks on every kind of whitespace, so words
    # separated by a newline or tab are lemmatized individually instead of
    # being passed to the lemmatizer glued together.
    tokens = spaced.split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [81]:
# Standardize the raw tweets of both data sets with the same function.
X_input, X_test_input = (
    raw_text.map(custom_standardization)
    for raw_text in (X_input_raw, X_test_input_raw)
)

In [82]:
# Preview the standardized training tweets.
X_input

0       our deed are the reason of this #earthquake ma...
1                 forest fire near la ronge sask . canada
2       all resident asked to ' shelter in place ' are...
3       13 , 000 people receive #wildfires evacuation ...
4       just got sent this photo from ruby #alaska a s...
                              ...                        
7608    two giant crane holding a bridge collapse into...
7609    @aria _ ahrary @thetawniest the out of control...
7610    m1 . 94 [ 01 : 04 utc ] ? 5km s of volcano haw...
7611    police investigating after an e - bike collide...
7612    the latest : more home razed by northern calif...
Name: text, Length: 7613, dtype: object

In [83]:
# Turn each whitespace-separated token into an integer id.
# `standardize=None` because the text has already been standardized by
# `custom_standardization`; no vocabulary cap and no fixed sequence length.
# Uses the stable `keras.layers.TextVectorization` path instead of the
# deprecated `keras.layers.experimental.preprocessing` one.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=None,
    standardize=None,
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=None,
)

In [84]:
# Build the vocabulary from the train and test tweets together.
# `pd.concat` replaces `Series.append`, which is deprecated since pandas 1.4
# and removed in pandas 2.0.
vectorize_layer.adapt(np.array(pd.concat([X_input, X_test_input])))

In [None]:
# Check the preprocessing result by looking at the first 20 vocabulary
# entries instead of dumping the whole vocabulary into the cell output.
vectorize_layer.get_vocabulary()[:20]

### Basic RNN

In [85]:
# Baseline recurrent model: raw strings -> token ids -> 128-d embeddings ->
# SimpleRNN with 64 units -> single sigmoid output.
model_rnn = keras.Sequential([
    vectorize_layer,
    keras.layers.Embedding(
        # One embedding row per entry of the adapted vocabulary.
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=128,
        mask_zero=True,
    ),
    keras.layers.SimpleRNN(64),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [86]:
# Binary classification set-up: cross-entropy loss, Adam, accuracy metric.
model_rnn.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [87]:
# Training labels, taken from the `target` column.
Y_lab = trainDf.loc[:, 'target']

In [88]:
# Train the basic RNN for 5 epochs, holding out 10% of the rows for validation.
model_rnn.fit(
    x=np.array(X_input),
    y=Y_lab,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4efb73c910>

In [61]:
# Sigmoid outputs of the basic RNN for the standardized test tweets.
pred_nn_val = model_rnn.predict(x=np.array(X_test_input))

In [62]:
# Start the submission table from the test-set ids.
submission = pd.DataFrame(data=testDf['id'])

In [63]:
# Label a tweet 1 when the predicted value exceeds 0.5, otherwise 0.
submission['target'] = np.where(0.5 < pred_nn_val, 1, 0)

In [64]:
# Inspect the submission table (test id and predicted target).
submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [65]:
# Write the basic-RNN predictions without the DataFrame index column.
submission.to_csv(
    'predictions_test002.csv',
    index=False,
)

### LSTM


In [130]:
# LSTM variant: raw strings -> token ids -> 128-d embeddings -> LSTM with
# 128 units -> 32-unit ReLU layer -> single sigmoid output.
model_lstm = keras.Sequential([
    vectorize_layer,
    keras.layers.Embedding(
        # One embedding row per entry of the adapted vocabulary.
        input_dim=len(vectorize_layer.get_vocabulary()),
        output_dim=128,
        mask_zero=True,
    ),
    keras.layers.LSTM(128),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [131]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model_lstm.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [132]:
# Train the LSTM for 3 epochs, holding out 10% of the rows for validation.
model_lstm.fit(
    x=np.array(X_input),
    y=Y_lab,
    batch_size=512,
    epochs=3,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4ee1f0e310>

In [133]:
# Sigmoid outputs of the LSTM model for the standardized test tweets.
pred_lstm_val = model_lstm.predict(x=np.array(X_test_input))

In [134]:
# Build the LSTM submission: test ids plus a 0/1 label obtained by
# thresholding the predicted value at 0.5, written without the index column.
submission = pd.DataFrame(data=testDf['id'])
submission['target'] = np.where(0.5 < pred_lstm_val, 1, 0)
submission.to_csv(
    'predictions_test006.csv',
    index=False,
)