# Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Dropout,LSTM,SimpleRNN,Embedding,Bidirectional,LSTM,GlobalMaxPool1D
from keras.models import Sequential

# Import CSV files and clean missing value

In [2]:
# Directory holding the Kaggle CSV files; change it here instead of editing every path.
DATA_DIR = 'C:/Users/yanli/Downloads'

data_train = pd.read_csv(f'{DATA_DIR}/train.csv')
data_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [3]:
data_train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
data_test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

From the outputs of In [3] and In [4], we can see that both the train and test datasets contain missing values, all of them in the 'location' and 'keyword' columns.

Since our prediction model only uses the tweet text, we will simply drop these two features instead of imputing them. We will drop the 'id' feature too, because it carries no predictive information.

In [5]:
data_train = data_train.drop(['id','keyword','location'],axis = 1)

In [6]:
data_test = data_test.drop(['id','keyword','location'],axis = 1)

Train and test datasets after dropping columns

In [7]:
data_train.head(10)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
5,#RockyFire Update => California Hwy. 20 closed...,1
6,#flood #disaster Heavy rain causes flash flood...,1
7,I'm on top of the hill and I can see a fire in...,1
8,There's an emergency evacuation happening now ...,1
9,I'm afraid that the tornado is coming to our a...,1


In [8]:
data_test.head(10)

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
5,We're shaking...It's an earthquake
6,They'd probably still show more life than Arse...
7,Hey! How are you?
8,What a nice hat?
9,Fuck off!


# Design Natural Language Processing functions

## 1. Tokenizing the string
## 2. Converting characters to lowercase
## 3. Removing stop words and punctuations
## 4. Stemming or lemmatization

Import NLP Packages

In [10]:
import nltk
import re
import string
from nltk.tokenize import RegexpTokenizer,TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yanli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yanli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Function that removes unnecessary characters such as URLs, @mentions, hashtag symbols and digits

In [11]:
def RemoveUnnecessaryChar(sentence, out=None):
    """Lower-case every text and strip URLs, '{html}', @handles, '#' and digits.

    Parameters
    ----------
    sentence : iterable
        Raw texts (for example the 'text' column). Each item is converted
        with ``str()`` before cleaning.
    out : list, optional
        List that receives one cleaned string per input item. When omitted,
        the notebook-level list ``sent`` is used, so existing calls of the
        form ``sent = []; RemoveUnnecessaryChar(texts)`` keep working.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``sent``).
    """
    if out is None:
        out = sent  # legacy accumulator; must be created before the call
    for raw_text in sentence:
        cleaned = str(raw_text).lower()
        # Remove only the link itself. A greedy `.*` here would also delete
        # every word that follows the URL on the same line.
        cleaned = re.sub(r'https?://\S*', '', cleaned)
        cleaned = cleaned.replace('{html}', "")
        # Strip handles before digits/underscores are touched so that a
        # handle such as @user_12 disappears completely.
        cleaned = re.sub(r'@\w*', '', cleaned)
        cleaned = re.sub(r'#', '', cleaned)      # keep the hashtag word, drop '#'
        cleaned = re.sub(r'[0-9]', '', cleaned)  # drop digits
        out.append(cleaned)

### Function that tokenizes sentences. The goal of this function is to separate each sentence into its words and treat every word as an individual element

In [12]:
def TokenizeSentence(sentence, out=None):
    """Tokenize every text in ``sentence`` with NLTK's ``TweetTokenizer``.

    Parameters
    ----------
    sentence : iterable of str
        Texts to tokenize. The argument itself is iterated; the function no
        longer silently reads the notebook-level list ``sent`` instead.
    out : list, optional
        List that receives one token list per input text. When omitted, the
        notebook-level list ``tokenized_sent`` is used, which keeps existing
        calls working.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``tokenized_sent``).
    """
    if out is None:
        out = tokenized_sent  # legacy accumulator; must exist before the call
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    for text in sentence:
        out.append(tokenizer.tokenize(text))

### Function that finds and deletes stopwords

In [13]:
def stopwordsSentence(sent, out=None, stop_words=None):
    """Drop stop words, punctuation-only tokens and tokens shorter than 3 chars.

    Parameters
    ----------
    sent : iterable of list of str
        Tokenized sentences.
    out : list, optional
        List that receives one filtered token list per sentence. When
        omitted, the notebook-level list ``formatted_sent`` is used.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stopwords_eng``
        is used.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``formatted_sent``).
    """
    if out is None:
        out = formatted_sent  # legacy accumulator; must exist before the call
    if stop_words is None:
        stop_words = stopwords_eng
    stop_set = set(stop_words)  # constant-time membership instead of a list scan
    punctuation_chars = set(string.punctuation)
    for tokens in sent:
        kept = []
        for word in tokens:
            if len(word) <= 2 or word in stop_set:
                continue
            # `word in string.punctuation` is a substring test and lets tokens
            # such as '...' or '!!!' through; check every character instead.
            if all(ch in punctuation_chars for ch in word):
                continue
            kept.append(word)
        out.append(kept)

### Function that performs lemmatization

In [14]:
def lemmatizeSentence(sent, out=None):
    """Lemmatize every token of every sentence with ``WordNetLemmatizer``.

    Parameters
    ----------
    sent : iterable of list of str
        Tokenized sentences.
    out : list, optional
        List that receives one lemmatized token list per sentence. When
        omitted, the notebook-level list ``lemma_sent`` is used, which keeps
        existing calls working.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``lemma_sent``).
    """
    if out is None:
        out = lemma_sent  # legacy accumulator; must exist before the call
    lemma = WordNetLemmatizer()
    for tokens in sent:
        out.append([lemma.lemmatize(word) for word in tokens])

### Function that generates the final sentence for the model

In [15]:
def finalSentence(sentence1, out=None):
    """Join each token list back into one space-separated string.

    Parameters
    ----------
    sentence1 : iterable of list
        Token lists; every token is converted with ``str()`` before joining.
    out : list, optional
        List that receives one joined string per token list. When omitted,
        the notebook-level list ``final_sentence_list`` is used, which keeps
        existing calls working.

    Returns
    -------
    None
        Results are appended to ``out`` (or ``final_sentence_list``).
    """
    if out is None:
        out = final_sentence_list  # legacy accumulator; must exist before the call
    for tokens in sentence1:
        out.append(' '.join(str(word) for word in tokens))

### Start sanitizing the 'text' column of the training set

In [16]:
sent = []
RemoveUnnecessaryChar(data_train['text'])

Test first step: Unnecessary character removing

In [17]:
print(sent[1])

forest fire near la ronge sask. canada


In [18]:
tokenized_sent = []
TokenizeSentence(sent)

Test the second step: Tokenize sentences

In [19]:
tokenized_sent[1]

['forest', 'fire', 'near', 'la', 'ronge', 'sask', '.', 'canada']

Set stopwords and punctuations for the third step: Remove stop words and punctuations

In [20]:
# Notebook-level list of English stop words; stopwordsSentence() reads
# `stopwords_eng` as a global, so this cell must run before that function is called.
stopwords_eng = stopwords.words('english')
print('English Stop Words :\n')
print(stopwords_eng)
print('\nPunctuations  :\n')
# string.punctuation is the second filter applied by stopwordsSentence().
print(string.punctuation)

English Stop Words :

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', '

In [21]:
formatted_sent = []
stopwordsSentence(tokenized_sent)

Test the third step: Remove stop words and punctuations

In [22]:
formatted_sent[1]

['forest', 'fire', 'near', 'ronge', 'sask', 'canada']

In [23]:
lemma_sent = []
lemmatizeSentence(formatted_sent)

Test the fourth step: Lemmatization

In [24]:
lemma_sent[1]

['forest', 'fire', 'near', 'ronge', 'sask', 'canada']

Generate the final sentence

In [25]:
final_sentence_list = []
finalSentence(lemma_sent)

In [26]:
final_sentence_list[1]

'forest fire near ronge sask canada'

In [27]:
# Attach the cleaned text; final_sentence_list holds one entry per training row.
data_train['FinalText'] = final_sentence_list
# Drop the raw 'text' column to get a clean dataset.
# (A bare `data_train.head()` used to sit between these two statements; it was
# not the last expression of the cell, so it displayed nothing and was removed.)
data_train = data_train.drop(['text'],axis = 1)

In [29]:
data_train.head()

Unnamed: 0,target,FinalText
0,1,deed reason earthquake may allah forgive
1,1,forest fire near ronge sask canada
2,1,resident asked shelter place notified officer ...
3,1,people receive wildfire evacuation order calif...
4,1,got sent photo ruby alaska smoke wildfire pour...


# Convert Finaltext into a numerical vector

In [31]:
x_train = data_train['FinalText']
y_train = data_train['target']

In [32]:
x_train_array = x_train.to_numpy()
y_train_array = y_train.to_numpy()

### Import the TensorFlow package to convert our text array into numbers

In [34]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
tf.config.run_functions_eagerly(True)

In [35]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_array,y_train_array))

In [36]:
train_dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [37]:
for text,label in train_dataset.take(1):
    print('Text: ',text.numpy())
    print('Label: ',label.numpy())

Text:  b'deed reason earthquake may allah forgive'
Label:  1


In [38]:
# Size of the shuffle buffer passed to Dataset.shuffle() in the next cell.
# NOTE(review): tf.data only shuffles uniformly when the buffer is at least as
# large as the dataset — confirm the training set has no more than 3000 rows.
BUFFER_SIZE = 3000
# Number of examples per batch passed to Dataset.batch().
BATCH_SIZE = 128

In [39]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [40]:
# Upper bound on the vocabulary size kept by the TextVectorization layer.
VOCAB_SIZE = 12000


# This layer is only used by the LSTM and GRU architectures to obtain a numerical vector representation of words.
# For BERT we will use a BERT-specific vectorization technique.

encoder = tf.keras.layers.TextVectorization(max_tokens = VOCAB_SIZE)
# Build the vocabulary from the text half of each (text, target) batch; the labels are discarded.
encoder.adapt(train_dataset.map(lambda text,target: text))



In [41]:
vocabulary = np.array(encoder.get_vocabulary())
vocabulary[1:10]

array(['[UNK]', 'fire', 'like', 'get', 'im', 'new', 'one', 'people',
       'time'], dtype='<U29')

The encoder can convert a text array into a numerical array

In [42]:
# NOTE: `text` is the loop variable left over from the `train_dataset.take(1)`
# loop in In [37]; this cell fails on a fresh kernel unless that cell ran first.
print('Original Text :' +str(text))
# Calling the adapted layer maps each token to its integer vocabulary index.
encoded_text = encoder(text).numpy()
print('Numeric Representaion :' +str(encoded_text))

Original Text :tf.Tensor(b'deed reason earthquake may allah forgive', shape=(), dtype=string)
Numeric Representaion :[5033  425  220   60 1424 4811]


In [43]:
len(encoder.get_vocabulary())

12000

# Build LSTM Model

In [44]:
# The embedding table needs one row per entry of the encoder's vocabulary.
vocab_len = len(encoder.get_vocabulary())

# Text -> token ids -> embeddings -> BiLSTM -> max over time -> small dense head.
# The last Dense(1) has no activation, so the model outputs one raw logit per text.
model = Sequential([
    encoder,
    Embedding(input_dim=vocab_len, output_dim=16, mask_zero=True),
    Bidirectional(LSTM(16, return_sequences=True)),
    Dropout(0.20),
    GlobalMaxPool1D(),
    Dropout(0.20),
    Dense(10, activation='relu'),
    Dropout(0.10),
    Dense(1),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 16)          192000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 32)         4224      
 l)                                                              
                                                                 
 dropout (Dropout)           (None, None, 32)          0         
                                                                 
 global_max_pooling1d (Globa  (None, 32)               0         
 lMaxPooling1D)                                                  
                                                        

In [45]:
# Multiply the learning rate by 0.25 when the training loss has not improved for 2 epochs.
# min_lr must be BELOW the optimizer's starting rate: the model is compiled with
# optimizer='adam' (default learning rate 0.001), so the former min_lr=0.001
# meant the rate could never actually be reduced.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss',factor=0.25,patience=2,min_lr=1e-5)

In [46]:
# The last layer is Dense(1) without activation, so the model emits raw logits:
# - the loss must be told so via from_logits=True;
# - accuracy must threshold at logit 0 (sigmoid 0.5). The plain 'accuracy' string
#   thresholds the raw outputs at 0.5 and under-reports positive predictions.
#   The metric keeps the name 'accuracy' so history/log keys are unchanged.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [47]:
history = model.fit(train_dataset,epochs = 5,callbacks = [reduce_lr])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Prepare our test dataset

In [49]:
sent = []
RemoveUnnecessaryChar(data_test['text'])

In [50]:
tokenized_sent = []
TokenizeSentence(sent)

In [51]:
formatted_sent = []
stopwordsSentence(tokenized_sent)

In [52]:
lemma_sent = []
lemmatizeSentence(formatted_sent)

In [53]:
final_sentence_list = []
finalSentence(lemma_sent)

In [55]:
data_test['text'] = final_sentence_list
data_test

Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different city stay safe ever...
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill china taiwan
...,...
3258,earthquake safety los angeles safety fastener ...
3259,storm worse last hurricane city others hardest...
3260,green line derailment chicago
3261,meg issue hazardous weather outlook hwo


In [56]:
x_test = data_test['text']
x_test_array = x_test.to_numpy()

In [57]:
test_dataset = tf.data.Dataset.from_tensor_slices((x_test_array))
for test_text in test_dataset.take(2):
    print('Text: ', test_text.numpy())

Text:  b'happened terrible car crash'
Text:  b'heard earthquake different city stay safe everyone'


In [58]:
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [59]:
y_pred = model.predict(test_dataset)

In [60]:
y_pred

array([[1.2798808],
       [1.8644623],
       [4.266029 ],
       ...,
       [1.4505811],
       [2.8034046],
       [1.9419159]], dtype=float32)

### Make 'Target' column: the model outputs logits, so if the predicted logit is >= 0 (i.e. probability >= 0.5), there is a disaster. Otherwise there is no disaster.

In [61]:
# y_pred holds one raw logit per row; a non-negative logit means class 1, otherwise class 0.
result = [1 if logit >= 0 else 0 for logit in y_pred]

In [63]:
# The 'id' column was dropped from data_test earlier, so the ids are taken from the sample file.
submission = pd.read_csv('C:/Users/yanli/Downloads/sample_submission.csv')
# Assigning a plain list is positional.
# NOTE(review): assumes sample_submission.csv lists ids in the same row order as test.csv — confirm.
submission['target'] = result
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [64]:
submission['target'].value_counts()

0    1953
1    1310
Name: target, dtype: int64

In [66]:
submission.to_csv('C:/Users/yanli/Downloads/NLPsubmission.csv', index=False)