# NLP with disaster tweets using LSTM (tf)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re
import string
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
import numpy as np 
import pandas as pd 
import seaborn as sns
from wordcloud import WordCloud

### The problem
In this project, we will build a machine learning model that predicts which tweets are about real disasters and which ones aren't. We have access to a dataset of 10,000 tweets that were classified by hand. The LSTM model we build will then classify each tweet in the test dataset as describing a disaster or not.

### Explore the data folder structure

In [None]:
# Print the full path of every file found anywhere below ../input.
for root, _subdirs, files in os.walk('../input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

### Exploratory Data Analysis

In [None]:
# Load the competition's train.csv into a DataFrame; every later cell works on `train`.
train=pd.read_csv("../input/nlp-getting-started/train.csv")

In [None]:
# Summarise the columns, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Peek at the first ten raw tweets before any cleaning is applied.
train.text[:10]

### Preprocessing
As the samples above show, tweet content may contain special characters such as *#*, *'*, "=>" etc., and upper-case words are mixed with lower-case ones. We clean the data by stripping these meaningless characters, passing the text through the pipeline below:

In [None]:
from nltk.corpus import stopwords

# Held as a set because `tokenize` tests membership once per word of every
# tweet: a set lookup is O(1), whereas scanning the NLTK list is O(n).
stop_words = set(stopwords.words('english'))

def lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def clean(text):
    """Remove URLs and every non-letter character from a raw tweet.

    Args:
        text: the raw tweet string.

    Returns:
        A string containing only ASCII letters and spaces. Runs of spaces are
        left in place; callers that need tokens should ``split()`` the result.
    """
    # Drop whole URLs *before* stripping punctuation. Doing it afterwards only
    # removes the scheme and leaves the host/path fragments ("t", "co", the
    # short-link id) behind as junk tokens.
    t = re.sub(r"https?://\S+|www\.\S+", " ", text, flags=re.IGNORECASE)
    # Every remaining character that is not an ASCII letter (digits,
    # punctuation, '#', '@', emoji, ...) becomes a space.
    t = re.sub(r"[^a-zA-Z]", " ", t)
    # Remove a stray scheme left by a truncated link (e.g. "http:/"), matching
    # whole words only so that words merely containing "http" are untouched.
    return re.sub(r"\bhttps?\b", " ", t, flags=re.IGNORECASE)

def tokenize(text):
    """Drop stop words and non-alphabetic tokens from ``text``.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` collection or failing ``str.isalpha`` are discarded, and
    the survivors are joined back together with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        if token.isalpha():
            kept.append(token)
    return ' '.join(kept)

# The cleaning steps, applied to the text column one after another in this
# order. `tokenize` compares tokens against `stop_words`, so it runs after
# `lowercase` (presumably because the stop-word list is lower-case — verify).
preprocessors = [clean, lowercase, tokenize]

for step in preprocessors:
    cleaned_column = train['text'].apply(step)
    train['text'] = cleaned_column

In [None]:
# Number of whitespace-separated tokens left in each cleaned tweet.
train['count_of_words'] = list(map(len, map(str.split, train['text'])))

In [None]:
# Show the first ten rows, now including the cleaned text and the word count.
train.head(10)

In [None]:
# Class balance of the training labels.
unique_counts = train['target'].value_counts()
print(unique_counts)

# value_counts() orders its rows by frequency, not by class id, so each wedge
# label is looked up from the index rather than assumed to be in [0, 1] order.
class_names = {0: '0-Non-disastrous', 1: '1-disastrous'}
plt.pie(unique_counts.tolist(),
        labels=[class_names.get(label, str(label)) for label in unique_counts.index],
        autopct='%1.2f%%'
       )
plt.show()

In [None]:
# Summary statistics of the per-tweet word counts.
train['count_of_words'].describe()

In [None]:
from collections import Counter


def count_word(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Args:
        text: an object whose ``.values`` attribute iterates over strings
            (the notebook passes a DataFrame text column).

    Returns:
        A new ``Counter`` mapping each word to its number of occurrences.
    """
    # A fresh Counter per call: accumulating into a shared module-level
    # counter double-counts when the function (or the cell) is run again.
    word_counts = Counter()
    for entry in text.values:
        word_counts.update(entry.split())
    return word_counts
# Word frequencies over the whole cleaned training text column.
counter = count_word(train['text'])

In [None]:
# Number of distinct words counted in the cleaned training text.
len(counter)

In [None]:
# All (word, frequency) pairs, most frequent first.
most = counter.most_common()
# Split the 20 most frequent pairs into parallel lists for plotting. Built
# with comprehensions so the loop variables stay local and cannot clobber
# other notebook-level names.
x = [word for word, _ in most[:20]]
y = [frequency for _, frequency in most[:20]]
# Horizontal bars: frequency on the x axis, word on the y axis.
sns.barplot(x=y, y=x)

In [None]:
# Size each of the 50 most common words by its real frequency. Feeding
# generate() a string in which every word appears exactly once gives all
# words the same weight, so the cloud carries no frequency information.
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate_from_frequencies(dict(most[:50]))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Train and validation data
We would like to split the train data into train and validation datasets following the 80-20 principle. 
The train data is used to fit the model. The validation data is used to verify the model's loss and accuracy. To reduce the risk of overfitting to the order of the samples, the training data is shuffled before training.

In [None]:
# Model input is the cleaned text; the label is the `target` column.
features = train['text']
targets = train['target']

# 80% train / 20% validation, shuffled with a fixed seed and stratified on the
# label so both parts keep the same class proportions.
split_options = dict(train_size=0.8, random_state=42, shuffle=True, stratify=targets)
(all_train_features, val_features,
 all_train_targets, val_targets) = train_test_split(features, targets, **split_options)

In [None]:
def _as_dataset(texts, labels):
    """Pair a text Series with its label Series as a tf.data.Dataset of (text, label)."""
    return tf.data.Dataset.from_tensor_slices((texts.to_numpy(), labels.to_numpy()))


train_data = _as_dataset(all_train_features, all_train_targets)
val_data = _as_dataset(val_features, val_targets)

In [None]:
# Size of the shuffle buffer handed to Dataset.shuffle() for the training data.
BUFFER_SIZE = 10000
# Number of examples per batch for the train, validation and test datasets.
BATCH_SIZE = 64

In [None]:
# Only the training stream is shuffled; validation keeps its original order.
train_data = train_data.shuffle(BUFFER_SIZE)
train_data = train_data.batch(BATCH_SIZE)
train_data = train_data.prefetch(tf.data.AUTOTUNE)

val_data = val_data.batch(BATCH_SIZE)
val_data = val_data.prefetch(tf.data.AUTOTUNE)

We are not done with the preprocessing yet. A **TextVectorization** layer has basic options for managing text in a Keras model. Here it transforms a batch of strings into a list of token indices, one integer per token.

Going next, we will call this layer's adapt() method on the dataset. When this layer is adapted, it will analyze the dataset, determine the frequency of individual string values, and create a 'vocabulary' from them. 

In [None]:
vocab_size = 20000
max_len = 15

# Settings for the text-to-integer layer: split on whitespace, emit integer
# token ids, and pad/truncate every tweet to `max_len` ids. The vocabulary cap
# is vocab_size plus two extra slots.
vectorizer_options = {
    "max_tokens": vocab_size + 2,
    "split": "whitespace",
    "output_mode": "int",
    "output_sequence_length": max_len,
}
vectorize_layer = TextVectorization(**vectorizer_options)

In [None]:
# Learn the vocabulary from the training text only. adapt() rebuilds the
# layer's state on each call, so adapting a second time on the validation set
# would throw away the training vocabulary (and leak validation data into
# preprocessing). Validation and test words that were not seen here map to the
# out-of-vocabulary token.
vectorize_layer.adapt(train_data.map(lambda text, label: text))

In [None]:
# Vocabulary held by the vectorization layer, as a NumPy array; display its
# first 20 entries as a sanity check.
vocab = np.array(vectorize_layer.get_vocabulary())
vocab[:20]

### Model architecture
An LSTM will be used for this NLP problem. After text vectorization, the first trainable layer is the Embedding layer, which represents each word as a vector; it is followed by a bidirectional LSTM layer. The main reason for using a bidirectional LSTM is that every element of the input sequence then has information from both the past and the future. For this reason, a bidirectional LSTM can produce a more meaningful output by combining LSTM passes from both directions. The next few layers are regular dense layers with ReLU activation, interleaved with further LSTM layers. Dropout is also enabled to mitigate overfitting. The model architecture is summarized below after construction and is also plotted to provide finer details. The output layer uses a sigmoid activation to address the binary classification problem.

In [None]:
# Text classifier: vectorize -> embed -> BiLSTM -> dense stack -> LSTM ->
# dense -> LSTM -> dense stack -> sigmoid probability.
vocab_size= 20000
max_len= 15
model = tf.keras.Sequential([
    vectorize_layer,
    # input_dim must cover every id the vectorizer can emit. The layer is built
    # with max_tokens = vocab_size + 2, i.e. ids 0 .. vocab_size + 1, so a
    # table of only vocab_size + 1 rows leaves the highest id out of range.
    # NOTE(review): output_dim reuses max_len as the embedding width; the two
    # quantities are unrelated and could be tuned separately.
    tf.keras.layers.Embedding(input_dim=vocab_size + 2, output_dim=max_len, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, activation='tanh',return_sequences=True)),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64,activation='relu'),
    # Still emits one vector per time step so the next LSTM receives a sequence.
    tf.keras.layers.LSTM(64,return_sequences=True),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Final recurrent layer: collapses the sequence to a single vector.
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # One sigmoid unit for the binary disaster / non-disaster decision.
    tf.keras.layers.Dense(1,activation='sigmoid')
    ])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Draw the layer graph of the model, annotated with shapes (show_shapes=True).
tf.keras.utils.plot_model(model,show_shapes=True)

In [None]:
# Train for 50 epochs, evaluating on the validation set after each one. The
# returned History object is kept for the plots further down.
fit_options = {"epochs": 50, "validation_data": val_data}
histor = model.fit(train_data, **fit_options)

### Model performance visualization
For both the train and validation datasets, we plot loss and accuracy against the epoch number. We expect accuracy to go up and loss to go down as the number of epochs grows.

In [None]:
# Per-epoch values recorded by fit(); list the keys that are available.
history_dict = histor.history
history_dict.keys()

In [None]:
# Per-epoch curves recorded during training.
acc, val_accuracy = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

loss_curves = (
    (loss, 'blue', 'Training Loss'),
    (val_loss, 'orange', 'Validation Loss'),
)
for curve, colour, caption in loss_curves:
    plt.plot(epochs, curve, colour, label=caption)
plt.title('Loss vs Epoch')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training and validation accuracy per epoch. The title names both curves, and
# plt.legend() is required for the line labels to actually be drawn.
plt.plot(epochs, acc, 'blue', label='Train Accuracy')
plt.plot(epochs, val_accuracy, 'orange', label='Validation Accuracy')
plt.title('Accuracy vs Epoch')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Prediction
We will use the trained model to make predictions for submission. The text from the test dataset must go through the same preprocessing pipeline as the train data.

In [None]:
# Load the test set and run its text through the same cleaning steps, in the
# same order, that were applied to the training text.
test = pd.read_csv("../input/nlp-getting-started/test.csv")
for step in preprocessors:
    cleaned_column = test['text'].apply(step)
    test['text'] = cleaned_column

In [None]:
# Wrap the cleaned test text in a batched, prefetched dataset (no labels, no
# shuffling) and run the trained model over it.
test_data = test['text']
test_dat_array = (
    tf.data.Dataset.from_tensor_slices(test_data.to_numpy())
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
prediction = model.predict(test_dat_array)

In [None]:
# Turn each predicted probability into a hard label: 1 at 0.5 or above, else 0.
prediction_target = [1 if pred >= 0.5 else 0 for pred in prediction]

In [None]:
# Load the sample submission file to use as the template for our own, and display it.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
submission

In [None]:
# Overwrite the template's target column with our predicted labels.
# NOTE(review): the list is assigned positionally, so this assumes the sample
# submission rows are in the same order as test.csv — confirm.
submission['target'] = prediction_target
submission

In [None]:
# Class balance of the predicted labels.
unique_counts = submission['target'].value_counts()
print(unique_counts)

# value_counts() orders its rows by frequency and omits classes that never
# occur, so each wedge label is looked up from the index. A fixed two-item
# label list would mislabel the wedges whenever class 1 is predicted more often
# than class 0, and would not match the data if only one class is predicted.
class_names = {0: '0-Non-disastrous', 1: '1-disastrous'}
plt.pie(unique_counts.tolist(),
        labels=[class_names.get(label, str(label)) for label in unique_counts.index],
        autopct='%1.2f%%'
       )
plt.show()

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv("submission.csv",index=False)

### Conclusion
Bidirectional LSTMs process the input sequence in both directions instead of one: first from left to right, and then in reverse order. Each word therefore gets additional context from the words that come both before and after it, which can result in faster and more complete learning. Because of these characteristics, we built such a model, fitted it to the cleaned tweet text, and made predictions for the test data.

Looking at the validation accuracy and loss, there is still room for improvement; the gap could be caused by overfitting. Tuning the dropout rates may be a direction to look into as the next step.