### **Description**: Learn how to use an RNN in TensorFlow. We use NLP (Natural Language Processing) techniques such as a Tokenizer and word embeddings to preprocess text data, and then create an RNN model with Keras to classify the tweets.


I used the NLP Disaster Tweets dataset.

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
# https://www.kaggle.com/c/nlp-getting-started
df = pd.read_csv('data/twitter_train.csv')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
print((df.target == 1).sum()) # Disaster
print((df.target == 0).sum()) # No Disaster

In [None]:
# Preprocessing
import re
import string

def remove_URL(text):
  """Return `text` with every URL removed.

  A match starts at "http://", "https://" or "www." and extends up to the
  next whitespace character; each match is replaced by the empty string.
  Surrounding whitespace is left untouched.
  """
  url = re.compile(r"https?://\S+|www\.\S+")
  return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x
def remove_punct(text):
  """Return `text` with every character in `string.punctuation` deleted."""
  # Mapping a character to None in a translation table deletes it.
  drop_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(drop_table)

string.punctuation

In [None]:
# Sanity-check the URL regex on the first tweet that contains a link.
# This is the same pattern remove_URL uses; it has no capturing group, so
# findall() returns the full matched URLs rather than a captured fragment.
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.text:
  matches = pattern.findall(t)
  for match in matches:
    print(t)                    # original tweet
    print(match)                # URL that was found
    print(pattern.sub(r"", t))  # tweet with all URLs removed
  if matches:
    # stop after the first tweet that had at least one URL
    break

In [None]:
# Clean every tweet: drop URLs first, then strip punctuation.
df["text"] = df.text.map(remove_URL).map(remove_punct)

In [None]:
# remove stopwords
# pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop words: A stop word is a commonly used word (the, an, a)
stop = set(stopwords.words('english'))

def remove_stopwords(text):
  """Lowercase each whitespace-separated word of `text` and drop those in `stop`.

  The surviving words are re-joined with single spaces.
  """
  kept = []
  for token in text.split():
    lowered = token.lower()
    if lowered in stop:
      continue
    kept.append(lowered)
  return " ".join(kept)

In [None]:
stop

In [None]:
df["text"] = df.text.map(remove_stopwords)

In [None]:
df.text

In [None]:
from collections import Counter

# Count unique words
def counter_word(text_col):
  """Tally how often each whitespace-separated word occurs across `text_col`.

  `text_col` must expose `.values` yielding strings (e.g. a pandas Series).
  Returns a `collections.Counter` mapping word -> number of occurrences.
  """
  tally = Counter()
  for row in text_col.values:
    tally.update(row.split())
  return tally

counter = counter_word(df.text)

In [None]:
len(counter)

In [None]:
counter

In [None]:
counter.most_common(5)

In [None]:
num_unique_words = len(counter)

In [None]:
# Use the first 80% of rows for training and the remaining rows for validation
# (rows are taken in their existing order; nothing is shuffled here)
train_size = int(len(df) * 0.8)
train_df, val_df = df[:train_size], df[train_size:]

# Pull the tweet text and the target labels out as NumPy arrays
train_sentences, train_labels = train_df.text.to_numpy(), train_df.target.to_numpy()
val_sentences, val_labels = val_df.text.to_numpy(), val_df.target.to_numpy()

In [None]:
train_sentences.shape, val_sentences.shape

In [None]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

# Vectorize a text corpus by turning each text into a sequence of integers.
# num_words caps how many words the tokenizer keeps; num_unique_words is the
# number of distinct words counted over the whole df.text column.
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training, so the vocabulary is built without validation text

In [None]:
# each word has unique index
word_index = tokenizer.word_index

In [None]:
word_index

In [None]:
# Convert every sentence into its list of integer word indices.
# The training result is named `train_sequences` (plural): that is the name
# the printing, padding and decoding cells further down read.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [None]:
print(train_sentences[10:15])
print(train_sequences[10:15])

In [None]:
# Pad the sequences to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

# padding='post' pads short sequences at the end; truncating='post' cuts
# sequences longer than max_length at the end. Both splits are padded with
# identical settings so they share the same second dimension.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')
# Notebook display: shapes of the two padded arrays
train_padded.shape, val_padded.shape

In [None]:
train_padded[10]

In [None]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

In [None]:
# check reversing the indices

# invert the tokenizer mapping: index -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [None]:
reverse_word_index

In [None]:
def decode(sequence):
  """Turn a sequence of word indices back into a space-joined string.

  Indices missing from `reverse_word_index` are rendered as "?".
  """
  words = (reverse_word_index.get(idx, "?") for idx in sequence)
  return " ".join(words)

In [None]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

In [None]:
# Create LSTM model
from tensorflow.keras import layers

# Embedding -> LSTM -> single sigmoid unit for binary classification
model = keras.models.Sequential([
  layers.Embedding(num_unique_words, 32, input_length=max_length),
  layers.LSTM(64, dropout=.1),
  layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy on probabilities: from_logits=False because the model's
# last layer already applies a sigmoid activation.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ['accuracy']

# Attach the loss, optimizer and metric list to the model before training
model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [None]:
# Train for 20 epochs on the padded training data, passing the padded
# validation split as validation_data
model.fit(train_padded, train_labels, epochs=20, validation_data=(val_padded, val_labels), verbose=2)

In [None]:
# Model outputs for the *training* inputs (not the validation split)
predictions = model.predict(train_padded)
# Threshold each output at 0.5 to get a 0/1 class label
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [None]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])