In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedShuffleSplit
import os

# Super useful RNN guide: https://stackoverflow.com/questions/48714407/rnn-regularization-which-component-to-regularize
# Code below is modified from https://www.kaggle.com/fmitchell259/disaster-tweets-naive-bayes-svm-rnn
# Text emoticon -> descriptive name. Insertion order is kept as-is because
# remove_emojis iterates over this mapping.
EMOJIS = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

def remove_punct(tweet):
    """Return ``tweet`` without any character that is neither a word character nor whitespace."""
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', tweet)
def remove_word(word):
    """Build a one-argument cleaner that deletes every occurrence of ``word`` from a tweet.

    The returned callable takes a string and returns it with all (substring)
    occurrences of ``word`` replaced by the empty string.
    """
    return lambda tweet: tweet.replace(word, '')
def remove_url(string):
    """Delete URL-like substrings from ``string``.

    A match is an optional ``http``/``https`` scheme, the literal ``://`` and
    any run of word characters or ``. / ? = & % -`` ending on a word boundary.
    """
    url_pattern = r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b'
    cleaned = re.sub(url_pattern, '', string)
    return cleaned
def remove_html(string):
    """Delete HTML tags (``<...>``) and character entities (``&name;``, ``&#123;``, ``&#x1f;``)."""
    html_pattern = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    cleaned = re.sub(html_pattern, '', string)
    return cleaned
def remove_useless_char(string):
    """Keep only letters and whitespace, drop 1-2 character words, collapse runs of spaces.

    The three substitutions are applied in order; later ones operate on the
    output of earlier ones.
    """
    substitutions = (
        (r'[^a-zA-Z\s]', ''),   # anything that is not a letter or whitespace
        (r'\b\w{1,2}\b', ''),   # whole words of one or two characters
        (' +', ' '),            # runs of spaces -> single space
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def remove_emojis(string):
    """Replace every emoticon listed in ``EMOJIS`` with ``"EMOJI" + <name>``.

    Args:
        string: Text that may contain emoticons such as ``":)"`` or ``"O:-)"``.
            Matching is case-sensitive and uses the keys exactly as written
            in ``EMOJIS``.

    Returns:
        The text with each emoticon occurrence substituted, e.g. ``":)"``
        becomes ``"EMOJIsmile"``.
    """
    # Replace the longest emoticons first. Several keys contain shorter keys
    # (':-)' inside 'O:-)', ':-D' inside '(:-D'); in plain dict order the
    # shorter one would be substituted first and the longer one could never
    # match, turning 'O:-)' into 'OEMOJIsmile' instead of 'EMOJIangel'.
    for emoji in sorted(EMOJIS, key=len, reverse=True):
        string = string.replace(emoji, "EMOJI" + EMOJIS[emoji])
    return string
def make_lowercase(string):
    """Return a lower-cased copy of ``string``."""
    lowered = string.lower()
    return lowered
def remove_nums(string):
    """Return ``string`` with every run of digits removed."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', string)

def preprocess(dataset):
    """Clean the ``"text"`` column of ``dataset`` and return the same frame.

    The column is overwritten on the frame that was passed in (no copy is
    made). Each cleaning function is applied element-wise, in this order:
    lowercase, URL removal, HTML removal, punctuation removal, digit removal,
    emoticon replacement.

    Args:
        dataset: DataFrame with a ``"text"`` column of strings.

    Returns:
        The same ``dataset`` object with its ``"text"`` column cleaned.
    """
    # Helpful link for tokenization
    # https://www.kdnuggets.com/2020/03/tensorflow-keras-tokenization-text-data-prep.html
    cleaning_steps = (
        make_lowercase,
        remove_url,
        remove_html,
        remove_punct,
        remove_nums,
        # NOTE(review): by this point the text is lower-cased and stripped of
        # punctuation, while every key in EMOJIS contains punctuation, so this
        # step looks like a no-op in its current position - confirm the
        # intended order before moving it (token counts would change).
        remove_emojis,
    )
    for step in cleaning_steps:
        dataset["text"] = dataset["text"].apply(step)
    return dataset

In [None]:
# Load and preprocess the train and test datasets
DATA_DIR = "/kaggle/input/nlp-getting-started"
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Keep only the columns used downstream: text + target for training,
# id + text for the test set ("id" is needed for the submission file).
train_df = train_df.drop(columns=["id", "keyword", "location"])
test_df = test_df.drop(columns=["keyword", "location"])

# Placeholder for test tweets without text. read_csv parses empty fields as
# NaN by default, so replace("", ...) alone never fires and a NaN would make
# the string-based cleaning in preprocess fail; fillna covers that case and
# the replace keeps the original guard for literal empty strings.
test_df['text'] = test_df['text'].fillna("empty").replace("", "empty")

train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [None]:
# Build token vocabulary (fitted on the training text only)
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(train_df["text"])

# Tokenize
train_text = tokenizer.texts_to_sequences(train_df["text"])
test_text = tokenizer.texts_to_sequences(test_df["text"])

# Get max training sequence length
maxlen = max(len(sequence) for sequence in train_text)

# Pad/truncate both splits to the training maximum
pad_type = 'post'
trunc_type = 'post'
train_text = pad_sequences(train_text, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
test_text = pad_sequences(test_text, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

# Convert to TensorFlow Dataset
# Read the labels without popping them so train_df is left intact and this
# cell can be re-run without a KeyError.
target = train_df['target']

# Row counts and sequence length come from the data instead of being
# hard-coded, so the cell keeps working if the data or the cleaning changes.
num_train = len(train_text)
num_test = len(test_text)

# The extra axis of size 1 makes each dataset element look like a batch of one
# sequence; the datasets are passed on without an explicit .batch() call.
target = tf.reshape(target, shape=(num_train, 1))
train_text = tf.reshape(train_text, shape=(num_train, 1, maxlen))
test_text = tf.reshape(test_text, shape=(num_test, 1, maxlen))
train_tf = tf.data.Dataset.from_tensor_slices((train_text, target))
test_tf = tf.data.Dataset.from_tensor_slices((test_text))

# Split the training data into train and valid
VALID_FRACTION = 0.3  # share of the training rows held out for validation
SPLIT_SEED = 42       # fixed seed so the split is the same on every run
num_valid = int(VALID_FRACTION * num_train)

# reshuffle_each_iteration=False keeps one fixed order, so take() and skip()
# always see disjoint rows.
train_tf = train_tf.shuffle(num_train, seed=SPLIT_SEED, reshuffle_each_iteration=False)
train_tf_valid = train_tf.take(num_valid)
train_tf_train = train_tf.skip(num_valid)

In [None]:
# Model: embedding -> bidirectional LSTM -> single sigmoid output
tf.random.set_seed(42)
embedding_dimension = 100
vocab_size = len(tokenizer.word_index) + 2

# Layers are created in the same order in which they are stacked.
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dimension)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, activation='relu'))
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
bi_lstm_model = tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

# The output layer already applies a sigmoid, hence from_logits=False.
bi_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(5e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)
history = bi_lstm_model.fit(
    train_tf_train,
    validation_data=train_tf_valid,
    epochs=5,
    callbacks=[early_stopping],
)
print("TRAINING DONE")

In [None]:
# Score the test set, then round each sigmoid output to a 0/1 label.
raw_scores = bi_lstm_model.predict(test_tf, batch_size=None)
predictions = np.round(raw_scores).astype(np.short)

# Submission frame: test ids plus the predicted label.
output_df = test_df[['id']].copy()
output_df['target'] = predictions

# to_csv returns None when given a path; the name is kept for compatibility.
out = output_df.to_csv('hyukahn_rnn.csv', index=False)