In [1]:
!pip install tensorflow-text
!pip install -q tf-models-official

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import re
import requests
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.model_selection import StratifiedShuffleSplit

import os
# Show every file Kaggle mounted under the input directory, one path per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# Text emoticon -> short name. remove_emojis() walks this mapping, so the
# insertion order below is kept exactly as it was.
EMOJIS = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

def remove_punct(tweet):
    """Drop every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', tweet)
def remove_word(word):
    """Return a one-argument cleaner that deletes every occurrence of ``word``.

    The match is a plain substring match (``str.replace``), not a
    whole-word match.
    """
    return lambda tweet: tweet.replace(word, '')
def remove_url(string):
    """Delete ``scheme://...`` style links from ``string``.

    The ``http``/``https`` prefix is optional in the pattern, so a bare
    ``://host/path`` fragment is removed as well.
    """
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b')
    return url_pattern.sub('', string)
def remove_html(string):
    """Strip ``<...>`` tags and ``&name;`` / ``&#123;`` / ``&#x1f;`` entities."""
    tag_or_entity = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', string)
def remove_useless_char(string):
    """Keep only letters and whitespace, drop 1-2 letter words, squeeze spaces.

    The three substitutions run in a fixed order; leading/trailing single
    spaces left behind by removed words are not trimmed.
    """
    substitutions = (
        (r'[^a-zA-Z\s]', ''),    # anything that is not a letter or whitespace
        (r'\b\w{1,2}\b', ''),    # words of one or two characters
        (' +', ' '),             # runs of spaces -> single space
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def remove_emojis(string, mapping=None):
    """Replace each known text emoticon with the token ``"EMOJI" + <name>``.

    Args:
        string: Text to scan.
        mapping: Optional ``{emoticon: name}`` dict. Defaults to the
            module-level ``EMOJIS`` table when omitted.

    Returns:
        ``string`` with every emoticon occurrence substituted. No spaces are
        inserted around the substituted token.
    """
    table = EMOJIS if mapping is None else mapping
    # Longest emoticons first: otherwise ':-)' eats the tail of 'O:-)' and
    # ':-D' eats the tail of '(:-D', so the longer entries can never match.
    # sorted() is stable, so equally long keys keep their table order.
    for emoticon in sorted(table, key=len, reverse=True):
        string = string.replace(emoticon, "EMOJI" + table[emoticon])
    return string
def make_lowercase(string):
    """Return ``string`` converted to lowercase."""
    lowered = string.lower()
    return lowered
def remove_nums(string):
    """Delete every run of digits from ``string``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', string)

def preprocess(dataset):
    """Clean the ``text`` column of ``dataset`` and return the frame.

    The column is overwritten on the frame that was passed in; the same
    object is returned so the call can be used in an assignment.

    Cleaning order (each step is applied to every row):
      1. ``remove_url``     - links go first, while ``://`` is still intact.
      2. ``remove_html``    - tags/entities go before emoticon matching so
                              that text such as ``&amp;d`` is not read as
                              the ``;d`` emoticon.
      3. ``remove_emojis``  - must run before lowercasing and punctuation
                              stripping: the emoticon table has
                              case-sensitive keys (``:P``, ``:-D``) and is
                              made entirely of punctuation, so running it
                              last (as before) could never match anything.
      4. ``make_lowercase``
      5. ``remove_punct``
      6. ``remove_nums``

    Args:
        dataset: DataFrame with a string ``text`` column.

    Returns:
        The same DataFrame with ``text`` cleaned.
    """
    # Helpful link for tokenization
    # https://www.kdnuggets.com/2020/03/tensorflow-keras-tokenization-text-data-prep.html
    cleaning_steps = (
        remove_url,
        remove_html,
        remove_emojis,
        make_lowercase,
        remove_punct,
        remove_nums,
    )
    for step in cleaning_steps:
        dataset["text"] = dataset["text"].apply(step)
    return dataset

In [2]:
# Load train dataset and preprocess the texts
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
train_df = train_df.drop(columns=["id", "keyword", "location"])
train_df = preprocess(train_df)
# Load test dataset and preprocess the texts ("id" is kept for the submission file)
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
test_df = test_df.drop(columns=["keyword", "location"])
# read_csv turns empty fields into NaN, which the string cleaners cannot handle.
test_df['text'] = test_df['text'].fillna("")
test_df = preprocess(test_df)
# Cleaning can reduce a tweet to nothing (e.g. a URL-only tweet), so the
# placeholder has to be applied AFTER preprocessing, and to whitespace-only
# results as well.
test_df['text'] = test_df['text'].replace(r'^\s*$', "empty", regex=True)

# Hold out 10% of the training tweets for validation, keeping the class ratio.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=100)
# n_splits=1, so the generator yields exactly one (train, val) pair.
train_index, val_index = next(split.split(train_df["text"], train_df["target"]))
# split() yields positional indices, so they must be used with .iloc
# (.loc only happens to work while train_df has a default RangeIndex).
strat_train = train_df.iloc[train_index][['text', 'target']]
strat_val = train_df.iloc[val_index][['text', 'target']]
x_train = strat_train["text"]
y_train = strat_train["target"]
x_val = strat_val["text"]
y_val = strat_val["target"]

# Compare the label distribution of the two splits in a single chart.
# Passing the Series positionally is interpreted as `data` by newer seaborn,
# and two bare countplot() calls draw on top of each other on the same axes,
# so build one long-form frame and let `hue` separate the splits.
class_balance = pd.concat(
    [
        y_train.to_frame(name="target").assign(split="train"),
        y_val.to_frame(name="target").assign(split="val"),
    ],
    ignore_index=True,
)
sns.countplot(data=class_balance, x="target", hue="split")

In [3]:
# TF-Hub handles for the matching BERT text preprocessor and encoder.
# Each URL is bound to both the descriptive name and the tfhub_handle_* alias.
tfhub_handle_preprocess = bert_preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
tfhub_handle_encoder = bert_model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'

def build_classifier_model():
  """Assemble the tweet classifier: raw string -> BERT -> one sigmoid unit.

  The model takes a batch of raw strings, runs them through the TF-Hub
  preprocessing layer and the BERT encoder (loaded with ``trainable=True``),
  and feeds the encoder's ``pooled_output`` to a single-unit sigmoid layer.

  Returns:
    An uncompiled ``tf.keras.Model``.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  to_bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  classifier_head = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')

  pooled = bert(to_bert_inputs(raw_text))['pooled_output']
  return tf.keras.Model(raw_text, classifier_head(pooled))

# Training configuration. The model already ends in a sigmoid, so the loss
# is told it receives probabilities rather than logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metrics = tf.metrics.BinaryAccuracy()
opt = tf.keras.optimizers.Adam(learning_rate=1e-5)

classifier_model = build_classifier_model()
tf.keras.utils.plot_model(classifier_model)
classifier_model.compile(loss=loss, optimizer=opt, metrics=metrics)

In [4]:
# Stop once validation loss fails to improve for one epoch and roll the
# model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
)
history = classifier_model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    batch_size=16,
    epochs=2,
    callbacks=[early_stopping],
)

In [5]:
# Save output to csv file
predictions = classifier_model.predict(test_df["text"])
# The classifier head has a single unit, so predict() gives one probability
# per row in a 2-D array; round to 0/1 and flatten to 1-D so it can be
# assigned as a plain column.
predictions = np.round(predictions).astype(np.short).ravel()

# Independent frame holding only the ids, so adding the target column
# does not touch test_df.
output_df = test_df[['id']].copy()
output_df['target'] = predictions

# to_csv returns None when given a path, so there is nothing worth binding.
output_df.to_csv('hyukahn_rnn.csv', index=False)

Encoder experiment notes: <br/>
Largest small BERT - bad performance <br/>
Electra - (result not recorded) <br/>
Expert-wikibook - bad performance <br/>
BERT uncased - (result not recorded) <br/>

In [6]:
# Print to console in case Kaggle notebook times out
# output_df already holds the id/target pairs written to the CSV, so it is
# printed directly instead of running a second full inference pass over the
# test set just to rebuild the same frame.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(output_df)