In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm

from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping



import matplotlib.pyplot as plt
import seaborn as sns 
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
from tensorflow.keras.utils import pad_sequences

In [3]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Running on TPU: ", tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
    
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
train = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv") # Jigsaw toxic comment
validation = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv")
test = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv")

Approaching this problem as a binary classification (12000 data points to speed up training)

In [5]:
train.drop(['severe_toxic', 'obscene','threat','insult','identity_hate'], axis = 1, inplace = True)

In [6]:
train = train.loc[:12000, :]
train.shape

(12001, 3)

In [7]:
train['comment_text'].apply(lambda x: len(str(x).split())).max()

1403

In [8]:
# Getting AUC score

def roc_auc(predictions, target):
    """
    This method returns the AUC Score when given the Predictions and Labels
    """
    fpr,tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

#### Data Preparation


In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(train.comment_text.values, 
                                                      train.toxic.values,
                                                     stratify = train.toxic.values, 
                                                      random_state = 42, 
                                                      test_size = 0.2,
                                                     shuffle = True)

### Simple RNN

Recurrent Neural Network (RNN): type of NN where the output from previous step are fed as input to the current step. In traditional NN all inputs and outputs are independent since language is related is important to know the previous words in the sentence in order to understand the context.

In [10]:
# Keras tokenizer

token = text.Tokenizer(num_words = None)
max_len = 1500

token.fit_on_texts(list(X_train) + list(X_valid))
X_train_seq = token.texts_to_sequences(X_train)
X_valid_seq = token.texts_to_sequences(X_valid)

# Zero padding the sequences 
X_train_pad = tf.keras.utils.pad_sequences(X_train_seq, maxlen = max_len)
X_valid_pad = tf.keras.utils.pad_sequences(X_train_seq, maxlen = max_len)

word_index = token.word_index

In [11]:
%%time 
with strategy.scope():
    # A simple RNN w/o any pretrained embeddings and dense layers
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                       300,
                       input_length = max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: user 1.06 s, sys: 800 ms, total: 1.86 s
Wall time: 3.09 s


In [12]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=64*strategy.num_replicas_in_sync)
# Multiplying by strategy to run on TPU's

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7b92e65f32b0>

### Accuracy is 100% overfitting to the trianing data

In [13]:
model.save('RNN_Model.h5')

In [15]:
scores = model.predict(X_valid_pad)



In [17]:
scores.shape

(9600, 1)

In [18]:
y_valid.shape

(2401,)

In [20]:
y_pred = scores[:2401]

In [22]:
y_pred.shape

(2401, 1)

In [24]:
print(f"AUC: {roc_auc(y_pred, y_valid):.2f}")

AUC: 0.51


In [26]:
scores_model = []
scores_model.append({"Model": "SimpleRNN", "AUC_Score" : roc_auc(y_pred, y_valid)})

#### Explanation 

- Tokenization 

A sentence is inputted word by word. Each word is represented as one hot encoded vector of dimensions (Number of words in vocab + 1).
The Keras tokenizer work by taking all of the unique words in the text, forms a dictionary with words as keys and their frequency in the texts as values. The dictionary is then sorted in descending order of counts. 

In [27]:
X_train_seq[:1]

[[664,
  65,
  7,
  19,
  2262,
  14102,
  5,
  2262,
  20439,
  6071,
  4,
  71,
  32,
  20440,
  6620,
  39,
  6,
  664,
  65,
  11,
  8,
  20441,
  1502,
  38,
  6072]]

**Padding** the sequence allows for batch training and can help prevent the model to overfitting on the data. It is efficient to process data in batches this is done by matrices `[batch_size * sequence_length]`. If there's a variation in the `sequence_length` then it'll be the longest sequence, so the sequences are padded with 0s to fit the matrix size and its masked to not be accounted for in loss calculation. 
Padded tokens may not impact model training it still requires compute to be processed and sorting data by length helps limit the amount of padding.

Also can use special tokens while tokenizing for beginning of string (BOS) and end of string (EOS). This is done so the knows when to stop especially if the response has multiple sentences and thus a period token would not be sufficient. 
The BOS token allows for the model to choose the first word when being told to do a task where it simply is not completing a sentence, like writing "original" poetry.

`model.Sequential()` tells keras that we will be bulding the Network Sequentially. We start by adding the Embedding Layer of neurons which takes in as input the nth dimensional One-hot vector of every word and converts it into 300 dimensional vector, it gives us word embeddings similar to `word2vec`. Could've used `word2vec` for this but the `Embedding` layer learns during the training to enhance the embeddings. Next we added the 100 LSTM units without any dropout or regularization. In the end a singlue neuron sigmoid function that takes outputs from 100 LSTM cells (These are 100 LSTM cells not layers) for predicting the results and then compiling the model using `Adam` optimizer. 



### Word Embeddings

Word embeddings is a learned representation for text where words that have the same meaning have a similar representation. 

It's easier to use pretrained models like GLoVe, Word2Vec, fasttext. 

In [None]:
embeddings_index = {}

f = open('/kaggle/input/glove840b300dtxt/glove.840B.300d.txt',
        'r', encoding = 'utf-8')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray([float(val) for val in values[1:]])
    embeddings_index[word] = coefs
f.close()

print(f"Found {embeddings_index} word vectors.")

## LSTM

#### Overview 
Simple RNNs perform better than classical ML algos but they fail to capture the long term dependencies that is present in sentences. 1998-99 LSTMs were introduced to counteract these inefficiencies. 

RNNs also have an issue with vanishing gradients

In [None]:
# create an embedding matric for the words we have in the dataset
embedding_matrix = np.zeros(len(word_index) + 1, 300)
for word, i in tqdm(word_index.items()):
    embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
%%time 

with strategy.scope():
    
    # A simple LSTM with GLoVe embeddings and one dense layer
    
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 300,
             weights = [embedding_matrix],
             input_length = max_len,
             trainable = False))
    model.add(LSTM(100, dropout = 0.3, recurrent_dropout = 0.3))
    model.add(Dense(1, activation = 'sigmoid'))
    model.comile(loss = 'binary-crossentropy', 
                optimizer = 'adam',
                metrics = ['accuracy'])
model.summary()

In [None]:
model.fit(X_train_pad, y_train, 
          epochs=5, 
          batch_size = 64*strategy.num_replicas_in_sync)

In [None]:
scores = model.predict(X_valid_pad)
print(f"AUC: {roc_auc(scores, y_valid)}")

In [None]:
scores_model.append({"Model" : "LSTM",
                    "AUC_Score" : roc_auc(scores, y_valid)})

### Explanation

Calculated the embeddings matrix for the vocabulary from the pretrained GLoVe vectors. In the Sequential model while building the Embedding layers the Embedding matrix is passed as weights to the layer instead of training it over the Vocabulary and so `trainable = False`. Same as RNN but replaced with LSTM units.

### GRUs

Gated Recurrent Unit is designed to solve the vanishing gradient problem which comes with a standard RNN. GRU is a variation on the LSTM b/c both share a similar design but is suppose to be simpler and faster than LSTMs and in many cases produce equally good results.

In [None]:
%%time
with strategy.scope():
    # GRU with GLoVe embdeddings and 2 dense layers 
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                       300,
                       weights=[embedding_matrix],
                       input_length=max_len,
                       trainable = False))
    model.add(SpatialDropout1D(0.3))
    model.add(GRU(300))
    model.add(Dense(1,activation = 'sigmoid'))
    
    model.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])
model.summary()

In [None]:
model.fit(X_train_pad, y_train, epochs=5,
         batch_size = 64*strategy.num_replicas_in_sync)

In [None]:
scores = model.predict(X_valid_pad)
print(f"AUC: {roc_auc(scores, y_valid)}")

In [None]:
scores_model.append({"Model": "GRU",
                    "AUC Score": roc_auc(scores, y_valid)})

In [None]:
scores_model

### Bi-Directional RNNs


In [23]:
%%time 
with strategy.scope():
    # Bidirectional LSTM with GLoVe embeddings & 1 Dense Layer
    model = Sequential()
    model.add(Embeddings(len(word_index) + 1,
                        300,
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable = False))
    model.add(Bidirectional(LSTM(300, dropout = 0.3,
                                recurrent_dropout = 0.3)))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss='binary_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])

model.summary()

NameError: name 'Embeddings' is not defined

In [None]:
model.fit(X_train_pad, y_train, epochs=5, batch_size= 64*strategy.num_replicas_in_sync)


In [None]:
scores = model.predict(X_valid_pad)
print(f"AUC: {roc_auc(scores,y_valid)}")

In [None]:
scores_model.append({"Model" : "Bi-directional LSTM",
                    "AUC_Score" : roc_auc(scores, y_valid)})

#### Explanation 

Added Bidirection to the LSTM layer.

### Seq2Seq Model Architecture

#### Overview 

A many to many RNN architecture where the input is a sequence and the output is also a sequence (Where the input and output sequence can be of different lengths). Generally used in applications like Machine Translation, text summarization, question answering etc.

In [None]:
# Visualizing results obtained from various Deep learning models 
results = pd.DataFrame(scores_model).sort_values(by='AUC_Score', ascending = False)
result.style.background_gradient(cmap='Blues')

In [None]:
fig = go.Figure(go.Funnelarea(
text = results.Model,
values = results.AUC_Score,
title = {"position":"top center", "text" : "Funnel-Chart of Sentiment Distribution"}))
fig.show()

### Attention Models

In [None]:
import os 
import tensorflow as tf 
from tensorflow.keras.layer import Dense, Input
from tensorflow.keras.optimizer import Adam
from tensorflow.keras.models import Model
from tesnorflow.keras.callbacks import ModelCheckpoint
from kaggle_dataset import KaggleDatasets
import transformers

from tekenizer import BertWordPieceTokenizer


In [None]:
train
valid
test

In [None]:
def fast_encode(texts, tokenizer, chunk_size = 256, maxlen = 512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length = maxlen)
    tokenizer.enable_padding(max_length = maxlen)
    all_ids=[]
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
        
    return np.array(all_ids)

In [None]:
# IMP Data for Config

AUTO = tf.data.experimental.AUTOTUNE

# configuration
EPOCHS = 3
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192

#### Tokenization

In [None]:
# Load the tokenizer 
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reloading with HuggingFace tokenizer library
fast_tokenizer = BertWordPieceTokenizer("vocab.txt",
                                       lowercase = False)
fast_tokenizer

In [None]:
X_train = fast_encode(train.comment_text.astype(str),
                      fast_tokenizer,
                     maxlen = MAX_LEN)
X_valid = fast_encode(valid.comment_text.astype(str),
                     fast_tokenizer,
                     maxlen = MAX_LEN)
X_test = fast_encode(test.content.astype(str),
                    fast_tokenizer,
                    maxlen = MAX_LEN)

y_train = train.toxic.values
y_valid = valid.toxic.values

In [None]:
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))\
    .repeat().shuffle(2048).batch(BATHCH_SIZE).prefetch(AUTO)
)

valid_dataset = (
tf.data.Dataset.from_tensor_slices((X_valid, y_valid))\
.batch(BATCH_SIZE).cache().prefetch(AUTO) )

test_dataset = (
    tf.data.Dataset.from_tensor_slices(X_test).batch(BATCH_SIZE)
)

In [None]:
def build_model(transformer, max_len = 512):
    """
    Function for training the BERT Model
    """
    input_word_ids = Input(shape=(max_len),
                          dtype=tf.int32,
                          name = 'input_word_ids')
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation = 'sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, output = out)
    model.compile(Adam(lr=1e-5), loss = 'binary-crossentropy',
                 metrics=['accuracy'])
    
    return model

In [None]:
%%time 
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
    )
        model = build_model(transformer_layer, max_len=MAX_LEN)
        
model.summary()

In [None]:
n_steps = X_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch = n_steps,
    validation_data = valid_dataset,
    epochs = EPOCHS
)

In [None]:
n_steps = X_valid.shape[0] // BATCH_SIZE

train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch = n_steps,
    epochs = EPOCHS*2
)

In [None]:
sub['toxic'] = model.predict(test_dataset, verbose = 1)
sub.to_csv('submission.csv', index = False)