In [7]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
import tensorflow as tf

from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [8]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Running on TPU: ", tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
    
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU:  
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.




INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


REPLICAS:  8


In [9]:
train = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv") # Jigsaw toxic comment
validation = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv")
test = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv")


Approaching this problem as a binary classification (12000 data points to speed up training)

In [11]:
train.drop(['severe_toxic', 'obscene','threat','insult','identity_hate'], axis = 1, inplace = True)

In [12]:
train = train.loc[:12000, :]
train.shape

(12001, 3)

In [13]:
train['comment_text'].apply(lambda x: len(str(x).split())).max()

1403

In [14]:
# Getting AUC score

def roc_auc(predictions, target):
    """
    This method returns the AUC Score when given the Predictions and Labels
    """
    fpr,tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

#### Data Preparation


In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(train.comment_text.values, 
                                                      train.toxic.values,
                                                     stratify = train.toxic.values, 
                                                      random_state = 42, 
                                                      test_size = 0.2,
                                                     shuffle = True)

### Simple RNN

Recurrent Neural Network (RNN): type of NN where the output from previous step are fed as input to the current step. In traditional NN all inputs and outputs are independent since language is related is important to know the previous words in the sentence in order to understand the context.

In [24]:
# Keras tokenizer

token = text.Tokenizer(num_words = None)
max_len = 1500

token.fit_on_texts(list(X_train) + list(X_valid))
X_train_seq = token.texts_to_sequences(X_train)
X_valid_seq = token.texts_to_sequences(X_valid)

# Zero padding the sequences 
X_train_pad = tf.keras.utils.pad_sequences(X_train_seq, maxlen = max_len)
X_valid_pad = tf.keras.utils.pad_sequences(X_train_seq, maxlen = max_len)

word_index = token.word_index

In [25]:
%%time 
with strategy.scope():
    # A simple RNN w/o any pretrained embeddings and dense layers
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                       300,
                       input_length = max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: user 2.13 s, sys: 1.15 s, total: 3.28 s
Wall time: 2.65 s


In [28]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=64*strategy.num_replicas_in_sync)
# Multiplying by strategy to run on TPU's

Epoch 1/5


2023-10-26 00:41:22.117612: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
2023-10-26 00:41:22.242062: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f33b8182e20>

In [32]:
scores = model.predict(X_valid_pad)
print(f"AUC: {roc_auc(scores, y_valid):.2f}")



ValueError: Found input variables with inconsistent numbers of samples: [2401, 9600]

In [30]:
scores_model = []
scores_model.append({"Model": "SimpleRNN", "AUC_Score" : roc_auc(scores, y_valid)})

ValueError: Found input variables with inconsistent numbers of samples: [2401, 9600]

#### Explanation 

- Tokenization 

A sentence is inputted word by word. Each word is represented as one hot encoded vector of dimensions (Number of words in vocab + 1).
The Keras tokenizer work by taking all of the unique words in the text, forms a dictionary with words as keys and their frequency in the texts as values. The dictionary is then sorted in descending order of counts. 

In [31]:
X_train_seq[:1]

[[664,
  65,
  7,
  19,
  2262,
  14102,
  5,
  2262,
  20439,
  6071,
  4,
  71,
  32,
  20440,
  6620,
  39,
  6,
  664,
  65,
  11,
  8,
  20441,
  1502,
  38,
  6072]]

`model.Sequential()` tells keras that we will be bulding the Network Sequentially. We start by adding the Embedding Layer of neurons which takes in as input the nth dimensional One-hot vector of every word and converts it into 300 dimensional vector, it gives us word embeddings similar to `word2vec`. Could've used `word2vec` for this but the `Embedding` layer learns during the training to enhance the embeddings. Next we added the 100 LSTM units without any dropout or regularization. In the end a singlue neuron sigmoid function that takes outputs from 100 LSTM cells (These are 100 LSTM cells not layers) for predicting the results and then compiling the model using `Adam` optimizer. 