## 1. Install and Import dependencies

In [None]:
%pip install -r requirements.txt

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

## 2. Load Data

In [3]:
df = pd.read_csv(os.path.join('data', 'train.csv'))
df.shape

(159571, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df.tail(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [23]:
df[df['toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


## 2. Preprocessing

In [8]:
from tensorflow.keras.layers import TextVectorization

In [9]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [10]:
MAX_TOKENS = 200000 #limit max number of words in the vocabulary

In [11]:
#create text vectoriser with max vocabulary size to 200000 tokens, output sequence length for each comment to 1800
vectorizer = TextVectorization(max_tokens=MAX_TOKENS,
                               output_sequence_length=1800,
                                output_mode = 'int' )

In [12]:
#create vocabulary
vectorizer.adapt(X.values)

In [15]:
#visualize vocabulary
vectorizer.get_vocabulary()

['',
 '[UNK]',
 np.str_('the'),
 np.str_('to'),
 np.str_('of'),
 np.str_('and'),
 np.str_('a'),
 np.str_('you'),
 np.str_('i'),
 np.str_('is'),
 np.str_('that'),
 np.str_('in'),
 np.str_('it'),
 np.str_('for'),
 np.str_('this'),
 np.str_('not'),
 np.str_('on'),
 np.str_('be'),
 np.str_('as'),
 np.str_('have'),
 np.str_('are'),
 np.str_('your'),
 np.str_('with'),
 np.str_('if'),
 np.str_('article'),
 np.str_('was'),
 np.str_('or'),
 np.str_('but'),
 np.str_('page'),
 np.str_('my'),
 np.str_('an'),
 np.str_('from'),
 np.str_('by'),
 np.str_('do'),
 np.str_('at'),
 np.str_('about'),
 np.str_('me'),
 np.str_('so'),
 np.str_('wikipedia'),
 np.str_('can'),
 np.str_('what'),
 np.str_('there'),
 np.str_('all'),
 np.str_('has'),
 np.str_('will'),
 np.str_('talk'),
 np.str_('please'),
 np.str_('would'),
 np.str_('its'),
 np.str_('no'),
 np.str_('one'),
 np.str_('just'),
 np.str_('like'),
 np.str_('they'),
 np.str_('he'),
 np.str_('dont'),
 np.str_('which'),
 np.str_('any'),
 np.str_('been'),
 np

In [None]:
#Vectorize the comments using the pre-trained vocabulary
vectorized_text = vectorizer(X.values)

In [None]:
print(vectorized_text) #shape is 1800 columns as we limit the output sequence length to 1800. for smaller comments, the output is filled with zeros.

tf.Tensor(
[[  645    76     2 ...     0     0     0]
 [    1    54  2489 ...     0     0     0]
 [  425   441    70 ...     0     0     0]
 ...
 [32445  7392   383 ...     0     0     0]
 [    5    12   534 ...     0     0     0]
 [    5     8   130 ...     0     0     0]], shape=(159571, 1800), dtype=int64)


In [20]:
#MCSHBAP - Map Cache Shuffle Batch Prefetch : data preprocessing steps 
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache() #tf used cached data instead of recaluculating it. speeds up data loading and processing
dataset = dataset.shuffle(buffer_size=160000) #Randomly reorders elements in a dataset
dataset = dataset.batch(batch_size=16) #Groups elements of the dataset into batches of the specified size
dataset = dataset.prefetch(buffer_size=8)#Prepares the next batch of data while the current batch is being processed, enabling parallel execution.

In [23]:
#get the next batch
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [25]:
batch_X.shape

(16, 1800)

In [26]:
len(dataset) #number of batches

9974

In [27]:
#train, validation and test split of batches
train = dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2)+1)
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1)+1)


## 2. Create Sequential Model

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding

In [29]:
#Architecture of our Sequential model
model = Sequential()
#Creating an Embedding Layer
model.add(Embedding(input_dim=MAX_TOKENS+1, output_dim=32))
#Birectional wrapper on LSTM layer
model.add(Bidirectional(LSTM(units=32, activation='tanh')))
#Fully Connected layers
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units= 256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
#Final Layer
model.add(Dense(units=6, activation='sigmoid'))


In [30]:
#loss is binarycrossentropy as output is multioutput with each being binary(0 or 1) (not categorical in nature)
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [31]:
#Training
hist = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6356s[0m 910ms/step - loss: 0.0870 - val_loss: 0.0504


## 3. Make Predictions

In [None]:
#sample text to predict toxicity levels
input_text = vectorizer('you dog! how dare you. i kill you.')

#need more training data to train threat and identity_hate levels as they are not classified as expected.


In [48]:
#predict
model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step


array([[0.9194784 , 0.02880241, 0.64095044, 0.03426983, 0.52797043,
        0.08968459]], dtype=float32)

In [46]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [None]:
#predtictions on test data
batch_test_X, batch_test_y = test.as_numpy_iterator().next()

In [40]:
model.predict(batch_test_X)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395ms/step


array([[9.37270403e-01, 3.68315615e-02, 7.00359821e-01, 3.81270982e-02,
        5.72996736e-01, 9.63530615e-02],
       [2.14353250e-03, 1.27473267e-08, 7.11488028e-05, 1.71507099e-06,
        1.17759082e-04, 7.32410117e-05],
       [9.10197198e-02, 5.32741069e-05, 8.93966481e-03, 7.66568526e-04,
        1.17885265e-02, 5.43924188e-03],
       [1.00994180e-03, 1.74843784e-09, 2.39812562e-05, 3.97127536e-07,
        4.13674643e-05, 2.56000803e-05],
       [5.52474894e-03, 1.52724581e-07, 2.71227502e-04, 1.04427718e-05,
        4.30642220e-04, 2.70960241e-04],
       [5.89785911e-03, 1.76884043e-07, 2.95050209e-04, 1.16714573e-05,
        4.68220940e-04, 2.92913639e-04],
       [1.02656859e-03, 1.89295113e-09, 2.50535704e-05, 4.23408977e-07,
        4.30451109e-05, 2.66997286e-05],
       [1.53703883e-03, 5.29294786e-09, 4.42110722e-05, 9.01851877e-07,
        7.43318233e-05, 4.60393749e-05],
       [4.57114214e-03, 9.29110016e-08, 2.07999066e-04, 7.28991836e-06,
        3.32463242e-04, 

### Evalutation and Performance metrics

In [49]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [50]:
# precision , recall and categoricalaccuracy performance metrics
pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [51]:
for batch in test.as_numpy_iterator():
    #unpack the batch
    X_true, y_true = batch
    #make a prediction
    yhat = model.predict(X_true)

    #flatten the prediction
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    rec.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [52]:
print(f'Precison: {pre.result().numpy()}, Recall: {rec.result().numpy()}, CategoricalAccuracy: {acc.result().numpy()}')

Precison: 0.8853638172149658, Recall: 0.5947712659835815, CategoricalAccuracy: 0.4799599051475525


## Save Model

In [53]:
from tensorflow.keras.models import load_model
import os

In [54]:
model.save(os.path.join('models', 'comment_toxicity.h5'))

