# 1.) Install dependancies and load the data

In [31]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn





In [32]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [33]:
df = pd.read_csv(r'C:\Users\Atul\Downloads\data_toxicity_comments\data.csv')  #Since training data is large enough we use this as our entire dataset
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [34]:
len(df) # this dataset is too large to be trained on our local system so we randomly sample 40000 examples including all toxic examples and ~ 25k examples from non toxic examples


159571

The labels show the kind of toxicity of the comment. Can have mutliple labels for each comment. all 0s indicate the comment is not toxic

In [39]:
df.iloc[0]['comment_text']

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [40]:
df.iloc[:,2:].sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

# 2.) Preprocess the comments


## i.) Tokenize the data using TextVecotrization API of Keras - 

This layer has basic options for managing text in a Keras model. It transforms
  a batch of strings (one example = one string) into either a list of token
  indices (one example = 1D tensor of integer token indices) or a dense
  representation (one example = 1D tensor of float values representing data
  about the example's tokens). This layer is meant to handle natural language
  inputs. To handle simple string inputs (categorical strings or pre-tokenized
  strings) see `tf.keras.layers.StringLookup`.

  The vocabulary for the layer must be either supplied on construction or
  learned via `adapt()`. When this layer is adapted, it will analyze the
  dataset, determine the frequency of individual string values, and create a
  vocabulary from them. This vocabulary can have unlimited size or be capped,
  depending on the configuration options for this layer; if there are more
  unique values in the input than the maximum vocabulary size, the most frequent
  terms will be used to create the vocabulary.

  The processing of each example contains the following steps:

  1. Standardize each example (usually lowercasing + punctuation stripping)
  2. Split each example into substrings (usually words)
  3. Recombine substrings into tokens (usually ngrams)
  4. Index tokens (associate a unique int value with each token)
  5. Transform each example using this index, either into a vector of ints or
     a dense float vector.

In [41]:
from tensorflow.keras.layers import TextVectorization

In [42]:
TextVectorization??

In [43]:
x = df['comment_text']
y = df.iloc[:,2:].values

In [44]:
y  # we want our output in form of vectors we can pass to our model directly

array([[1, 1, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [45]:
max_features = 200000 #number of words we want to include in our dictionary

In [46]:
vectorizer = TextVectorization(max_tokens = max_features, 
                                output_sequence_length = 1800,  #each sentence in our input we will cap at 1800 words and wont take words beyond that so that each sentence's mapped array is of length 1800
                                  output_mode = 'int') # want our encodings to be integers


In [47]:
x.values

array(['COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK',
       "Bye! \n\nDon't look, come or think of comming back! Tosser.", ...,
       "just dumped this bit of POV commentary in the article, which I've reverted.\n\nSubversion's lack of support for certain common features distinguishes it from other version control products:",
       '"\nThe userbox says, ""This user is a native speaker of English."" It makes no claim as to the level that I can contribute at. Yes, my failure to capitalize properly is on the same level as you mispelling \'useful\'. But it is not the same as this. There are many other examples of this poor English usage. \nTwo more things: when  "',
       'Wrestlemania \n\nIt was not a test. In my view TBA looks better than TBD.'],
      dtype=object)

In [48]:
vectorizer.adapt(x.values)  #making our vectorizer learn the dictionary by passing all our comments represneted as a numpy array

In [49]:
vectorizer('Hello World, life is great')  # we get a 1800 dim array where only first 5 words have values and rest 1795 will be 0

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([335, 253, 188, ...,   0,   0,   0], dtype=int64)>

In [50]:
vectorizer('Hello World, life is great')[:5] # so Hello is mapped to 286, World to 261 and so on.....

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([335, 253, 188,   9, 316], dtype=int64)>

In [51]:
# Applying the vectorizer on our data i.e. Tokenizing our dataset
vectorized_text = vectorizer(x.values)

In [52]:
vectorized_text # we get one 1800 length array for each sentence in our datset

<tf.Tensor: shape=(41225, 1800), dtype=int64, numpy=
array([[  518,   159,     3, ...,     0,     0,     0],
       [  306,    37,     9, ...,     0,     0,     0],
       [ 2233,    45,   155, ...,     0,     0,     0],
       ...,
       [   47, 12346,    14, ...,     0,     0,     0],
       [    2,  4128,   333, ...,     0,     0,     0],
       [ 6472,    12,    25, ...,     0,     0,     0]], dtype=int64)>

# 3.) Creating a tensorflow data pipeline

In [53]:
# particularly useful when have data that cant fit in memory so pull data in batches
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000) #shuffle the data to induce randomness
dataset = dataset.batch(16)  #split data into batches of 16 examples. This will automatically enable batch gradient descent.
dataset = dataset.prefetch(8)  #helps prevent bottlenecks

In [54]:
dataset.as_numpy_iterator().next() #get a new batch each time we run this

(array([[ 2619,     5,   879, ...,     0,     0,     0],
        [ 1452,     8, 11885, ...,     0,     0,     0],
        [   18,   128,    11, ...,     0,     0,     0],
        ...,
        [    7,    75,  5467, ...,     0,     0,     0],
        [    3,    17,   118, ...,     0,     0,     0],
        [ 2536,     7,   169, ...,     0,     0,     0]], dtype=int64),
 array([[1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [1, 1, 1, 0, 1, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

In [55]:
len(dataset)  #number of batches

2577

In [56]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [57]:
print(f'Number of batches in train set is {len(train)}, in val set is {len(val)} and in test set is {len(test)}')

Number of batches in train set is 1803, in val set is 515 and in test set is 257


# 4.) Building the Model

Our model will aimt to learn the word embeddings(equivalent of weights in ML and is what our backprop will try to learn) - word embedding is a term used for the representation of words for text analysis, typically in the form of a real-valued vector that encodes the meaning of the word such that the words that are closer in the vector space are expected to be similar in meaning ex: the features learned could be (royalty,fruit and aggression) so each word will be transformed into a 3 vector like the word apple will be (0.1,0.8,0) and word idiot will be (0,0.1,0.8). We cant understand the feature space but can be intuituvely represented like this. So basically if the word 'stupid' after word vectorization was 268 then the word embedding learned for this will change it to [0,1,0,0.7]

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [59]:
model = Sequential()

#create the embedding layer where 32 valued embedding learned for each word in our dictionary and 1 extra for any unknown word that may get encountered   
model.add(Embedding(max_features+1, 32)) #if we had a state of the art embedding for such a dataset then could have used that to save computational effort but if not then our DNN can also learn these embeddings just as well

# create LSTM layer where it will have 32 lstm units and each have activation as tanh because tensorflow accelaeration dictates lstm layer needs to have activation as tanh
# bidirectional lstm useful for sentence based inputs as words prior as well as after a specific word may provide context to the word's meaning
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation = 'relu'))
model.add(Dense(256, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))

# our output layer will be a 6 valued signmoid layer where each node can be a 0 or 1. which is how our output is configured.
model.add(Dense(6, activation = 'sigmoid'))

In [60]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam') # we didnt use our loss as something like categoricalcross entropy,etc because our output is same as running 6 independant binary classifiers

In [61]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [62]:
history = model.fit(train, epochs = 3, validation_data= val) #
the mini-batch gradient descent gets implemented inherently by our tensorflow datset utility we implemented at the start where as we pass the train data here - after each iteration the next bacth ac cessed by as_numpy_iterator() method. Actually only the first batch is passed and on each iteration the next batch is passed till the epoch gets completed
# we would have had more epochs but due to computing restraints we use only 1 epoch.

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [63]:
history.history

{'loss': [0.1956053525209427, 0.14438629150390625, 0.13146546483039856],
 'val_loss': [0.14407525956630707, 0.13660265505313873, 0.11468717455863953]}

# 5.) Making Predictions

In [64]:
input_text = vectorizer('You freaking suck!')

In [65]:
input_text #cant directly run predict on this as our model is expecting data in form of batch or a series of values so we do as below

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([   3, 3902,  100, ...,    0,    0,    0], dtype=int64)>

In [66]:
res = model.predict(np.expand_dims(input_text,0))
res  # so we can say the comment is toxic but not severelt tocix. It is obscene but not a threat, but is an insult - we keep 0.5 as our threshold for being a positive example



array([[0.9996031 , 0.44421148, 0.9864358 , 0.01112614, 0.8723909 ,
        0.10278688]], dtype=float32)

In [67]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [68]:
# Lets see if we can trigger the threat node
model.predict(np.expand_dims(vectorizer('I am going to kill and punch you to death'),0)) # so we can see we may need to train on threat examples more



array([[0.7139152 , 0.11221574, 0.40193668, 0.20424493, 0.4668107 ,
        0.27847457]], dtype=float32)

# 6.) Evaluating the model

In [69]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [70]:
# useful to use if we are processing the results of data one batch at a time and want to accumulate result for each batch
pr = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [71]:
for batch in test.as_numpy_iterator():  #loop through each batch of test data
    x_true, y_true = batch  
    yhat = model.predict(x_true)
    
    #Flatten the predictions so that they are a single vector 
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pr.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
    



In [72]:
print(f'Precision: {pr.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

# if we would have trained on entire dataset and increase our epochs then our model's performance will increase significantly

Precision: 0.8248710036277771, Recall: 0.8387631177902222, Accuracy: 0.17898832261562347


In [73]:
model.save('toxicity.h5') #saving our model to our current working directory

In [74]:
df.to_csv('data.csv')