# Install Dependencies and Bring in Data

In [52]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [53]:
# !pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [54]:
import os                  # To deal file path
import pandas as pd        # Read tabular data
import tensorflow as tf    # For deep learning
import numpy as np         # Array manipulation

In [4]:
# Load the training CSV from the local data directory (path is relative to the
# notebook's working directory; the file sits inside a folder also named train.csv).
train_csv_path = os.path.join('jigsaw_toxic_comment', 'train.csv', 'train.csv')
df = pd.read_csv(train_csv_path)
# Colab alternative:
# df = pd.read_csv("/content/gdrive/MyDrive/Comment_Toxicity/jigsaw_toxic_comment/train.csv/train.csv")

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Preprocess

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
# Labels: every column from position 2 onward, as a 2-D array (one row per comment).
y = df.loc[:, df.columns[2:]].values
# Features: the raw comment strings, kept as a pandas Series.
X = df.loc[:, "comment_text"]

In [8]:
# Upper bound on the number of words kept in the vocabulary.
MAX_FEATURES = 200_000

In [9]:
# Text -> integer token ids. Every comment is emitted as a fixed-length row of
# 1800 ids, with a vocabulary capped at MAX_FEATURES entries.
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
)

In [10]:
# Fit the vectorizer on the comment text; `.values` hands the Series' underlying
# array to adapt() instead of the pandas Series itself.
vectorizer.adapt(X.values)

In [11]:
# Sanity check: vectorize one sentence and display only its first five token ids
# (the full output is 1800 ids long, per output_sequence_length above).
# NOTE(review): punctuation is presumably stripped by the layer's default
# standardization - confirm against the TextVectorization docs.
vectorizer("Hello world, I am great !")[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([286, 261,   8,  74, 275], dtype=int64)>

In [12]:
# Vectorize the whole corpus in a single call. The result is one dense int64
# tensor of shape (num_comments, 1800) - roughly 2.3 GB for the 159571 comments
# shown in the output below - so this cell needs that much memory available.
vectorized_text = vectorizer(X.values)

In [13]:
len(vectorized_text), len(X)

(159571, 159571)

In [14]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

In [15]:
# Input pipeline - MCSHBAP: Map, Cache, Shuffle, Batch, Prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE and keep that order on every pass. The train/val/test subsets are
# carved out of this dataset with take()/skip(); with the default
# reshuffle_each_iteration=True every new iteration would draw a different
# permutation, so examples would drift between the subsets and held-out data
# would leak into training. The buffer (160000) exceeds the number of examples,
# so the single shuffle is a full uniform shuffle.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)       # keep up to 8 batches ready ahead of the consumer

# "Bottleneck" here means an input-pipeline bottleneck: without prefetching, the
# model would sit idle while the next batch is being prepared. (It is unrelated
# to a "bottleneck layer", i.e. a narrow layer inside a network.)
# Trade-off of the fixed order: training sees the batches in the same sequence
# each epoch. To reshuffle per epoch, apply an additional shuffle() to the
# training subset only, after the split.

In [16]:
# Pull a single (features, labels) batch as NumPy arrays to inspect its shapes.
# The features are already integer token ids at this point, not raw text.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [17]:
batch_y.shape, batch_X.shape

((16, 6), (16, 1800))

In [18]:
len(dataset), int(len(dataset)*0.7)

(9974, 6981)

In [19]:
# 70 / 20 / 10 split, measured in batches. Each count is truncated separately,
# so with the 9974 batches reported above one batch falls between val and test
# (6981 + 1994 = 8975, while test starts after skipping 8976).
n_batches = len(dataset)
train = dataset.take(int(n_batches*0.7))
val = dataset.skip(int(n_batches*0.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*0.9)).take(int(n_batches*0.1))

In [20]:
len(train), len(val), len(test)

(6981, 1994, 997)

# Create Sequential Model

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [22]:
# Multi-label text classifier, assembled from an ordered list of layers.
model = Sequential([
    # Token id -> 32-dimensional vector; the table has MAX_FEATURES+1 rows.
    Embedding(MAX_FEATURES+1, 32),
    # LSTM (tanh activation) read in both directions over the sequence.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor stack.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: 6 sigmoid units, matching the 6 label columns in y.
    Dense(6, activation='sigmoid'),
])

In [23]:
# Loss and optimizer are selected by name. No `metrics` argument is passed here;
# precision/recall/accuracy are computed by hand in the evaluation section.
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [25]:
# model.fit(train, epochs=1, validation_data=val)

In [26]:
# model.save('toxicity_1_epoch.h5')
# model.save('/content/gdrive/MyDrive/Comment_Toxicity/toxicity_1_epoch.h5')

In [27]:
# Rebinds `model` to a previously saved network, discarding the freshly built
# (never fitted) model from the cells above. Requires toxicity_1_epoch.h5 in the
# working directory - presumably produced by the commented-out fit/save cells;
# uncomment those on a first run.
model = tf.keras.models.load_model('toxicity_1_epoch.h5')
# model = tf.keras.models.load_model('/content/gdrive/MyDrive/Comment_Toxicity/toxicity_1_epoch.h5')

# Make Predictions

In [28]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [29]:
res = model.predict(np.expand_dims(input_text,0))



In [30]:
res

array([[0.9393882 , 0.23571797, 0.7556582 , 0.08168072, 0.6369517 ,
        0.2089775 ]], dtype=float32)

In [31]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [32]:
# Grab the first batch of the test split as NumPy arrays.
batch_X, batch_y = next(test.as_numpy_iterator())

In [33]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [34]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [35]:
res.shape

(1, 6)

In [36]:
res

array([[0.9393882 , 0.23571797, 0.7556582 , 0.08168072, 0.6369517 ,
        0.2089775 ]], dtype=float32)

In [37]:
res.flatten()

array([0.9393882 , 0.23571797, 0.7556582 , 0.08168072, 0.6369517 ,
       0.2089775 ], dtype=float32)

# Evaluate Model

In [38]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [39]:
# Streaming metrics, accumulated batch by batch in the evaluation loop.
precision = Precision()
recall = Recall()
# Every label is an independent yes/no decision, so accuracy must be measured
# per element against a 0.5 threshold. CategoricalAccuracy compares argmax
# positions instead, which only makes sense for one-hot single-label targets
# and is meaningless on flattened multi-label vectors.
accuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Reports the current state of the three metric objects.
# NOTE(review): this cell sits before the evaluation loop that calls
# update_state(); run top to bottom it shows the metrics' initial (empty) state.
# Run it again after the loop to see the real scores.
print(f'Precision: {precision.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

In [None]:
# Evaluate on the test split: accumulate precision / recall / accuracy over
# every batch, then report the final values.

# Start from a clean slate so re-running this cell does not double-count batches.
for metric in (precision, recall, accuracy):
    metric.reset_state()

for X_true, y_true in test.as_numpy_iterator():
    # verbose=0: skip the per-call progress bar, which would otherwise be
    # printed once for every batch in the test split.
    yhat = model.predict(X_true, verbose=0)

    # Collapse (batch, labels) into one flat vector so each label decision is
    # counted as an individual binary prediction.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    precision.update_state(y_true, yhat)
    recall.update_state(y_true, yhat)
    accuracy.update_state(y_true, yhat)

# Report after the metrics have actually been updated.
print(f'Precision: {precision.result().numpy()}, Recall:{recall.result().numpy()}, Accuracy:{accuracy.result().numpy()}')

In [41]:
# To get better performance, run for more epochs. We only ran for one epoch.

# Test and Gradio

In [42]:
# !pip install gradio jinja2

In [43]:
import gradio as gr

In [55]:
import tensorflow as tf
# Loads the same toxicity_1_epoch.h5 file as the earlier load_model cell and
# rebinds `model` again; this lets the Gradio section run without re-executing
# the model-building cells, at the cost of a duplicate load on a full run.
model = tf.keras.models.load_model("toxicity_1_epoch.h5")
# model = tf.keras.models.load_model("/content/gdrive/MyDrive/Comment_Toxicity/toxicity_1_epoch.h5")

In [56]:
input_str = vectorizer('hey i kill hate you!')

In [57]:
res = model.predict(np.expand_dims(input_str,0))



In [58]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [59]:
(res > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0]])

In [60]:
def score_comment(comment):
    """Score one comment and return a text report with one line per label.

    The comment is vectorized with the notebook-level `vectorizer`, passed
    through `model`, and each predicted score is compared against 0.5.

    Args:
        comment: the raw comment string to score.

    Returns:
        A string of "<label>: <True|False>" lines, each ending in a newline,
        following the order of `df.columns[2:]`.
    """
    # predict() returns one row per input; a single comment was submitted.
    scores = model.predict(vectorizer([comment]))[0]

    report_lines = [
        '{}: {}\n'.format(label, scores[pos]>0.5)
        for pos, label in enumerate(df.columns[2:])
    ]
    return ''.join(report_lines)

In [61]:
import warnings
# Blanket suppression kept from the original cell; note that it silences ALL
# warnings for the rest of the kernel session, not just Gradio's.
warnings.filterwarnings('ignore')

# Minimal web UI: one two-line textbox in, the score_comment() report out.
# gr.Textbox is the top-level component; the older gr.inputs.Textbox namespace
# is deprecated and no longer exists in newer Gradio releases.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [62]:
# Start the Gradio server. share=True additionally publishes a temporary public
# URL (see the output below), so anyone holding that link can reach the model
# while it is live; use share=False to stay local-only.
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://475e183517e16ee6.gradio.app

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x1f447757640>,
 'http://127.0.0.1:7861/',
 'https://475e183517e16ee6.gradio.app')

