## Read in Data + Install Packages

In [1]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

Collecting tensorflow
  Downloading tensorflow-2.8.0-cp38-cp38-win_amd64.whl (438.0 MB)
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.8.0-cp38-cp38-win_amd64.whl (438.0 MB)
Collecting pandas
  Downloading pandas-1.4.2-cp38-cp38-win_amd64.whl (10.6 MB)
Collecting matplotlib
  Downloading matplotlib-3.5.1-cp38-cp38-win_amd64.whl (7.2 MB)
Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.25.0-cp38-cp38-win_amd64.whl (1.5 MB)
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting keras<2.9,>=2.8.0rc0
  Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
Collecting protobuf>=3.9.2
  Downloading protobuf-3.20.1-cp38-cp38-win_amd64.whl (904 kB)
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.44.0-cp38-cp38-win_amd64.whl 

In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
df = pd.read_csv(os.path.join('toxic_speech_data','train.csv', 'train.csv'))

In [5]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


## Data Preprocessing

In [15]:
from tensorflow.keras.layers import TextVectorization

In [16]:
data = df['comment_text']
labels = df[df.columns[2:]].values

In [17]:
data.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [19]:
labels[:5]

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [23]:
print(f'Data type of labels: {type(labels)}')
print(f'Data type of data: {type(data)}')

Data type of labels: <class 'numpy.ndarray'>
Data type of data: <class 'pandas.core.series.Series'>


In [24]:
# Num of words that can be stored
MAX_FEATURES = 10000

In [59]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1000,
                               output_mode='int')

In [60]:
vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x28c06fc18b0>

In [61]:
vectorizer.adapt(data.values)

In [62]:
# Here you can see where each word is stored in our 'dictionary'
vectorizer('Hello, it is me')[:4]

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([286,  12,   9,  36], dtype=int64)>

In [63]:
# Building up vectorized dataset
vectorized_text = vectorizer(data.values)

In [34]:
vectorized_text[:5]

<tf.Tensor: shape=(5, 1500), dtype=int64, numpy=
array([[ 643,   76,    2, ...,    0,    0,    0],
       [   1,   54, 2506, ...,    0,    0,    0],
       [ 425,  440,   70, ...,    0,    0,    0],
       [  60,    8,  199, ...,    0,    0,    0],
       [   7, 1656,   20, ...,    0,    0,    0]], dtype=int64)>

In [64]:
# Tensorflow data pipeline (map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file)
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, labels))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [65]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.15))
test = dataset.skip(int(len(dataset)*.85)).take(int(len(dataset)*.15))

## Building Sequential Model

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [67]:
model = Sequential()

# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))

# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [70]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [71]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          320032    
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 6)                 774       
                                                      

In [101]:
history = model.fit(train, epochs=1, validation_data=val)

 798/6981 [==>...........................] - ETA: 34:35 - loss: 0.0802

KeyboardInterrupt: 

In [103]:
history.history

{'loss': [0.06039100140333176], 'val_loss': [0.049944598227739334]}

## Test Predictions

In [89]:
input_text = vectorizer('I hate you stupid man.')

In [90]:
res = model.predict(np.array([input_text]))

In [93]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [91]:
(res > 0.5).astype(int)

array([[1, 0, 0, 0, 1, 0]])

In [94]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [95]:
(model.predict(batch_X) > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [96]:
res.shape

(1, 6)

## Evaluation

In [97]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [98]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [99]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [100]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8020133972167969, Recall:0.3114296495914459, Accuracy:0.3803475797176361


In [104]:
model.save('demo.h5')

## Setup Gradio

In [107]:
!pip install gradio jinja2

Collecting gradio
  Downloading gradio-2.9.4-py3-none-any.whl (2.9 MB)
Collecting paramiko
  Downloading paramiko-2.10.4-py2.py3-none-any.whl (212 kB)
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting pycryptodome
  Downloading pycryptodome-3.14.1-cp35-abi3-win_amd64.whl (1.8 MB)
Collecting uvicorn
  Downloading uvicorn-0.17.6-py3-none-any.whl (53 kB)
Collecting fastapi
  Downloading fastapi-0.75.2-py3-none-any.whl (54 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp38-cp38-win_amd64.whl (555 kB)
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting orjson
  Downloading orjson-3.6.8-cp38-none-win_amd64.whl (184 kB)
Collecting frozenlist>=1.1.1
  Downloading fr

In [108]:
import gradio as gr

  "class": algorithms.Blowfish,


In [105]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [109]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')

In [110]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7860/
Running on public URL: https://52205.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x28c1b0bb6d0>,
 'http://127.0.0.1:7860/',
 'https://52205.gradio.app')