In [1]:
!pip install tensorflow pandas matplotlib scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/TRAIN/train.csv')

In [None]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
#Preprocessing

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
x = df['comment_text']
y = df[df.columns[2:]].values

In [None]:
print(x)

0         Explanation\r\nWhy the edits made under my use...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\r\nMore\r\nI can't make any real suggestions...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \r\n\r\nThat...
159568    Spitzer \r\n\r\nUmm, theres no actual article ...
159569    And it looks like it was actually you who put ...
159570    "\r\nAnd ... I really don't think you understa...
Name: comment_text, Length: 159571, dtype: object


In [None]:
print(y)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [None]:
MAX_FEATURES=200000 #number of words in the vocab

In [None]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length = 1800, output_mode='int')

In [None]:
vectorizer.adapt(x.values) #trained vectorizer to learn our vocab

In [None]:
vectorizer("Hello world life is amazing")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([288, 263, 306, ...,   0,   0,   0])>

In [None]:
vectorized_text = vectorizer(x.values) #tokenized every single word in the vocab

In [None]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [None]:
#creating a tensorflow data pipeline
#data pipeline steps - MCSHABAP Map, Cache, SHuffle, BAtch, Prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16) # divided into batches
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [None]:
batch_x, batch_y = dataset.as_numpy_iterator().next()
#batch represented as text + labels

In [None]:
print(len(dataset)) #this many number of batches now
print(len(dataset)*16) #actual number
#idea of shape
print(batch_x.shape)
print(batch_y.shape)

9974
159584
(16, 1800)
(16, 6)


In [None]:
train = dataset.take(int(len(dataset)*0.7)) #"take" that partition out as train data for training. Taking 70% data for train
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2))
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [None]:
print(len(train))
print(len(val))
print(len(test))

6981
1994
997


CREATE SEQUENTIAL MODEL

In [None]:
#embedding layer groups similar words; creates word vectors and checks the angles between them
#Personality test for the word: Words having similar scores are grouped e.g. great and good

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, concatenate

MAX_FEATURES = 10000
EMBEDDING_DIM = 100

model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=MAX_FEATURES+1, output_dim=EMBEDDING_DIM))

# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=128, activation='tanh', return_sequences=True)))
model.add(Dropout(0.2))

# Convolutional layer
model.add(Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'))
model.add(GlobalMaxPooling1D())

# Dense layers
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(units=6, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(model.summary())


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         1000100   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 256)        234496    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, None, 256)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, None, 64)          81984     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_8 (Dense)             (None, 128)              

In [None]:
#model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 100)         1000100   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 256)        234496    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, None, 256)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, None, 64)          81984     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_8 (Dense)             (None, 128)              

In [None]:
history = model.fit(train,epochs=5, validation_data=val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [None]:
history.history

In [None]:
batch_x, batch_y = test.as_numpy_iterator().next()

In [None]:
df.columns[2:]

In [None]:
(model.predict(batch_x) > 0.5).astype(int)

In [None]:
#Evaluation of model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
  #unpack the batch
  x_true, y_true = batch
  #make a prediction
  y_that = model.predict(x_true)

  #flatten predictions
  y_true = y_true.flatten()
  y_that = y_that.flatten()

  pre.update_state(y_true, y_that)
  re.update_state(y_true, y_that)
  acc.update_state(y_true, y_that)

NameError: ignored

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

In [None]:
#APP

In [None]:
!pip install gradio jinja2

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
def score_comment(comment):
  vectorized_comment = vectorizer([comment])
  results = model.predict(vectorized_comment)

  text = ''
  for idx, col in enumerate(df.columns[2:-1]):
    text += '{}: {}\n'.format(col, results[0][idx]>0.5)

  return text

In [None]:
interface = gr.Interface(fn=score_comment, inputs = gr.inputs.Textbox(lines=2, placeholder = 'Comment to score'), outputs = 'text')

In [None]:
interface.launch(share=True)