# Loading Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Dropout,Bidirectional,Embedding,TextVectorization,GRU,BatchNormalization
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

2024-07-02 14:24:41.612438: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Show the kernel's working directory; the relative CSV path used when loading the data is resolved against it.
!pwd

/bin/bash: /home/yasir/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/media/yasir/164381f5-80c4-42ad-b63a-fac7a881baa4/yasir/Documents/Projects/Comment Toxicity/Research


In [4]:
# Read the training CSV into a DataFrame and show it as the cell output.
TRAIN_CSV = "../artifacts/data_ingestion/data/train.csv"
df = pd.read_csv(TRAIN_CSV)
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


# Preprocessing

In [5]:
# Model input: the raw comment strings.
x = df["comment_text"]

# Targets: every column from position 2 onward, as a NumPy array.
label_columns = df.columns[2:]
y = df[label_columns].values

In [8]:
# Peek at the underlying array of comment strings.
x.values

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [7]:
# Inspect the label array built from df.columns[2:].
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
# Upper bound on the vocabulary size kept by the text vectorizer.
MAX_FEATURES = 200000

# Turns each comment into a sequence of 1800 integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode="int",
    output_sequence_length=1800,
)

In [None]:
# Learn the vocabulary from the comments, then encode all of them in one call.
comments = x.values
vectorizer.adapt(comments)
vectorized_text = vectorizer(comments)


In [None]:
# Inspect the vectorized comments produced by the vectorizer.
vectorized_text

In [None]:
# Preview only the first vocabulary entries: the full list can hold up to
# MAX_FEATURES tokens and would flood the cell output.
vectorizer.get_vocabulary()[:20]

In [None]:
# Print the encoded comments followed by the shape of the label array.
print(vectorized_text, y.shape, sep="\n")


In [None]:
# Build the input pipeline: cache -> shuffle -> batch -> prefetch.
#
# reshuffle_each_iteration=False is essential here. The train/val/test splits
# are carved out of this dataset with take()/skip(). With the default
# (reshuffle on every iteration) each pass over the data would draw a different
# permutation, so examples from the validation and test splits would leak into
# training across epochs.
dataset = (
    tf.data.Dataset.from_tensor_slices((vectorized_text, y))
    .cache()
    .shuffle(160000, reshuffle_each_iteration=False)
    .batch(64)
    .prefetch(8)
)

In [None]:
# Pull a single (features, labels) batch out of the pipeline as NumPy arrays.
batch_x, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Number of batches that make up 70% of the dataset.
int(0.7 * len(dataset))

In [None]:
# Split the batched dataset by position: first 70% of batches for training,
# the next 20% for validation, and 10% starting at the 90% mark for testing.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)
test_offset = int(n_batches * .9)
n_test = int(n_batches * .1)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(test_offset).take(n_test)

In [None]:
# Keep a NumPy iterator over the training split and display its first batch.
train_generator = train.as_numpy_iterator()
next(train_generator)

# Creating Model

In [None]:
# Text classifier: embedding -> bidirectional GRU -> dense feature extractor
# -> six sigmoid outputs.
model = Sequential([
    # Token-id embedding; MAX_FEATURES + 1 rows, 128-dimensional vectors.
    Embedding(MAX_FEATURES + 1, 128),
    # Bidirectional GRU layer (tanh activation).
    Bidirectional(GRU(128, activation='tanh')),
    BatchNormalization(),
    # Fully connected feature-extractor layers.
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dense(128, activation='relu'),
    # Final layer: one sigmoid unit per output.
    Dense(6, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:

# The network ends in six independent sigmoid units trained with binary
# cross-entropy, i.e. a multi-label setup. CategoricalAccuracy compares the
# argmax of y_true and y_pred, which is meaningless when a row can have zero
# or several positive labels, so track element-wise BinaryAccuracy instead.
model.compile(
    loss="BinaryCrossentropy",
    optimizer="Nadam",
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train for 20 epochs on the training split, validating on `val` after each one.
history = model.fit(
    train,
    validation_data=val,
    epochs=20,
)

In [None]:
from matplotlib import pyplot as plt

# Plot every series recorded in the training history against the epoch index.
# DataFrame.plot() opens its own figure, so the size has to be passed to it
# directly; a preceding plt.figure(figsize=...) would only leave an empty
# figure behind and the plot itself would use the default size.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

# Prediction

In [None]:
import numpy as np
#model=tf.keras.models.load_model("/kaggle/input/toxicity-model/toxicity.h5")

In [None]:
# Encode one sample sentence with the fitted vectorizer.
sample_comment = "hello how are you?"
input_text = vectorizer(sample_comment)

In [None]:
# Fetch ONE batch from the test split and predict on it. Building the iterator
# twice would re-run the pipeline (including the skip over the earlier batches)
# just to throw the first result away.
batch = next(test.as_numpy_iterator())
batch_X, batch_Y = batch
model.predict(batch_X)


In [None]:
# Add a leading batch axis to the single encoded sentence, then predict on it.
batched_input = np.expand_dims(input_text, axis=0)
result = model.predict(batched_input)
result

# Model Evaluation

In [None]:
# Streaming metrics accumulated over the test batches.
pre = Precision()
re = Recall()
# The evaluation loop feeds these metrics flattened 0/1 labels and sigmoid
# scores. CategoricalAccuracy would take an argmax over that whole flattened
# vector, which says nothing about per-label correctness; BinaryAccuracy
# thresholds each score and compares it element-wise with the label.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Run the model over every test batch and feed the flattened labels and
# predictions into each of the three metrics.
for x_true, y_true in test.as_numpy_iterator():
    yhat = model.predict(x_true).flatten()
    y_true = y_true.flatten()
    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)



In [None]:
# Report the accumulated metric values.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f' Precision:{precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

In [None]:
# Persist the trained model to the working directory.
model.save(filepath="toxicity.keras")

In [None]:
import os

# Delete a leftover pickled vectorizer if one exists. The existence check keeps
# the cell from raising FileNotFoundError when the file is absent (for example
# on a fresh run, or outside Kaggle where this path does not exist).
stale_vectorizer_path = "/kaggle/working/vectorizer.pkl"
if os.path.exists(stale_vectorizer_path):
    os.remove(stale_vectorizer_path)


# Gradio APP

In [None]:
!pip install gradio

In [None]:
!pip install numpy==1.24.0 pydantic==1.9.0

In [None]:
import gradio as gr

In [None]:
# Reload the model from the file this notebook saved ("toxicity.keras"), not
# from an external Kaggle input path, so a fresh Restart & Run All works.
model = tf.keras.models.load_model("toxicity.keras")

In [None]:
def score_comment(comment):
    """Score one comment and return a per-label True/False report.

    Args:
        comment: The raw comment text (a single string).

    Returns:
        A string with one ``"<label>: <bool>"`` line per column in
        ``df.columns[2:]``; the boolean is True when the model's score for
        that label is above 0.5.
    """
    # Wrapping the string in a list makes the vectorizer return a batch of
    # one sequence, which is already the 2-D input the model expects; adding
    # another axis with expand_dims would hand the model a rank-3 tensor.
    vectorized = vectorizer([comment])
    result = model.predict(vectorized)
    scores = result[0]
    lines = [
        '{}: {}\n'.format(col, scores[idx] > 0.5)
        for idx, col in enumerate(df.columns[2:])
    ]
    return ''.join(lines)

In [None]:
# Smoke-test score_comment on a hand-written sample; the report is stored in
# `result` and not displayed by this cell.
input_str="I will kill you, you black!"
result=score_comment(input_str)

In [None]:
# Wrap score_comment in a simple text-in / text-out Gradio UI. Components are
# taken from the top-level namespace (gr.Textbox): the legacy gr.inputs
# namespace is deprecated and is removed in current Gradio releases, which is
# what the unpinned `pip install gradio` pulls in.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='comment to score'),
    outputs='text',
)

In [None]:
# Start the Gradio app. NOTE(review): share=True presumably requests a publicly
# reachable link — confirm that exposing the model this way is intended.
interface.launch(share=True)