### Install and Import Dependencies

In [360]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Read train.csv from the local Jigsaw toxic-comment challenge folder and preview it.
train_csv_path = os.path.join("jigsaw-toxic-comment-classification-challenge", "train.csv")
df = pd.read_csv(train_csv_path)
df.head()

### Preprocessing Data

In [362]:
from tensorflow.keras.layers import TextVectorization

In [363]:
# Inputs are the raw comment strings; targets are every column from index 2 onward.
label_columns = df.columns[2:]
X = df['comment_text']
y = df[label_columns].to_numpy()

In [None]:
# Inspect the label array taken from the columns at index 2 and beyond.
y

In [None]:
# Inspect the comment_text column used as model input.
X

In [366]:
# Upper bound on the vocabulary size handed to the text vectorizer (max_tokens).
MAX_FEATURES = 160_000

In [367]:
# Turn each comment into a fixed-length (1800) sequence of integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
    output_mode='int',
)

In [368]:
# Fit the vectorizer's vocabulary on the raw comment strings.
vectorizer.adapt(X.values)

In [None]:
# Peek at the start of the learned vocabulary only: the full list can hold up
# to MAX_FEATURES tokens, which floods the cell output and bloats the notebook.
vectorizer.get_vocabulary()[:20]

In [370]:
# Convert every comment to its integer token-id sequence in a single call.
vectorized_text = vectorizer(X.values)

In [None]:
# Inspect the vectorized comments.
vectorized_text

In [372]:
# MCSHBAP - map, cache, shuffle, batch, prefetch (sources: from_tensor_slices, list_files)

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle into one fixed order. By default tf.data reshuffles on every
# iteration, so the take()/skip() partitions derived from this dataset would
# hold different examples each time they are iterated and the train,
# validation and test sets would overlap (evaluation on already-seen data).
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # prepare upcoming batches while the current one is consumed

In [373]:
# Pull one batch from the pipeline as NumPy arrays for a quick sanity check.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Check the shape of the label batch pulled from the pipeline.
batch_y.shape

In [375]:
# Partition the batched dataset 70% / 20% / 10% into train, validation and test.
# NOTE(review): the partitions are only disjoint if `dataset` yields its
# batches in the same order on every iteration — confirm the shuffle settings.
n_batches = len(dataset)
n_train = int(n_batches * 0.7)
n_val = int(n_batches * 0.2)
n_test = int(n_batches * 0.1)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
test = dataset.skip(int(n_batches * 0.9)).take(n_test)

In [None]:
# Display the dataset object (its repr) to check the pipeline's element spec.
dataset

In [None]:
# Show the first training batch as NumPy arrays.
next(train.as_numpy_iterator())

In [None]:
# NOTE(review): this only creates an iterator over the validation batches and
# displays the iterator object itself; no batch is fetched — confirm whether
# `.next()` was intended, as in the cell that inspects `train`.
val.as_numpy_iterator()

### Create Sequential Model

In [379]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:

# Vocabulary size and padded sequence length. Keep these in sync with the
# TextVectorization layer (max_tokens and output_sequence_length).
MAX_FEATURES = 160000
INPUT_LENGTH = 1800

model = Sequential()

# Embedding layer: maps each integer token id to a 32-dimensional vector.
model.add(Embedding(input_dim=MAX_FEATURES + 1, output_dim=32))

# Bidirectional LSTM over the embedded sequence (LSTM's default activation is tanh).
model.add(Bidirectional(LSTM(32)))

# Fully connected layers on top of the LSTM output.
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Output layer: six independent sigmoid units, i.e. multi-label classification
# (each label gets its own probability), paired with binary cross-entropy below.
model.add(Dense(6, activation='sigmoid'))

# NOTE(review): with a 6-unit output, the 'accuracy' shortcut may not resolve
# to an element-wise binary accuracy — confirm the reported metric is the one
# you want for multi-label targets.
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build with an explicit (batch, INPUT_LENGTH) input shape. Without a shape,
# build() cannot create the weights of a Sequential model that has no Input
# layer: depending on the Keras version it either raises or silently leaves
# the model unbuilt, so summary() has no shapes or parameter counts to show.
model.build(input_shape=(None, INPUT_LENGTH))

# Display the layer-by-layer summary to confirm the model built successfully.
model.summary()


In [None]:
# Train for a single epoch, evaluating on the validation batches at the end of it.
history = model.fit(
    train,
    epochs=1,
    validation_data=val,
)

In [None]:
# Inspect the metrics recorded by model.fit.
history.history

In [383]:
import matplotlib.pyplot as plt

In [None]:
# Let pandas create the figure at the desired size. A preceding plt.figure()
# call only leaves an empty figure behind, because DataFrame.plot() opens its
# own figure and ignores the one created beforehand.
ax = pd.DataFrame(history.history).plot(figsize=(8, 6))
ax.set(title='Training history', xlabel='Epoch', ylabel='Metric value')
plt.show()

### Make Predictions

In [385]:
# Vectorize one sample comment for a quick manual prediction.
input_text = vectorizer("You freaking suck!")

In [None]:
# Add a leading batch axis so the single sequence is fed as a batch of one.
single_batch = np.expand_dims(input_text, axis=0)
res = model.predict(single_batch)

In [None]:
# Show the model output for the sample comment.
print(res)

In [388]:
# Fetch one batch from the test split as NumPy arrays.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 predictions per label.
batch_scores = model.predict(batch_X)
(batch_scores > 0.5).astype(int)

### Evaluating Model

In [390]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [391]:
# Streaming metrics that accumulate state across the test batches.
pre = Precision()
re = Recall()
# The model emits independent per-label sigmoid scores, so accuracy has to be
# computed element-wise. CategoricalAccuracy compares argmax positions, which
# is meaningless for multi-label targets and for the flattened vectors that the
# evaluation loop passes to update_state().
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision, recall and accuracy over every batch of the test split.
for X_true, y_true in test.as_numpy_iterator():
    # verbose=0: predict() would otherwise print a progress bar for each batch,
    # flooding the cell output.
    yhat = model.predict(X_true, verbose=0)

    # Flatten so each (sample, label) pair is scored as one binary decision.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
# Report the metric values accumulated over the test batches.
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

### Test and Gradio

In [None]:
import tensorflow as tf

In [394]:
import gradio as gr

In [None]:
# Persist the trained model to disk.
# NOTE(review): the .h5 extension presumably selects the legacy HDF5 format — confirm this is intended.
model.save('toxicity.h5')

In [398]:
# Vectorize another sample comment to try the trained model on.
input_str = vectorizer('hey i freaken hate you!')

In [None]:
# Wrap the single sequence in a batch axis before predicting.
sample_batch = np.expand_dims(input_str, axis=0)
res = model.predict(sample_batch)

In [None]:
# Label column names: the same columns (index 2 onward) that the targets were built from.
df.columns[2:]

In [None]:
# Boolean decision per label: True where the predicted score exceeds 0.5.
res > 0.5

In [412]:
def score_comment(comment):
    """Classify one comment and report the decision for every label.

    Vectorizes `comment` with the notebook's `vectorizer`, runs it through
    `model`, and returns a string with one ``<label>: <True|False>`` line per
    label column of `df` (columns from index 2 onward). The flag is True when
    the predicted score for that label is greater than 0.5.
    """
    scores = model.predict(vectorizer([comment]))[0]
    labels = df.columns[2:]
    return ''.join(
        '{}: {}\n'.format(label, scores[position] > 0.5)
        for position, label in enumerate(labels)
    )

In [None]:
# Smoke-test the scoring helper on a sample comment.
score_comment("I am going to kill you")

In [None]:


# Build the Gradio UI: a two-line text input wired to score_comment, with a text output.
comment_box = gr.Textbox(lines=2, placeholder='Comment to score')
result_box = gr.Textbox()
interface = gr.Interface(fn=score_comment, inputs=comment_box, outputs=result_box)

# Start the app.
# NOTE(review): share=True presumably exposes the app through a public link — confirm that is intended.
interface.launch(share=True)
