In [19]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.layers import TextVectorization, LSTM, Bidirectional, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential, Model
from sklearn.model_selection import train_test_split

In [5]:
# Read train.csv (path relative to the notebook's working directory) into a DataFrame.
# Later cells use its "comment_text" column as input and every column from index 2 on as labels.
df = pd.read_csv("train.csv")

In [7]:
# Text -> integer-id tokenizer: vocabulary capped at 200,000 entries, every
# output sequence fixed to 1,800 token ids.
vector = TextVectorization(
    max_tokens=200_000,
    output_sequence_length=1_800,
    output_mode="int",
)

In [8]:
# Model input: the raw comment strings.
X = df["comment_text"]
# Targets: every column from position 2 onward, as a 2-D NumPy array.
label_columns = df.columns[2:]
y = df[label_columns].values

In [12]:
# Learn the vocabulary from the comments, then encode those same comments
# as token-id sequences.
raw_comments = X.values
vector.adapt(raw_comments)
vectordata = vector(raw_comments)

In [34]:
# Input pipeline: (token ids, labels) pairs -> shuffle -> batches of 256 -> prefetch.
#
# reshuffle_each_iteration=False freezes the shuffle order. By default the
# dataset is reshuffled every time it is iterated, so subsets carved out of it
# with take()/skip() would contain different examples on each pass and would
# overlap one another (train examples leaking into validation/test).
dataset = (
    tf.data.Dataset.from_tensor_slices((vectordata, y))
    .shuffle(100000, reshuffle_each_iteration=False)
    .batch(256)
    .prefetch(128)
)

In [38]:
# Split the batched dataset 60% / 30% / 10% into train / validation / test.
# The split sizes are counted in batches, because `dataset` is already batched.
num_batches = len(dataset)
train_batches = int(num_batches * .6)
val_batches = int(num_batches * .3)

train_dataset = dataset.take(train_batches)
# Validation starts right where the training split ends.
val_dataset = dataset.skip(train_batches).take(val_batches)
# Test gets everything that is left, so no batch is lost to int() truncation.
test_dataset = dataset.skip(train_batches + val_batches)

In [21]:
def layer_architecture():
    """Build and return the uncompiled classifier network.

    Stack, in order: an Embedding (200000+1 input ids, 32-dim vectors), a
    bidirectional LSTM with 32 tanh units, three ReLU Dense layers of
    128, 256 and 128 units, and a 6-unit sigmoid output layer.
    """
    stack = [
        Embedding(200000 + 1, 32),
        Bidirectional(LSTM(32, activation="tanh")),
        Dense(128, activation="relu"),
        Dense(256, activation="relu"),
        Dense(128, activation="relu"),
        Dense(6, activation="sigmoid"),
    ]
    return Sequential(stack)

In [22]:
# Instantiate the network returned by layer_architecture(); compilation happens in a later cell.
model = layer_architecture()

In [23]:
# Binary cross-entropy loss, optimized with Adam.
# A metric explicitly named 'accuracy' is tracked so that a value with that
# name appears in the training logs; without it, nothing called 'accuracy'
# exists for a callback to monitor. BinaryAccuracy scores each sigmoid output
# independently, which fits the 6-unit sigmoid output layer.
model.compile(
    loss='BinaryCrossentropy',
    optimizer='Adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Adam with a step size of 0.001. The keyword is `learning_rate`; the older
# `lr` alias is deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [26]:
# Directory and filename prefix under which weight checkpoints are written.
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

# NOTE(review): this Checkpoint object is not used by the callback below, and
# the model is compiled with optimizer='Adam' rather than with `optimizer` —
# confirm whether it is still needed.
checkpoint = tf.train.Checkpoint(opt=optimizer, model=model)

# Keep only the best weights seen so far, judged by validation loss (lower is
# better, hence mode='min'). 'val_loss' is logged because fit() is given
# validation_data. The previous setting, monitor='accuracy' with mode='min',
# would have kept the weights with the *lowest* accuracy.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

In [None]:
# One training pass over the train split, evaluated against the validation
# split; the checkpoint callback is invoked by Keras during training.
model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=1,
    verbose=1,
    callbacks=[checkpoint_callback],
)

In [None]:
# Score a single example sentence and print the labels whose probability exceeds 0.5.
text = vector("Have a good day")
pred = model.predict(np.expand_dims(text, 0))  # add a leading batch axis
predictions = (pred > 0.5).astype(int)
categories = list(df.columns[2:])

# Pick the label name for every output position flagged with 1 in the first (only) row.
predicted_categories = [
    categories[position]
    for position, flag in enumerate(predictions[0])
    if flag == 1
]

if not predicted_categories:
    print("No toxicity found")
else:
    for label in predicted_categories:
        print(label.capitalize())

In [47]:
# Save the model to toxicity-checker.h5 in the current working directory.
model.save("toxicity-checker.h5")