## Import libraries

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from matplotlib import pyplot as plt
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Load the Kaggle training CSV into a DataFrame.
train_csv_path = 'kaggle dataset/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

### We can see that each sentence is described by 6 classes:
#### 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
#### For each sentence, a 1 is set for every class it belongs to.
#### For example, the first sentence does not fall into any of the toxicity classes, so all of its class values are 0.

## Create the vectorizer layer

In [None]:
# Tunable hyperparameters for the text vectorizer.
max_features = 100_000  # passed as max_tokens (vocabulary size limit)
max_len = 1800          # passed as output_sequence_length

# Layer that maps raw strings to fixed-length integer token sequences.
vectorize_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=max_len,
    output_mode='int',
    max_tokens=max_features,
)


## Preprocess the dataset

In [None]:
# Pull the raw comment strings out of the DataFrame as an array.
comment_column = df['comment_text']
text = comment_column.values

In [None]:
# Let the vectorizer build its vocabulary from the corpus;
# that vocabulary is used later when the text is vectorized.

vectorize_layer.adapt(text)

In [None]:
# Every column from index 2 onward is used as a label column.
label_columns = df.columns[2:]
labels = df[label_columns].values
print(labels)

In [None]:
# Pipeline order: from_tensor_slices -> cache -> shuffle -> batch -> prefetch

dataset = tf.data.Dataset.from_tensor_slices((text, labels))
dataset = dataset.cache()
# Shuffle once and keep that order on every pass over the data. With the
# default reshuffle_each_iteration=True, subsets carved out of this dataset
# with take()/skip() would contain different samples on each iteration, so
# train/validation/test samples would leak into each other.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(128)
dataset = dataset.prefetch(128)  # prepare upcoming batches while the current one is consumed

In [None]:
def train_test_val_split(dataset, ratios=(0.7, 0.2, 0.1)):
    """Split a dataset into consecutive train, test and validation parts.

    The parts are carved out in order with ``take``/``skip``: the first
    ``ratios[0]`` share of the elements becomes the train split, the next
    ``ratios[1]`` share the test split and the following ``ratios[2]`` share
    the validation split. Each size is truncated to a whole number of
    elements, so a few trailing elements may end up in no split.

    Args:
        dataset: A sized object exposing ``take`` and ``skip`` (for example
            a ``tf.data.Dataset`` whose length is known).
        ratios: Three fractions ``(train, test, val)`` of the dataset size.

    Returns:
        A tuple ``(train, test, val)``.

    Raises:
        ValueError: If ``ratios`` does not contain exactly three values.

    Example:
        For a dataset holding ``[1, 2, 3, 4]`` and
        ``ratios=[0.5, 0.25, 0.25]`` the splits contain ``[1, 2]``, ``[3]``
        and ``[4]`` respectively.
    """
    train_ratio, test_ratio, val_ratio = ratios

    n_samples = len(dataset)
    n_train = int(train_ratio * n_samples)
    n_test = int(test_ratio * n_samples)
    n_val = int(val_ratio * n_samples)

    train = dataset.take(n_train)
    # The test split directly follows the train split and uses the test
    # ratio; the validation split comes after both. (Binding these two the
    # other way round would hand callers swapped splits.)
    test = dataset.skip(n_train).take(n_test)
    val = dataset.skip(n_train + n_test).take(n_val)

    return train, test, val



## Split the dataset into train/test/val

In [None]:
# Keep the three splits: later cells train on `train`, validate on `val`
# and evaluate on `test`.
train, test, val = train_test_val_split(dataset)

## Create the model

In [None]:
# Stack the layers in order: raw strings go in, six sigmoid outputs come out.
model = Sequential([
    # Text vectorization layer created earlier
    vectorize_layer,
    # Embedding layer with 32-dimensional vectors
    Embedding(max_features + 1, 32),
    # Bidirectional LSTM layer
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor layers
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    # Final layer: one sigmoid unit per class
    Dense(6, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')

In [None]:
# Print an overview of the model's layers.
model.summary()

## Train the model

In [None]:
# Fit on the train split for a fixed number of epochs, with the val split
# as validation data; the fit call is placed on the first GPU.
n_epochs = 10
with tf.device('/gpu:0'):
    history = model.fit(train, validation_data=val, epochs=n_epochs)

In [None]:
# Plot every value recorded in the training history on one 8x5 figure.
# figsize is passed to DataFrame.plot because it creates its own figure;
# a separate plt.figure(...) call would only leave an empty figure behind.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

## Test

In [None]:
input_text = 'I hate you'

# Wrap the single string in a length-1 array so predict() receives a batch.
res = model.predict(np.expand_dims(input_text, 0))

# Threshold the outputs at 0.5 to get one 0/1 flag per class. The result is
# printed explicitly: a bare expression in the middle of a cell is discarded.
print((res > 0.5).astype(int))

# Run the same check on the first batch of the test split.
batch_X, batch_y = next(test.as_numpy_iterator())

print((model.predict(batch_X) > 0.5).astype(int))


## Evaluation

In [None]:
pre = Precision()
re = Recall()
# Each label is an independent 0/1 decision, so accuracy is measured with
# BinaryAccuracy (predictions are thresholded element by element).
# CategoricalAccuracy compares argmax positions, which is meaningless for
# flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()
for X_true, y_true in test.as_numpy_iterator():
    # Make a prediction for the whole batch
    yhat = model.predict(X_true)

    # Flatten labels and predictions so every (sample, class) pair is
    # scored as one binary decision
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

## Save the model

In [None]:
# Persist the trained model under the path 'toxicity_model'.
# NOTE(review): newer Keras releases may require a '.keras' or '.h5' suffix
# on this path - confirm against the installed version.
model.save('toxicity_model')
