In [None]:
#importing libraries

import numpy as np
import pandas as pd
import matplotlib as plt
import tensorflow as tf

In [None]:
# Set up GPU: turn on memory growth for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Memory growth needs to be the same across GPUs, so apply it to each one.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized (for example
    # when this cell is re-run in a live kernel); report it instead of aborting.
    print(e)

### Explore / Analyse data

In [None]:
# Read the training CSV into a DataFrame and display it.
# NOTE(review): the path is relative to the notebook's working directory.

train_data = pd.read_csv("data/train.csv")
train_data

# The comment text is truncated in the table above, so we will increase the display column width.

In [None]:
# Full text of the comment in the third row (position 2).
train_data['comment_text'].iloc[2]

In [None]:
# Label columns (everything from column position 2 onward) for the same row.
train_data.iloc[:, 2:].iloc[2]

# This shows that the comment displayed above is not toxic at all

In [None]:
# Show text columns in full: a max_colwidth of None removes the width limit.
pd.options.display.max_colwidth = None

In [None]:
#recheck the data with increased column width
train_data

In [None]:
# Summarise the frame: info() prints column dtypes and non-null counts;
# describe() is the cell's last expression, so its statistics table is displayed.
train_data.info()
train_data.describe()

In [None]:
# First five rows whose 'toxic' label is set.
train_data.loc[train_data['toxic'].eq(1)].head()

In [None]:
# Five randomly sampled rows whose 'toxic' label is set.
# No random_state is given, so each run shows a different sample.
toxic_rows = train_data[train_data['toxic'] == 1]
toxic_rows.sample(n=5)

### Process the Data using Tokenizing

In [None]:
# TextVectorization is meant for natural-language text; for plain categorical or
# pre-tokenized strings use StringLookup instead.
from tensorflow.keras.layers import TextVectorization

# IPython introspection (`??` shows the docstring/source); a development aid only.
TextVectorization??

#This layer has basic options for managing text in a Keras model. It transforms
#a batch of strings (one example = one string) into either a list of token
#indices (one example = 1D tensor of integer token indices) or a dense
#representation (one example = 1D tensor of float values representing data
#about the example's tokens). This layer is meant to handle natural language
#inputs. To handle simple string inputs (categorical strings or pre-tokenized
#strings) see `tf.keras.layers.StringLookup`.

In [None]:
X = train_data['comment_text'] # model input: the raw comment text
y = train_data[train_data.columns[2:]].values # targets (labels, not features): every column from position 2 onward, as a NumPy array
X

In [None]:
y #numpy arrays to be passed onto the model for training

In [None]:
# Vocabulary size cap: maximum number of distinct tokens the vectorizer keeps.
MAX_FEATURES = 200_000

In [None]:
# Text -> integer token ids. Arguments, in order: vocabulary size cap, output
# type, and the fixed sequence length every comment is padded/truncated to.
vectorize = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Confirm what kind of object X is before handing its values to the vectorizer.
type(X)

In [None]:
# Fit the vectorization layer (not the model) on the comments; X.values passes
# them as a NumPy array.
vectorize.adapt(X.values)

In [None]:
# Peek at the first 50 entries of the vocabulary learned by the vectorizer.
vocabulary = vectorize.get_vocabulary()
print(vocabulary[:50])

In [None]:
# Sanity check: vectorize one sentence and show its first five token ids, which
# can be compared against the vocabulary positions printed earlier.
vectorize("Nice, what are you doing?")[:5]

#"are" sits at index 20 of the vocabulary listed above, which is why 20 appears in the tensor displayed below:
#"520, 40, 20, 7, 273"

### Creating the data for training

In [None]:
# Now vectorizing the entire set of X values in a single call.
# NOTE(review): every comment becomes a length-1800 sequence (the layer's
# output_sequence_length), so this tensor may be memory-heavy — confirm it fits.
Vector_text = vectorize(X.values)
Vector_text

In [None]:
# Pre-shuffle the data and set the batch size for training.

# MCSHBAP - map, cache, shuffle, batch, prefetch (from tensor_slices / list_files)
# -> instantiating the data pipeline
data = tf.data.Dataset.from_tensor_slices((Vector_text, y))
data = data.cache()
# Shuffle with a buffer covering every row, and keep that order fixed across
# iterations. By default shuffle() reshuffles each time the dataset is iterated,
# so the take()/skip() train/valid/test splits taken from `data` would contain
# different samples on every epoch and leak into each other.
# Trade-off: training order is no longer reshuffled per epoch; if that is wanted,
# shuffle the training split on its own after splitting.
data = data.shuffle(buffer_size=len(y), reshuffle_each_iteration=False)
data = data.batch(24)
data = data.prefetch(8) # helps prevent bottlenecks
data

In [None]:
# Pull one batch out of the pipeline as NumPy arrays:
# first array is text in vectorized format.
# 2nd array are the labels
next(data.as_numpy_iterator())

In [None]:
# Keep one (inputs, labels) batch around for shape inspection.
X_batch, Y_batch = next(data.as_numpy_iterator())

### Split data into train, test, split

In [None]:
int(len(data)*.7) # partitioning

In [None]:
print(X_batch.shape, Y_batch.shape)

# Partition the batched dataset 70% / 20% / 10%. All sizes are counted in
# batches, and int() truncation can leave a few trailing batches unused.
# NOTE: take()/skip() only give disjoint splits if `data` yields its elements
# in the same order every time it is iterated.
n_batches = len(data)
n_train = int(n_batches * .7)
n_valid = int(n_batches * .2)
n_test = int(n_batches * .1)

train = data.take(n_train)
valid = data.skip(n_train).take(n_valid)
test = data.skip(n_train).skip(n_valid).take(n_test)

# The lengths reported here are numbers of batches, not individual samples.
print(f'The training sample is {len(train)}, Valid Samples is {len(valid)}, Test Set is {len(test)}')

In [None]:
# Iterator over the training split that yields batches as NumPy arrays.
train_generator = train.as_numpy_iterator()


In [None]:
# Fetch the next training batch. Re-run the cell to step through successive
# batches, i.e. what the model consumes one step at a time.
next(train_generator)

##  HOW IT WORKS:
Go through a batch -> forward pass -> backward pass -> update the weights using the gradients -> next batch [.next()]

## Build Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
def predictor():
    """Build and compile the bidirectional-LSTM multi-label comment classifier.

    Returns:
        A compiled Keras ``Sequential`` model: embedding -> bidirectional LSTM
        -> four ELU dense layers -> six sigmoid outputs, one per label.
    """
    layers = [
        # Token ids -> 32-dimensional embeddings, for sequences of length 1800.
        Embedding(MAX_FEATURES+1, 32, input_length = 1800),
        # The LSTM keeps activation='tanh', which TensorFlow requires for its
        # GPU-accelerated kernel. Bidirectional reads the sequence both ways,
        # which is useful for NLP tasks.
        Bidirectional(LSTM(32, activation = 'tanh')),
        # Fully connected feature extractor.
        Dense(64, activation = 'elu'),
        Dense(128, activation = 'elu'),
        Dense(256, activation = 'elu'),
        Dense(128, activation = 'elu'),
        # Final layer: (24, 1800) inputs -> (24, 6) outputs, one independent
        # sigmoid score per label ("toxic", "severe_toxic", ...).
        Dense(6, activation = 'sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(loss= 'BinaryCrossentropy', optimizer= 'Adam', metrics = ['acc'])

    return model

In [None]:
# Instantiate the compiled network and print its layer-by-layer summary.
model = predictor()
model.summary()

In [None]:
#mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])
#mirrored_strategy = tf.distribute.MirroredStrategy()
# NOTE(review): `strategy` is created after the model has already been built and
# is never entered via strategy.scope() in the visible cells, so the training
# below does not appear to use it — confirm whether distribution is intended.
strategy = tf.distribute.MultiWorkerMirroredStrategy()


In [None]:
# IPython introspection of model.fit (docstring/source); a development aid only.
model.fit??

In [None]:
from tensorflow import keras  # only referenced by the commented-out checkpoint below
#checkpoint = keras.callbacks.ModelCheckpoint('ToxicPredict.h5', save_best_only = True)
# Train for 10 epochs on the training split, validating against the validation
# split; the returned History object is kept for plotting.
history = model.fit(train, epochs = 10, validation_data= valid, verbose = 1) 

In [None]:
"""
import pickle
with open('./first_history', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

with open('./first_history', "rb") as file_pi:
    history = pickle.load(file_pi)
"""

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training history on an explicitly created axes.
# DataFrame.plot() opens its own figure unless it is given `ax`, so a separate
# plt.figure(figsize=...) call would leave an empty figure and the requested
# size would not apply to the curves.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.set(title='Training history', xlabel='epoch')
plt.show()

In [None]:
# Persist the trained model to disk. The `vectorize` layer is not one of the
# model's layers, so raw text must still be vectorized separately before
# calling predict on a reloaded model.
model.save('toxic_predict.h5')

### predictions

In [None]:
# Basic check: turn one raw sentence into token ids with the adapted vectorizer.
input_text = vectorize("Are you Fucking Idiot") # Basic check.
print(input_text)

In [None]:
# Earlier attempt, kept for reference (not valid as written): model.predict(np.array[input_text])
import tensorflow as tf

# Reload the saved model from disk; this rebinds `model` to the loaded copy.
model = tf.keras.models.load_model("toxic_predict.h5")
# expand_dims adds a leading batch axis so the single example becomes a batch of one.
model.predict(np.expand_dims(input_text,0))

### Test on TestSet

In [None]:
# Take one (inputs, labels) batch from the test split as NumPy arrays.
batch_x, batch_y = next(test.as_numpy_iterator())

In [None]:
# Score the batch, then threshold each label score at 0.5 to get 0/1 decisions.
batch_scores = model.predict(batch_x)
(batch_scores > 0.5).astype(int)

### Model Evaluation

In [None]:
from tensorflow.keras.metrics import BinaryAccuracy, CategoricalAccuracy, Precision, Recall

# Streaming metrics, accumulated batch by batch over the test split.
pre = Precision()
rec = Recall()
# The model emits an independent sigmoid score per label and the evaluation
# loop flattens labels and scores into 1-D vectors, so accuracy has to be
# element-wise with a 0.5 threshold. CategoricalAccuracy compares argmax
# positions, which is meaningless for this multi-label setup.
acc = BinaryAccuracy()

In [None]:
# Accumulate the metrics over every batch of the test split.
# The metric objects are stateful: re-running this cell without re-creating
# them adds to the previous totals.
for x_true, y_true in test.as_numpy_iterator():
    # Flatten labels and scores so each (sample, label) pair is one element.
    yhat = model.predict(x_true).flatten()
    y_true = y_true.flatten()

    # NOTE: on flattened vectors only element-wise metrics are meaningful;
    # argmax-based ones are not.
    for metric in (pre, rec, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Read the accumulated metric values out as plain numbers, then report them.
precision_value = pre.result().numpy()
recall_value = rec.result().numpy()
accuracy_value = acc.result().numpy()

print(f'Precision  :{precision_value}')
print(f'Recall  :{recall_value}')
print(f'Accuracy  :{accuracy_value}')

### USER INTERFACE

In [None]:
pip install --upgrade h11


In [None]:
import tensorflow as tf

# Reload the saved model so the interface cells below use the on-disk copy.
model = tf.keras.models.load_model("toxic_predict.h5")

In [None]:
# good text check
# NOTE(review): the sentence below reads as a threat — confirm it is the intended "good" example.
text_check = vectorize('I will kill you without a doubt')

# Add a batch axis, score the sentence, and show which labels exceed 0.5.
res = model.predict(np.expand_dims(text_check, 0))
print(res > 0.5)

# Label names, in the same order as the columns of `res`.
train_data.columns[2:]

In [None]:
#Toxic check: score a clearly hostile sentence.
hate_check = vectorize('I will kill you and your family! you suck')
# Add a batch axis before predicting; print the raw scores and the 0.5-thresholded flags.
res = model.predict(np.expand_dims(hate_check, 0))
print(res)
print(res>0.5)
# Label names, in the same order as the columns of `res`.
train_data.columns[2:]

In [None]:
# Label columns used to name the model's outputs in the interface below.
train_data.columns[2:]

In [None]:
def comment_interface(comment):
    """Score one comment and report, per label, whether its score exceeds 0.5.

    Args:
        comment: Raw comment text to classify.

    Returns:
        A string with one ``<label>: <True/False>`` line per label column,
        each line terminated by a newline.
    """
    # Wrap the comment in a list so the vectorizer receives a batch of one.
    scores = model.predict(vectorize([comment]))[0]
    label_names = train_data.columns[2:]
    return ''.join(
        '{}: {}\n'.format(name, scores[position] > 0.5)
        for position, name in enumerate(label_names)
    )

In [None]:
# Import here so `gr` is defined before its first use; otherwise this cell
# raises NameError on a fresh kernel run top to bottom.
import gradio as gr

# Wrap comment_interface in a simple text-in / text-out Gradio UI.
# NOTE(review): `gr.inputs.Textbox` is the older Gradio input API — confirm it
# is still available in the installed Gradio version.
interface = gr.Interface(fn = comment_interface,
                        inputs = gr.inputs.Textbox(lines = 2, placeholder = 'write comment'),outputs = 'text')

# share=True asks Gradio for a shareable link in addition to the local server.
interface.launch(share = True)

In [None]:
import gradio as gr

