# Cleaning

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

In [3]:
data = pd.read_csv("/kaggle/input/dataset/train_cleaned.csv")

data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_words
0,0000997932d777bf,explanation\n edits made username hardcore ...,0,0,0,0,0,0,"['explanation', 'edits', 'made', 'username', '..."
1,000103f0d9cfb60f,daww matches background colour im seemingly ...,0,0,0,0,0,0,"['daww', 'matches', 'background', 'colour', 'i..."
2,000113f07ec002fd,hey man im really trying edit war guy c...,0,0,0,0,0,0,"['hey', 'man', 'im', 'really', 'trying', 'edit..."
3,0001b41b1c6bb37e,cant make real suggestions improvement won...,0,0,0,0,0,0,"['cant', 'make', 'real', 'suggestions', 'impro..."
4,0001d958c54c6e35,sir hero chance remember page thats,0,0,0,0,0,0,"['sir', 'hero', 'chance', 'remember', 'page', ..."


# Tokenizing Comments using TextVectorization

In [5]:
# finding the maximum number of words present in any given comment
maxlen = 0
longest_comment = ""
for comment in data['comment_text']:
    length = len(comment)
    if (length > maxlen):
        longest_comment = comment
    maxlen = max(maxlen, length)

print("Number of characters in the longest comment is", maxlen)
print("Number of words in the longest comment is",
      len(longest_comment.split(" ")) + 1)

Number of characters in the longest comment is 5000

Number of words in the longest comment is 456


In [6]:
import tensorflow as tf

vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(

    max_tokens=None,

    # this is greater than the max words any comment has (774)
    # the remaning spots in the output would be padded by 0s
    output_sequence_length=800,

    # converets to lowercase and skips all the punctuation
    standardize="lower_and_strip_punctuation",

    # the tokens will be split at whitespaces
    split="whitespace",

    # each of the tokens is represented as an integer
    output_mode="int",
)

In [7]:
numpyArray = data[data.columns[1]].to_numpy()
vectorize_layer.adapt(numpyArray)

In [8]:
# testing
vectorize_layer("hello, world!")

<tf.Tensor: shape=(800,), dtype=int64, numpy=
array([185, 161,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  

# Creating a Tokenizer

### Gets all the words into one string

In [10]:
# Creates the tokenizer class
tokenizer = keras.preprocessing.text.Tokenizer()

# Combines all the words into one singular string
allWordsString = " ".join(data.head(50)["comment_text"].tolist())
allWordsList = allWordsString.split(r"\\s+")

# Updates the tokenizer with the string of all words
tokenizer.fit_on_texts(allWordsList)

# Prints word dictionary
# print(tokenizer.word_index)

# Prints length of word dictionary
print(len(tokenizer.word_index))

# Converts text to numbers
print(tokenizer.texts_to_sequences(["page", "im", "use", "mussolini"]))

1077
[[1], [2], [3], [1077]]


# Text Vectorization Layer
### Following [https://www.tensorflow.org/text/tutorials/text_classification_rnn](http://)

In [51]:
NUM_ROWS = 10000

MAX_LENGTH = None
encoder = tf.keras.layers.TextVectorization(output_sequence_length=MAX_LENGTH)
encoder.adapt(data.head(NUM_ROWS)["comment_text"].tolist())

vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'article', 'page', 'wikipedia', 'would', 'talk',
       'like', 'one', 'please', 'dont', 'see', 'also', 'im', 'know',
       'think', 'edit', 'people', 'use', 'articles'], dtype='<U322')

### Testing the encoder

Encoder removes punctuation and whitespace and forces lowercase so half the cleaning we did was useless

In [6]:
commentsToEncode = data.head(3)["comment_text"]
print(commentsToEncode)

encodedComments = encoder(commentsToEncode).numpy()
print(encodedComments)

for comment in encodedComments:
    print(" ".join(vocab[comment]))

0    explanation\n  edits made   username hardcore ...
1    daww  matches  background colour im seemingly ...
2    hey man im really  trying  edit war     guy  c...
Name: comment_text, dtype: object
[[ 120   54   76  484 1020  867 1094  148  465  483 1273 1047  476  320
   438 1168 1104    7    8  104  258    5    2   67    3  697]
 [1211  885 1346 1267    3  661  263   29    5  955   90    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [ 355   75    3  106   62   13  472 1028  392  293  294   49  546   54
   976    5    2  284 1301  368 1421  342    0    0    0    0]]
explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired
daww matches background colour im seemingly stuck thanks talk january utc               
hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting

### Building the model

In [52]:
# Sets random seed so results are identical every time
SEED = 1
tf.random.set_seed(SEED)
np.random.seed(SEED)

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=512,
        mask_zero=True
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=["accuracy"])


### Training the model

In [53]:
binaryDf = data.head(NUM_ROWS)[["comment_text", "toxic"]]

split_cutoff = int(0.8 * NUM_ROWS)
training_data = binaryDf.iloc[:split_cutoff]
validation_data = binaryDf.iloc[split_cutoff:]

training_target = training_data.pop("toxic")
validation_target = validation_data.pop("toxic")

In [54]:
# # training_dataset = tf.convert_to_tensor(training_data)
# # validation_dataset = tf.convert_to_tensor(validation_data)

# validation_target = validation_data.pop("toxic")
# validation_dataset = tf.convert_to_tensor(validation_data)

# # history = model.fit(training_dataset, target, epochs=10, validation_data=(validation_dataset, validation_target))

# binaryDf = data.head(NUM_ROWS)[["comment_text", "toxic"]]
# target = binaryDf.pop("toxic")
# dataset = tf.convert_to_tensor(binaryDf)

history = model.fit(training_data, training_target, epochs=10, validation_data=(validation_data, validation_target))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [1]:
# sample_text = ""
# predictions = model.predict(np.array([sample_text]))
# predictions[0]

### Getting test data

In [8]:
test_data = pd.read_csv("/kaggle/input/toxic-competition-dataset/test.csv")
test_labels = pd.read_csv("/kaggle/input/toxic-competition-dataset/test_labels.csv")

test_labels = test_labels.loc[test_labels["toxic"] >= 0]
merged_df = test_labels.merge(test_data, left_on="id", right_on="id")

In [55]:
# Tests all rows with a value of 0 or 1

test_df = merged_df[["comment_text", "toxic"]]
testTarget = test_df.pop("toxic")
model.evaluate(test_df, testTarget)



[0.29684823751449585, 0.9280377626419067]

In [56]:
# Tests only rows with a toxic value of 1

test_df = merged_df[["comment_text", "toxic"]]
newTest_df = test_df.loc[test_df["toxic"] == 1]

newTestTarget = newTest_df.pop("toxic")
model.evaluate(newTest_df, newTestTarget)



[1.8954060077667236, 0.5660098791122437]