## 1. Install dependencies and Load raw data

In [1]:
%pip install tensorflow pandas matplotlib scikit-learn numpy

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp39-cp39-win_amd64.whl.metadata (13 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.6 MB 3.4 MB/s eta 0:00:04
   ----- ---------------------------------- 1.6/11.6 MB 4.

In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
df = pd.read_csv(os.path.join('data', 'train.csv'))

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [23]:
df[df['toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [24]:
df[df.columns[2:]].iloc[42]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    1
Name: 42, dtype: int64

## 2. Preprocessing

In [25]:
from tensorflow.keras.layers import TextVectorization

In [26]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [28]:
MAX_TOKENS = 200000 #max number of words in the vocabulary

In [29]:
vectorizer = TextVectorization(max_tokens=MAX_TOKENS,
                               output_sequence_length=1800,
                                output_mode = 'int' )

In [30]:
vectorizer.adapt(X.values)

In [35]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 np.str_('the'),
 np.str_('to'),
 np.str_('of'),
 np.str_('and'),
 np.str_('a'),
 np.str_('you'),
 np.str_('i'),
 np.str_('is'),
 np.str_('that'),
 np.str_('in'),
 np.str_('it'),
 np.str_('for'),
 np.str_('this'),
 np.str_('not'),
 np.str_('on'),
 np.str_('be'),
 np.str_('as'),
 np.str_('have'),
 np.str_('are'),
 np.str_('your'),
 np.str_('with'),
 np.str_('if'),
 np.str_('article'),
 np.str_('was'),
 np.str_('or'),
 np.str_('but'),
 np.str_('page'),
 np.str_('my'),
 np.str_('an'),
 np.str_('from'),
 np.str_('by'),
 np.str_('do'),
 np.str_('at'),
 np.str_('about'),
 np.str_('me'),
 np.str_('so'),
 np.str_('wikipedia'),
 np.str_('can'),
 np.str_('what'),
 np.str_('there'),
 np.str_('all'),
 np.str_('has'),
 np.str_('will'),
 np.str_('talk'),
 np.str_('please'),
 np.str_('would'),
 np.str_('its'),
 np.str_('no'),
 np.str_('one'),
 np.str_('just'),
 np.str_('like'),
 np.str_('they'),
 np.str_('he'),
 np.str_('dont'),
 np.str_('which'),
 np.str_('any'),
 np.str_('been'),
 np

In [36]:
vectorized_text = vectorizer(X.values)

In [37]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [44]:
len(vectorized_text)

159571

In [None]:
#MCSHBAP - Map Cache Shuffle Batch Prefetch : data preprocessing steps 
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache() #tf used cached data instead of recaluculating it. speeds up data loading and processing
dataset = dataset.shuffle(buffer_size=160000) #Randomly reorders elements in a dataset
dataset = dataset.batch(batch_size=16) #Groups elements of the dataset into batches of the specified size
dataset = dataset.prefetch(buffer_size=8)#Prepares the next batch of data while the current batch is being processed, enabling parallel execution.

In [56]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [59]:
batch_X.shape

(16, 1800)

In [64]:
len(dataset) #number of batches

9974

In [118]:

v = int(len(dataset)*0.9)
tes = int(len(dataset)*0.1)

In [121]:
int(len(dataset)*0.7)+int(len(dataset)*0.2)

8975

In [124]:
train = dataset.take(int(len(dataset)*0.7))
val = dataset.skip(int(len(dataset)*0.7)).take(int(len(dataset)*0.2)+1)
test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1)+1)


In [149]:
train.as_numpy_iterator().next()[0].shape

(16, 1800)

## 2. Create Sequential Model

In [136]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Embedding

In [150]:
#Architecture of our Sequential model
model = Sequential()
#Creating an Embedding Layer
model.add(Embedding(input_dim=MAX_TOKENS+1, output_dim=32))
#Birectional wrapper on LSTM layer
model.add(Bidirectional(LSTM(units=32, activation='tanh')))
#Fully Connected layers
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units= 256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
#Final Layer
model.add(Dense(units=6, activation='sigmoid'))


In [None]:
#loss is binarycrossentropy as output is multioutput with each being binary(0 or 1) (not categorical in nature)
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [153]:
#Training
hist = model.fit(train, epochs=1, validation_data=val)

[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12997s[0m 2s/step - loss: 0.0824 - val_loss: 0.0444


## 3. Make Predictions

In [175]:
input_text = vectorizer('You Freaking suck! whites and blacks and browns')

In [176]:
model.predict(np.expand_dims(input_text,0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step


array([[0.99013567, 0.2752302 , 0.9700686 , 0.03304162, 0.80697864,
        0.14511776]], dtype=float32)

In [168]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [177]:
batch_test_X, batch_test_y = test.as_numpy_iterator().next()

In [178]:
model.predict(batch_test_X)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 600ms/step


array([[1.52170658e-04, 1.87292915e-09, 2.67689211e-05, 5.77633088e-08,
        1.16396104e-05, 4.85268629e-06],
       [8.69830081e-04, 8.63818528e-08, 2.04556432e-04, 1.42877946e-06,
        1.07208696e-04, 5.11155995e-05],
       [7.94709893e-04, 6.43844587e-08, 1.71462452e-04, 1.12452597e-06,
        9.11740208e-05, 4.18540330e-05],
       [4.69187566e-04, 2.35260202e-08, 1.02537662e-04, 4.69805912e-07,
        4.97273395e-05, 2.28523713e-05],
       [9.10983622e-01, 2.60119066e-02, 6.60734713e-01, 2.07979362e-02,
        5.35197258e-01, 6.41326234e-02],
       [7.96921134e-01, 1.32118054e-02, 3.83357018e-01, 2.31241453e-02,
        3.93650085e-01, 6.11428395e-02],
       [4.52401873e-04, 1.73952390e-08, 8.67776544e-05, 3.87179654e-07,
        4.37447998e-05, 1.90689389e-05],
       [1.19278170e-02, 4.68923236e-06, 1.69540604e-03, 5.60416120e-05,
        1.65563717e-03, 6.45843800e-04],
       [6.75884634e-02, 3.59855148e-05, 6.58506854e-03, 3.99880577e-04,
        9.84706637e-03, 

### Evalutation and Performance metrics

In [179]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [180]:
pre = Precision()
rec = Recall()
acc = CategoricalAccuracy()

In [184]:
for batch in test.as_numpy_iterator():
    #unpack the batch
    X_true, y_true = batch
    #make a prediction
    yhat = model.predict(X_true)

    #flatten the prediction
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    rec.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [185]:
print(f'Precison: {pre.result().numpy()}, Recall: {rec.result().numpy()}, CategoricalAccuracy: {acc.result().numpy()}')

Precison: 0.8627223372459412, Recall: 0.6282737255096436, CategoricalAccuracy: 0.48897796869277954


## Save Model

In [187]:
from tensorflow.keras.models import load_model
import os

In [188]:
model.save(os.path.join('models', 'comment_toxicity.h5'))

