## Toxicity Detection Using Python

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import operator
import math
from functools import reduce
from sklearn.model_selection import train_test_split

### Import Dataset

In [2]:
df = pd.read_csv('toxic_data_mid.csv')

In [3]:
df.head()

Unnamed: 0,id,comment_text,severe_toxic,obscene,threat,insult,identity_hate,toxic
0,dd1ce3911c27f18f,your ga nomination of grey s anatomy season ...,0.0,0.0,0.0,0.0,0.0,0.0
1,04b92907a0db6e77,evan blass update i have made the following c...,0.0,0.0,0.0,0.0,0.0,0.0
2,9a70f2c3b9d16f5b,dear federico when i have the time and i p...,0.0,0.0,0.0,0.0,0.0,0.0
3,25fe5bd99e7f4ffd,i don t think you people get it metalcore is ...,0.0,0.0,0.0,0.0,0.0,0.0
4,bdfd40a86c88abe1,final point i was planning on moving the tabl...,0.0,0.0,0.0,0.0,0.0,0.0


### Process Data

In [4]:
stopwords = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now']
maxDictionaryLength = 8000



def tokenize(sentence, isCreateDict=False):
    tmpTokens = sentence.lower().split()
    tokens = [token for token in tmpTokens if ((token not in stopwords) and (len(token)> 0)) ]
    #tokens = tmpTokens.filter((token) => !stopwords.includes(token) && token.length > 0);
    
    if isCreateDict:
        for token in tokens:
            if token in dictionary_dict:
                dictionary_dict[token] += 1
            else:
                dictionary_dict[token] = 1
    documentTokens.append(tokens)
    return tokens


def getInverseDocumentFrequency(documentTokens, dictionary):
    return list(map(lambda word : 1 + math.log(len(documentTokens) / reduce(lambda acc,curr: (1 if (word in curr) else 0) + acc, documentTokens,0)),dictionary))


  
def encoder(sentence, dictionary, idfs):
    tokens = tokenize(sentence)
    tfs = getTermFrequency(tokens, dictionary)
    tfidfs = getTfIdf(tfs,idfs)
    return tfidfs


def getTermFrequency(tokens, dictionary):
    return  list(map(lambda token: reduce(lambda acc,curr : (acc + 1 if (curr == token) else acc), tokens,0), dictionary))



def getTfIdf(tfs, idfs):
    return [tf * idf for (tf,idf) in zip(tfs,idfs)]



In [5]:
# Sample Test Code used in the slides ( Module : preparing data for machine learning model )
dictionary_dict = {}
documentTokens = []
testComments = ['i loved the movie', 'movie was boring']

for comment in testComments:
    documentTokens.append(tokenize(comment,True))


dictionary = sorted(dictionary_dict, key=dictionary_dict.get, reverse=True)
idfs = getInverseDocumentFrequency(documentTokens, dictionary);

tfidfs = []

for comment in testComments:
    tfidfs.append(encoder(comment, dictionary, idfs))

print(dictionary_dict)
print(dictionary)
print(idfs)
print(tfidfs)

{'loved': 1, 'movie': 2, 'boring': 1}
['movie', 'loved', 'boring']
[1.0, 1.6931471805599454, 1.6931471805599454]
[[1.0, 1.6931471805599454, 0.0], [1.0, 0.0, 1.6931471805599454]]


In [6]:
dictionary_dict = {}
documentTokens = []
df['tokens'] = df['comment_text'].apply(lambda x : tokenize(x, True))

In [7]:
df.head()

Unnamed: 0,id,comment_text,severe_toxic,obscene,threat,insult,identity_hate,toxic,tokens
0,dd1ce3911c27f18f,your ga nomination of grey s anatomy season ...,0.0,0.0,0.0,0.0,0.0,0.0,"[ga, nomination, grey, anatomy, season, articl..."
1,04b92907a0db6e77,evan blass update i have made the following c...,0.0,0.0,0.0,0.0,0.0,0.0,"[evan, blass, update, made, following, comment..."
2,9a70f2c3b9d16f5b,dear federico when i have the time and i p...,0.0,0.0,0.0,0.0,0.0,0.0,"[dear, federico, time, promise, ll, make, time..."
3,25fe5bd99e7f4ffd,i don t think you people get it metalcore is ...,0.0,0.0,0.0,0.0,0.0,0.0,"[think, people, get, metalcore, genre, incorpo..."
4,bdfd40a86c88abe1,final point i was planning on moving the tabl...,0.0,0.0,0.0,0.0,0.0,0.0,"[final, point, planning, moving, tables, artic..."


In [8]:

dictionary = sorted(dictionary_dict, key=dictionary_dict.get, reverse=True)
dictionary = dictionary[:maxDictionaryLength]
print('Length of dictionary : {0}'.format(len(dictionary)))
print(dictionary[:10])

Length of dictionary : 8000
['fuck', 'article', 'u', 'bitch', 'es', 'page', 'wikipedia', 'talk', 'please', 'like']


In [9]:
idfs = getInverseDocumentFrequency(documentTokens, dictionary)
len(idfs)

8000

In [10]:
df['features'] = df['comment_text'].apply(lambda x : encoder(x,dictionary, idfs))
df['features'].head()

0    [0.0, 10.12990748519189, 0.0, 0.0, 0.0, 0.0, 0...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 8.078458564119455, 0...
2    [0.0, 0.0, 4.575550768806933, 0.0, 0.0, 0.0, 2...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 2.6928195213731514, ...
4    [0.0, 2.5324768712979724, 0.0, 0.0, 0.0, 0.0, ...
Name: features, dtype: object

In [11]:
df_new = df['features'].apply(lambda x : pd.Series(x))
df_new['toxic'] = df['toxic']

### Train Test Split

In [12]:
train, test = train_test_split(df_new, test_size=0.3)
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

630 train examples
70 validation examples
300 test examples


In [13]:
train.shape, test.shape, val.shape

((630, 8001), (300, 8001), (70, 8001))

In [14]:
def df_to_dataset(dataframe, shuffle=True, batch_size=16):
    dataframe = dataframe.copy()
    labels = dataframe.pop('toxic')
    ds = tf.data.Dataset.from_tensor_slices((dataframe.values, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [15]:
batch_size = 16 
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [16]:
numOfFeatures = len(dictionary)

### Build Model

In [17]:
def get_compiled_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(5, activation='relu', input_shape=(numOfFeatures,)),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.06),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])
    return model

In [18]:
model = get_compiled_model()
model.summary()
model.fit(train_ds,epochs=20 ,validation_data=val_ds)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 5)                 40005     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6         
Total params: 40,011
Trainable params: 40,011
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fe64693e7f0>

### Evaluate Model

In [19]:
model.evaluate(test_ds)



[0.7014526724815369, 0.9100000262260437]

### Make Predictions

In [20]:
## make predictions
testComments = ['you suck', 'you are a great person']
tfidfs = []
for comment in testComments:
    tfidfs.append(encoder(comment, dictionary, idfs))
print(f'predicted probabliities : {model.predict(tfidfs)}')
print(f'predicted classes : {tf.round(model.predict(tfidfs))}')

predicted probabliities : [[9.984712e-01]
 [4.631444e-20]]
predicted classes : [[1.]
 [0.]]


### Export Model

In [21]:
model.save('toxicity_python.h5')

### Model Converted

In [22]:
!rm -rf tfjs_python_toxicity
!mkdir tfjs_python_toxicity

In [23]:
!tensorflowjs_converter --input_format=keras toxicity_python.h5 tfjs_python_toxicity

In [24]:
# write dictionary and IDFs 

import json 

with open('tfjs_python_toxicity/dictionary.json', 'w') as outfile:
    json.dump(dictionary, outfile)

with open('tfjs_python_toxicity/idfs.json', 'w') as outfile:
    json.dump(idfs, outfile)
    