# Getting the tokenizer right

How many instances of each type of toxicity?

In [1]:
from collections import Counter
import pandas as pd, numpy as np

train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
train['score'] = train['toxic'] \
                + train['severe_toxic'] \
                + train['threat'] \
                + train['insult'] \
                + train['obscene'] \
                + train['identity_hate']

Counter(train['score'])

Counter({0: 143346, 1: 6360, 2: 3480, 3: 4209, 4: 1760, 5: 385, 6: 31})

Eyeball the most toxic:

In [2]:
most_toxic = train.loc[train['score'] >= 6]

print(most_toxic['comment_text'])

1017      WOULDN'T BE THE FIRST TIME BITCH. FUCK YOU I'L...
1312      SHUT UP, YOU FAT POOP, OR I WILL KICK YOUR ASS!!!
7299      You're a stupid cunt \n\nFuck you dumb arse, y...
13648     Bitch \n\nYou are a little bitch. I fuckin spe...
13964     I am going to murder ZimZalaBim ST47 for being...
22158     FUCK YOU!!!!!!!!!!!! YOU FUCKING NIGGER BAG OF...
29968     u motherfukkin bitch i want to rape you smelly...
32098     Fuck All Asyriac Nation \n\nQamishli belong to...
33951     GO FUCK YOURSELF BITCH.  I HATE YOUR SOULD.  M...
38513     AM GOING TO RAPE YOU IN THE ASS YOU FAT BITCH ...
38578     fuck you honkey, why you hatin' on blacks? You...
46155           So fuck off and die, rape fantasising sluts
55160     ANYONE WHO SUPPORTS THIS IS FUCKING SICK. MEN ...
59279     JEW \n\nGet the fuck out of here you jewish so...
65075     FAGGOTS!  YO FUCKER IT WAS FUCKING HAYES YOU F...
67247     YOURE A FRREAKING JEW, AND PLEAZE COMMIT SUICIDE.
73821     Shut up you asswipe, we don't 

Keras default tokenizer does things like removing '!' and lower-casing everything. This will lose some of the signal we're trying to detect. Let's look at the vocabulary of the most toxic comments... 

In [3]:
from collections import Counter
from __future__ import print_function
from pprint import pprint

sall = ' '.join(list(most_toxic['comment_text']))
sall_words = sall.split()
d = Counter(sall_words)

pprint(sorted(d.items(), key=lambda i: i[1], reverse=True))

[('DIE', 720),
 ('DI', 90),
 ('EDIE', 90),
 ('you', 61),
 ('YOU', 48),
 ('I', 38),
 ('and', 32),
 ('a', 30),
 ('IN', 29),
 ('your', 25),
 ('the', 21),
 ('of', 21),
 ('A', 18),
 ('to', 16),
 ('fuck', 16),
 ('THE', 16),
 ('will', 16),
 ('RAPE', 14),
 ('ASS', 14),
 ('FAT', 14),
 ('TO', 14),
 ('fucking', 14),
 ('BITCH', 13),
 ('GOING', 13),
 ('AM', 13),
 ('WHEELCHAIRI', 12),
 ('FUCKING', 11),
 ('that', 10),
 ('i', 10),
 ('me', 8),
 ('my', 8),
 ('rape', 8),
 ('ass', 8),
 ('fuckin', 8),
 ('get', 8),
 ('if', 8),
 ('on', 8),
 ('so', 7),
 ('is', 7),
 ('You', 7),
 ('off', 7),
 ('just', 7),
 ('shit', 7),
 ('YOUR', 6),
 (',', 6),
 ('bitch', 6),
 ('do', 6),
 ('it', 6),
 ('for', 6),
 ("I'll", 6),
 ('all', 5),
 ('WILL', 5),
 ('FUCK', 5),
 ("don't", 5),
 ('little', 5),
 ('vagina', 5),
 ('are', 5),
 ('kill', 5),
 ('like', 5),
 ('you,', 5),
 ('up', 5),
 ('an', 5),
 ('this', 4),
 ('want', 4),
 ('out', 4),
 ('find', 4),
 ('hope', 4),
 ('piss', 4),
 ('AND', 4),
 ('in', 4),
 ('who', 4),
 ('swear', 4),
 ('so

In [20]:
from random import sample

s = sample(train['comment_text'],1)[0]

print(s)

Hopffer Wiki (hotmail reminder) Weiker (klug page)= illustrator


The normalising function normalise() has been pushed into the file helpers.py:

In [21]:
from helpers import normalise
    
normalise(s)

'hopffer wiki hotmail reminder weiker klug page illustrator'

How much does this normalisation reduce the dictionary?

In [12]:
wall = ' '.join(train['comment_text']).split()
wall_set = set(wall)
print("Distinct comment words:", len(wall_set))

pall = [normalise(w) for w in wall_set]
print("Distinct normalised words:", len(set(pall)))

Distinct comment words: 533185
Distinct normalised words: 254857


Tokenisation under Keras will then normalise down further by removing tail words entirely (up to frequency set as 'num_words'):

In [13]:
from keras.preprocessing.text import Tokenizer

num_words = 10000
filters = '"#$%&()*+,-./:;<=>@[\\]^`{|}~\t\n' # keep '!?_'

train['proc_text'] = [normalise(s) for s in train['comment_text']]
list_sentences_train = train['proc_text']

tokenizer = Tokenizer(num_words=num_words, filters=filters) 
tokenizer.fit_on_texts(list(list_sentences_train))

Using TensorFlow backend.


In [31]:
import sys
from collections import Counter

tok_words = {}
all_word_ct = Counter(wall)
tok_word_ct = Counter()

itr = 0

for w in wall_set:
    wnorm = normalise(w)
    itr += 1
    if itr % 1000 == 0: 
        sys.stdout.write('{0}\r'.format(itr))
        sys.stdout.flush()
    a = tokenizer.texts_to_sequences([wnorm])[0]
    b = wnorm.split()
    n = len(a)
    if a>0:
        for j in range(n):
            tok_words[b[j]] = a[j] 
            tok_word_ct[b[j]] += all_word_ct[w]
    
print("Distinct tokenised:", len(tok_words.keys()))

Distinct tokenised: 19813


Finally, dump the <tt>tok_words</tt> dict to a json file for later use (notebook 4). 

In [36]:
import json

with open('models/tok_words.txt', 'w') as file:
     file.write(json.dumps(tok_words))
        
with open('models/tok_word_ct.txt', 'w') as file:
     file.write(json.dumps(dict(tok_word_ct)))