In [2]:
import csv
import numpy as np
import pandas as pd
from copy import deepcopy
from string import punctuation
from random import shuffle
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
import nltk


import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

filename = '../data/toxicity_only.csv'
df = pd.read_csv(filename)

Using TensorFlow backend.


### cleaning data

In [3]:
train = df.sample(frac=0.9, random_state=200)
test = df.drop(train.index)

In [4]:
X_train = train.drop([
    'target',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult', 
    'threat',
    'toxicity_annotator_count'
],axis = 1)
X_test = test.drop([
    'target',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult', 
    'threat',
    'toxicity_annotator_count'
],axis = 1)
y_train = train.drop([
    'comment_text',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult', 
    'threat',
    'toxicity_annotator_count'
],axis = 1)
y_test = test.drop([
    'comment_text',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult', 
    'threat',
    'toxicity_annotator_count'
],axis = 1)

In [5]:
X_train['comment_text'] = X_train['comment_text'].progress_map(lambda x: tokenizer.tokenize(x))

progress-bar: 100%|██████████| 1624387/1624387 [11:20<00:00, 2387.06it/s]


In [14]:
n =len(X_train)
X_train_tokens = np.array(X_train.head(n).comment_text)

In [15]:
len(X_train_tokens)

1624387

In [16]:
n_dim = 200
w2v = Word2Vec(size=n_dim, min_count=10)
w2v.build_vocab([x for x in tqdm(X_train_tokens)])
w2v.train([x for x in tqdm(X_train_tokens)], total_examples=n, epochs=10)

100%|██████████| 1624387/1624387 [00:01<00:00, 854959.04it/s]
100%|██████████| 1624387/1624387 [00:01<00:00, 1009823.62it/s]


(716999094, 955367060)

In [17]:
w2v['good']

  """Entry point for launching an IPython kernel.


array([ 0.77221704,  0.9134255 ,  0.59492433, -0.42243227, -0.9646467 ,
        0.6077253 , -2.6252964 , -0.02229778,  1.2752267 , -2.565993  ,
        1.4675399 ,  0.4642416 , -1.3165927 , -1.2640836 ,  2.07373   ,
        0.44167107, -0.4946091 ,  2.670209  , -0.5604665 , -0.59328353,
       -0.47719222, -1.1471121 , -0.33900547, -0.98537636, -1.5570396 ,
        1.5371449 ,  1.603917  ,  2.3504314 , -0.4823531 , -1.0467993 ,
        0.99557644,  3.571098  , -2.1509225 , -0.86523235, -1.3256255 ,
        0.30106312,  0.07355017,  2.4655719 , -1.7510291 , -0.51174957,
        1.7694045 ,  0.9420105 , -0.62883425, -1.561578  , -0.40257058,
        1.6521319 , -0.6096398 ,  2.5896082 , -0.8107567 , -3.8858633 ,
        0.51300037,  0.80678153, -0.36605254, -0.10122604, -0.08534582,
        0.50588727,  0.8195502 ,  1.3798916 , -0.58301705, -0.44966882,
        0.02430057,  0.72771955, -1.440054  ,  1.7068397 , -0.82584566,
        1.3322213 , -0.7666941 ,  0.82119423,  0.83970606, -0.04

In [18]:
w2v.most_similar('president')

  """Entry point for launching an IPython kernel.


[(u'President', 0.9081345200538635),
 (u'POTUS', 0.8816766738891602),
 (u'president-elect', 0.706447184085846),
 (u'prez', 0.6907306909561157),
 (u'leader', 0.6872978210449219),
 (u'pres', 0.6713271737098694),
 (u'governor', 0.655921995639801),
 (u'nominee', 0.653347373008728),
 (u'candidate', 0.6508815884590149),
 (u'potus', 0.6495429277420044)]

In [19]:
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in X_train_tokens])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print 'vocab size :', len(tfidf)

building tf-idf matrix ...
vocab size : 83358


In [20]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [21]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x, X_train_tokens))])
train_vecs_w2v = scale(train_vecs_w2v)

  
100%|██████████| 1624387/1624387 [1:03:39<00:00, 425.28it/s]  


In [22]:
train_vecs_w2v[0]

array([-1.17400155e+00, -3.81570811e-01,  2.60891827e-01, -1.02098346e+00,
        1.70082224e-01, -4.96374635e-01, -3.65381386e-01, -1.49806016e-01,
       -6.30619389e-01,  3.13260099e-01, -4.76071223e-01,  3.18817708e-01,
        1.76434138e-01, -2.48037895e-02,  3.48019512e-01, -2.31588469e-01,
        1.13180690e+00,  4.83743075e-01,  3.51787163e-01,  1.13378385e-01,
        2.30597464e-01,  1.15528293e+00,  2.98077669e-01, -5.90046486e-01,
       -3.84583609e-01,  2.36381215e-01,  3.99946601e-01, -4.13608675e-02,
        5.94653190e-01, -3.05048930e-01,  8.70042554e-01,  9.50563352e-01,
        1.11380462e+00, -1.41038623e-03,  3.26366872e-01, -2.09679700e-01,
       -3.00376168e-02, -1.64077805e-01, -6.76065995e-01, -8.87663331e-01,
       -1.04987177e-01,  7.86936690e-01, -8.10187931e-02, -2.05129554e-01,
        1.40640866e+00, -7.78676971e-01,  9.28883623e-01, -1.06814729e+00,
       -1.03899027e+00, -3.77826313e-01, -5.19614172e-02,  6.66152058e-01,
       -4.09487148e-01,  

### predicting using a simple relu and sigmoid

In [29]:
from tensorflow.keras import layers

y_train_tokens = np.array(y_train.head(n).target)
y_train_tokens = np.where(y_train['target'] >= 0.5, 1, 0)

model = tf.keras.Sequential()
model.add(layers.Dense(32, activation='relu', input_dim=200))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train_tokens, validation_split=0.1, epochs=10, batch_size=32, verbose=1)

Train on 1461948 samples, validate on 162439 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a70b67490>

In [30]:
X_test_tokens = np.array(X_test.head(n).comment_text)

In [31]:
test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x, X_test_tokens))])
test_vecs_w2v = scale(test_vecs_w2v)

  
  if word in self.vocab:
100%|██████████| 180487/180487 [13:50<00:00, 217.33it/s]


In [33]:
y_test_tokens = np.array(y_test.head(n).target)
y_test_tokens = np.where(y_test['target'] >= 0.5, 1, 0)

In [36]:
model.evaluate(test_vecs_w2v, y_test_tokens, batch_size=1, verbose=1)



[0.3841848376119202, 0.90890205]

In [148]:
predictions = model.predict(train_vecs_w2v, batch_size=32, verbose=1)



In [150]:
predictions[0:100]

array([[0.10550284],
       [0.10020366],
       [0.15637097],
       [0.05361888],
       [0.09228337],
       [0.01896101],
       [0.08856001],
       [0.13741252],
       [0.107575  ],
       [0.14846838],
       [0.13224417],
       [0.0775145 ],
       [0.12619272],
       [0.09744084],
       [0.14715275],
       [0.08488065],
       [0.09309408],
       [0.07384944],
       [0.06016546],
       [0.16639474],
       [0.07065225],
       [0.24587655],
       [0.20288125],
       [0.04723823],
       [0.05556503],
       [0.07717627],
       [0.14939386],
       [0.14545235],
       [0.08786052],
       [0.06645051],
       [0.10472992],
       [0.09580493],
       [0.08110386],
       [0.10877705],
       [0.14006767],
       [0.10075536],
       [0.12203959],
       [0.14293498],
       [0.15352672],
       [0.05373228],
       [0.19092345],
       [0.2906788 ],
       [0.08364666],
       [0.03538153],
       [0.06459513],
       [0.0977957 ],
       [0.07322899],
       [0.054