In [1]:
import numpy as np
import pandas as pd

# Youtube toxic comments

About Dataset
This is a hand-labelled toxicity data set containing 1000 comments crawled from YouTube videos about the Ferguson unrest in 2014. In addition to toxicity, this data set contains labels for multiple subclassifications of toxicity which form a hierarchical structure. Each comment can have multiple of these labels assigned. The structure can be seen in the following enumeration:

__IsToxic__

_and sub-categories :_
- IsAbusive
- IsThreat
- IsProvocative
- IsObscene
- IsHatespeech
- IsRacist
- IsNationalist
- IsSexist
- IsHomophobic
- IsReligiousHate
- IsRadicalism

### loading and visualizing data:

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_csv('drive/MyDrive/Colab notebooks/archive/youtoxic_english_1000.csv')
data = data.drop(['CommentId','VideoId'],axis=1)
data.head(3)

Unnamed: 0,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsSexist,IsHomophobic,IsReligiousHate,IsRadicalism
0,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False,False,False,False
1,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False,False,False,False
2,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False,False,False,False


# 1. Binary Classification : Toxic comment or not ?

## 1.1 TFIDF + Classifier

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train, test = train_test_split(data, test_size=0.2)
x_train = train['Text'].values; y_train = train['IsToxic'].values;
x_test = test['Text'].values; y_test = test['IsToxic'].values;

### creating my sklearn Transformer 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin

class MyTransformer(TransformerMixin):
    def __init__(self, ):
        self.tfidf_vectorizer = TfidfVectorizer()
    
    def fit(self, X, y):
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.tfidf_vectorizer.transform(X)
        

## testing accuracy of different types of classifier :

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, LinearSVR
from sklearn.pipeline import Pipeline

models = [Pipeline([('vectorizer',MyTransformer()),
                 ("classifier",clf())]) for clf in [LogisticRegression, SVC, LinearSVC ]]

for model in models :    
    model.fit(x_train, y_train);
    print(str(model['classifier'])[:-2], 'accuracy:', f'{100*model.score(x_test,y_test)}%')

LogisticRegression accuracy: 70.5%
SVC accuracy: 71.5%
LinearSVC accuracy: 73.5%


## 1.2 Using NLP pretrained model BERT + keras NN

In [8]:
! pip install transformers
! pip install tokenization
! pip install bert-tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 32.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 76.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenization
  Downloading tokenization-1.0.7-py3-none-any.whl (10 kB)
Installing collected packages: tokenization
Su

In [36]:
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow as tf
import tensorflow_hub as hub

m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=False)

In [64]:
from bert import tokenization
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
        
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
        
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [65]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    clf_output = sequence_output[:, 0, :]
    
    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    # lay = tf.keras.layers.Dropout(0.2)(lay)
    # lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    # lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(lay)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=2e-4), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [66]:
import sys
from absl import flags
sys.argv=['preserve_unused_tokens=False']
flags.FLAGS(sys.argv)

max_len = 150
train_input = bert_encode(train['Text'].values, tokenizer, max_len=max_len)
test_input = bert_encode(test["Text"].values, tokenizer, max_len=max_len)

In [67]:
train_labels = train['IsToxic']
test_labels = test['IsToxic']

In [68]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 150)]        0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     multiple             109482241   ['input_word_ids[0][0]',         
                                                                  'input_mask[0][0]',       

In [69]:
model.fit(
    train_input, train_labels,
    epochs=10,
    validation_data=(test_input,test_labels),
    batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efa4d50a810>