<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importing-packages-&amp;-dataset" data-toc-modified-id="Importing-packages-&amp;-dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Importing packages &amp; dataset</a></span></li><li><span><a href="#Feature-engineering-the-labels" data-toc-modified-id="Feature-engineering-the-labels-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature engineering the labels</a></span></li><li><span><a href="#Cleaning-the-comments-column" data-toc-modified-id="Cleaning-the-comments-column-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning the comments column</a></span><ul class="toc-item"><li><span><a href="#Applying-the-cleaning-functions-used-in-the-main-notebook" data-toc-modified-id="Applying-the-cleaning-functions-used-in-the-main-notebook-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Applying the cleaning functions used in the main notebook</a></span></li><li><span><a href="#Removing-everything-that-isn't-a-letter-or-a-whitespace" data-toc-modified-id="Removing-everything-that-isn't-a-letter-or-a-whitespace-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Removing everything that isn't a letter or a whitespace</a></span></li><li><span><a href="#Removing-stop-words-&amp;-lemmatising" data-toc-modified-id="Removing-stop-words-&amp;-lemmatising-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Removing stop-words &amp; lemmatising</a></span></li></ul></li><li><span><a href="#Converting-the-data-into-a-spaCy-compatible-format-&amp;-train-test-split" data-toc-modified-id="Converting-the-data-into-a-spaCy-compatible-format-&amp;-train-test-split-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Converting the data into a spaCy-compatible format &amp; train-test split</a></span></li><li><span><a href="#Training-a-TextCategoriser-model" data-toc-modified-id="Training-a-TextCategoriser-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Training a <a href="https://spacy.io/api/textcategorizer" rel="nofollow" target="_blank">TextCategoriser</a> model</a></span></li><li><span><a href="#Applying-the-model-to-unseen-data" data-toc-modified-id="Applying-the-model-to-unseen-data-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Applying the model to unseen data</a></span></li></ul></div>

# Supplementary workbook: Classifying Toxicity Levels using additionally pre-processed data

## Importing packages & dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.set_printoptions(precision=4)
sns.set(font_scale=1.5)
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
df = pd.read_csv("/Users/anastasiakuzmich/Desktop/jigsaw-toxic-comment-classification-challenge/train.csv")

print("The dataset contains %s entries with %s features." 
      % (df.shape[0], df.shape[1]))

The dataset contains 159571 entries with 8 features.


## Feature engineering the labels

In [3]:
df['total_score'] = (df['toxic'] 
                     + df["severe_toxic"] 
                     + df["obscene"] 
                     + df["threat"]
                     + df["insult"]
                     + df["identity_hate"])

df['total_score'].value_counts(normalize=True)

0    0.898321
1    0.039857
3    0.026377
2    0.021808
4    0.011030
5    0.002413
6    0.000194
Name: total_score, dtype: float64

In [4]:
def non_toxic_mapper(x):
    if x == 0: 
        return 1
    else:
        return 0

df["non_toxic"] = df["total_score"].map(non_toxic_mapper)
df["non_toxic"].value_counts()

1    143346
0     16225
Name: non_toxic, dtype: int64

In [5]:
df = df.drop("total_score", axis=1)

## Cleaning the comments column

### Applying the cleaning functions used in the main notebook

In [6]:
def non_toxic_mapper(x):
    
    """Maps the non-toxic column based on the total score column."""
    
    if x == 0: 
        return 1
    else:
        return 0

def add_non_toxic_column(df):
    
    """Adds a non-toxic column."""
    
    df['total_score'] = (df['toxic'] + df["severe_toxic"] 
                         + df["obscene"] + df["threat"]
                         + df["insult"] + df["identity_hate"])

    df["non_toxic"] = df["total_score"].map(non_toxic_mapper)
    df = df.drop("total_score", axis=1)
    
    return df

def clean_comments_column(df):
    
    """Cleans the comments column."""
    
    import re
    
    IpAddressRegex = "(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
    DateTimeRegex = "(\d{2}\:\d{2}\, [A-Z][a-z]{2,8}\ \d{1,2}\, \d{4}\ \([A-Z]{3}\))|(\d{2}\:\d{2}\, \d{2}\ [A-Z][a-z]{2}\ \d{4}\ \([A-Z]{3}\))|(\d{2}\:\d{2}\, \d{1,2}\ [A-Z][a-z]{2,8} \d{4})|(\d{1,2}\ [A-Z][a-z]{2,8}\ \d{4}\ \([A-Z]{3}\))|(\d{1,2}\ [A-Z][a-z]{1,7}\ \d{4})|(\d{2}\:\d{2}\, \d{1,2})|([A-Z][a-z]{2,8}\ \d{4}\ \([A-Z]{3}\))"
    
    features_to_remove = [IpAddressRegex,
                          DateTimeRegex,
                          '\\n', r'\\']
    
    for feature in features_to_remove:
        df["comment_text"] = df["comment_text"].replace(feature,' ', regex=True)
 
    df['comment_text'] = df['comment_text'].replace(u'\xa0', u' ')
    
    return df

In [7]:
df = clean_comments_column(df)

### Removing everything that isn't a letter or a whitespace

In [8]:
from tqdm.auto import tqdm

whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
new_comments = []

for comment in tqdm(df["comment_text"]):
    new_comment = ''.join(filter(whitelist.__contains__, comment)).lower()
    new_comments.append(new_comment)
    
df['comment_text'] = new_comments
del new_comments

  0%|          | 0/159571 [00:00<?, ?it/s]

###  Removing stop-words & lemmatising

In [9]:
import spacy
from tqdm.auto import tqdm
import time
nlp = spacy.load("en_core_web_md", exclude=["parser", "ner"])
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(spacy_stopwords))
print('First ten stop words: %s' % list(spacy_stopwords)[:10])
print(nlp.pipe_names)

Number of stop words: 326
First ten stop words: ['hers', 'through', 'becomes', 'get', 'move', 'you', 'us', 'be', 'off', 'top']
['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']


In [10]:
comments = df["comment_text"]
preprocessed_comments = []

for doc in tqdm(list(nlp.pipe(df["comment_text"]))):
    preprocessed_comment = [token.lemma_ for token in doc if not token.is_stop]
    preprocessed_comments.append(' '.join(preprocessed_comment))

  0%|          | 0/159571 [00:00<?, ?it/s]

In [11]:
for i in range(2):
    print(df["comment_text"][i])
    print(" ")
    print("Processed:", preprocessed_comments[i])
    print(" ")

explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now 
 
Processed: explanation edit username hardcore metallica fan revert nt vandalism closure gas vote new york doll fac nt remove template talk page m retire
 
daww he matches this background colour im seemingly stuck with thanks  talk  
 
Processed: daww match background colour m seemingly stick thank   talk  
 


In [12]:
df["comment_text"] = preprocessed_comments

In [13]:
del comments, preprocessed_comments

## Converting the data into a spaCy-compatible format & train-test split

In [14]:
# Extracting the labels
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic']]

# Labels variable to be used in the pipeline
labels = list(y.columns)

# Converting the labels to a spacy-compatible dictionary 
y = y.to_dict("index")

In [15]:
dataset = list(zip(df['comment_text'],[{'cats': cats} for cats in y.values()]))
print(dataset[0])

('explanation edit username hardcore metallica fan revert nt vandalism closure gas vote new york doll fac nt remove template talk page m retire', {'cats': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0, 'non_toxic': 1}})


In [16]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, 
                                         train_size=0.8, 
                                         random_state=13)

In [17]:
print("SAMPLES IN TRAINING SET: ", len(train_data))
print("SAMPLES IN TESTING SET: ", len(test_data))
print(" ")
print("TRAINING EXAMPLE:", train_data[0])

SAMPLES IN TRAINING SET:  127656
SAMPLES IN TESTING SET:  31915
 
TRAINING EXAMPLE: ('okay one gon na address guess edit page care accuracy', {'cats': {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0, 'non_toxic': 1}})


## Training a [TextCategoriser](https://spacy.io/api/textcategorizer) model

In [18]:
nlp_train = spacy.blank("en")
nlp_train.pipe_names

[]

In [19]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

config = {"threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL}

textcat = nlp_train.add_pipe("textcat_multilabel", config=config)
print(nlp_train.pipe_names)

['textcat_multilabel']


In [20]:
for label in labels:
    textcat.add_label(label)
    
textcat.labels

('toxic',
 'severe_toxic',
 'obscene',
 'threat',
 'insult',
 'identity_hate',
 'non_toxic')

In [21]:
optimizer = nlp_train.begin_training()
iterations = 10

In [22]:
from tqdm.auto import tqdm
import time

from spacy.util import minibatch, compounding
from spacy.training import Example

In [23]:
with nlp.select_pipes(enable="textcat_multilabel"):
    for j in range(iterations):
        
        losses = {}
        batches = minibatch(train_data, size = compounding(4.,32.,1.001))
        pbar = tqdm(batches, total=5227)
        
        for batch in pbar:
            pbar.set_description(f"Iteration {j+1}")
            text, annotations = zip(*batch)
            example = []
            
            for i in range(len(text)):
                doc = nlp.make_doc(text[i])
                example.append(Example.from_dict(doc, annotations[i]))
            
            nlp.update(example, sgd=optimizer, drop=0.2, losses = losses)

#         print("\n\n Completed Iterations : {} ".format(j+1))

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

  0%|          | 0/5227 [00:00<?, ?it/s]

## Applying the model to unseen data

In [24]:
from spacy.scorer import Scorer

In [25]:
examples = []
scorer = Scorer()

for text, annotations in tqdm(test_data):
    doc = nlp_train.make_doc(text)
    example = Example.from_dict(doc, annotations)
    example.predicted = nlp_train(str(example.predicted))
    examples.append(example)

  0%|          | 0/31915 [00:00<?, ?it/s]

In [26]:
scorer.score_cats(examples, "cats", labels=labels)

{'cats_score': 0.4933680304932667,
 'cats_score_desc': 'macro AUC',
 'cats_micro_p': 0.10126473944465576,
 'cats_micro_r': 0.29965107771962407,
 'cats_micro_f': 0.15137386458940424,
 'cats_macro_p': 0.15654536416947587,
 'cats_macro_r': 0.4656874956408263,
 'cats_macro_f': 0.10811961766492087,
 'cats_macro_auc': 0.4933680304932667,
 'cats_f_per_type': {'toxic': {'p': 0.08680877355576151,
   'r': 0.2821285140562249,
   'f': 0.13276635955587054},
  'severe_toxic': {'p': 0.00923713967192228,
   'r': 0.19141914191419143,
   'f': 0.0176238225463385},
  'obscene': {'p': 0.04146072268343011,
   'r': 0.5320197044334976,
   'f': 0.07692650135778836},
  'threat': {'p': 0.002482128673550437,
   'r': 0.5555555555555556,
   'f': 0.004942176534545813},
  'insult': {'p': 0.05203945085988997,
   'r': 0.6986027944111777,
   'f': 0.09686346863468635},
  'identity_hate': {'p': 0.010448143405889884,
   'r': 0.7364620938628159,
   'f': 0.020603979396020603},
  'non_toxic': {'p': 0.8933411903358869,
   'r':

💡 There is a drastic decrease in performace when the input does not consist of natural sentences. 