# Train model

In [None]:
from pathlib import Path
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel

from constants import TOXIC_COMMENTS_DIR

In [None]:
# Read the training split; every column after the first two is used as a
# label column (assumes the first two hold the id and the text — TODO confirm).
raw_train = pd.read_csv(TOXIC_COMMENTS_DIR / 'train.csv')
labels = list(raw_train.columns[2:])

# Two-column frame for training: the comment text plus one list of label
# values per row.
train_df = pd.DataFrame({
    'text': raw_train['comment_text'],
    'labels': raw_train[labels].values.tolist(),
})

train_df.head()

In [None]:
# Read the test comments and their labels, keeping only labelled rows
# (rows whose 'toxic' value is -1 are filtered out).
raw_test_comments = pd.read_csv(TOXIC_COMMENTS_DIR / 'test.csv')
raw_test_labels = pd.read_csv(TOXIC_COMMENTS_DIR / 'test_labels.csv')
raw_test_labels = raw_test_labels[raw_test_labels['toxic'] != -1]

# Join on the explicit key instead of relying on "whatever columns overlap".
# NOTE(review): assumes 'id' is the only column the two files share — confirm.
raw_test = raw_test_comments.merge(raw_test_labels, on='id')

# Take text AND labels from the same merged frame so each comment is paired
# with its own label row, regardless of the row order of the two CSV files.
test_df = pd.DataFrame()
test_df['text'] = raw_test['comment_text']
test_df['labels'] = raw_test[labels].values.tolist()

test_df.head()

In [None]:
# Build a DistilBERT multi-label classifier with one output per label column.
model = MultiLabelClassificationModel(
    'distilbert',
    'distilbert-base-uncased',
    num_labels=len(labels),
    args=dict(
        reprocess_input_data=True,
        overwrite_output_dir=True,
        num_train_epochs=3,
    ),
)

# Fine-tune on the training frame ('text' / 'labels' columns).
model.train_model(train_df)

In [None]:
# Score the labelled test frame and show the metrics and the raw outputs.
result, model_outputs, wrong_predictions = model.eval_model(test_df)
print(result, model_outputs, sep='\n')

# Sanity check on a single hand-written sentence.
predictions, raw_outputs = model.predict(
    ['This thing is entirely different from the other thing. ']
)
print(predictions, raw_outputs, sep='\n')

In [None]:
# Predict on every test comment (including the rows without labels) and write
# the per-label model outputs to a submission file.
predictions, test_outputs = model.predict(raw_test_comments['comment_text'].tolist())

sub_df = pd.DataFrame(test_outputs, columns=labels)
sub_df['id'] = raw_test_comments['id']
# Put 'id' first, followed by the label columns. Reusing `labels` (instead of
# a second hard-coded list) keeps the header in sync with the columns the
# frame was actually built with.
sub_df = sub_df[['id', *labels]]
sub_df.to_csv('submission.csv', index=False)

# Predict all data

In [None]:
import itertools

def grouper_it(n, it):
    """Lazily split an iterable into consecutive chunks of at most ``n`` items.

    Args:
        n: Maximum number of items per chunk.
        it: Any iterable (list, generator, directory iterator, ...).

    Yields:
        An iterator over the next (up to) ``n`` items. All chunks draw from
        the same underlying iterator, so a chunk must be fully consumed
        before the next one is requested.
    """
    # Pin a single iterator: islice() over a re-iterable object such as a list
    # would otherwise start from the beginning on every pass and never finish.
    it = iter(it)
    while True:
        chunk_it = itertools.islice(it, n)
        try:
            first_el = next(chunk_it)
        except StopIteration:
            # Nothing left (or n == 0): stop rather than yield an empty chunk.
            return
        # Re-attach the element consumed by the emptiness probe above.
        yield itertools.chain((first_el,), chunk_it)

In [None]:
# Number of text files read and classified per batch.
CHUNK_SIZE = 100_000
# Used as the progress-bar total and as the HDF table's expected row count.
TOTAL_ROWS = 8_003_023
# Class names: the whitespace-separated tokens of classes.txt.
LABELS = TOXIC_COMMENTS_DIR.joinpath('classes.txt').read_text().split()

In [None]:
# Re-create the classifier from the 'outputs/' directory
# (presumably where the training run saved its weights — confirm).
model = MultiLabelClassificationModel('distilbert', 'outputs/', num_labels=len(LABELS))

In [None]:
from tqdm import tqdm_notebook

# Classify every text file in TEXTS_DIR in batches of CHUNK_SIZE and append
# the per-label outputs, keyed by filename, to an HDF5 table.
# NOTE(review): DATA_DIR and TEXTS_DIR are not imported in the visible part of
# this notebook — presumably they come from `constants`; confirm and import.
#
# The store is opened as a context manager so the file is flushed and closed
# even when a batch fails part-way through (it was previously left open).
with pd.HDFStore(DATA_DIR / 'toxicity_classifications.h5', mode='w') as STORE, \
        tqdm_notebook(total=TOTAL_ROWS) as pbar:
    for chunk in grouper_it(CHUNK_SIZE, TEXTS_DIR.iterdir()):
        pbar.set_description('Loading texts')
        # Materialise the lazy chunk once; it is needed for texts and names.
        paths = list(chunk)
        texts = [file.read_text() for file in paths]

        pbar.set_description('Making predictions')
        predictions, outputs = model.predict(texts)

        pbar.set_description('Concatenating predictions to dataframe')
        filenames = [file.name for file in paths]
        df = pd.DataFrame(filenames, columns=['filename'])
        df[LABELS] = pd.DataFrame(outputs)

        pbar.set_description('Appending to HDF file')
        STORE.append('df', df, expectedrows=TOTAL_ROWS, index=False)

        # Advance by the real batch size: the final chunk is normally shorter
        # than CHUNK_SIZE, which used to push the bar past its total.
        pbar.update(len(paths))

# Load data

In [1]:
from pathlib import Path
import pandas as pd

from constants import TOXICITY_CLASSIFICATIONS_H5, TOXICITY_SCORES_PICKLE

In [2]:
# Read the stored predictions into memory. Open read-only (the default mode
# is append, which can create or modify the file) and close the handle as
# soon as the frame is loaded instead of leaving it open for the session.
with pd.HDFStore(TOXICITY_CLASSIFICATIONS_H5, mode='r') as store:
    pred_df = store['df']

In [3]:
# Load the reference toxicity scores and preview the first rows.
# (Unpickling can execute arbitrary code — only load files you trust.)
true_df_raw = pd.read_pickle(TOXICITY_SCORES_PICKLE)
true_df_raw.head(n=5)

Unnamed: 0,filename,text,chunk_num,toxicity,severe_toxicity,identity_attack,insult,threat,profanity,sexually_explicit,flirtation
0,0776249-6b69e163629d0603a2e57c0af9b77128.txt,I’m a unionist. I believe in the Union.\n\nI b...,0,0.148363,0.080491,0.402546,0.231428,0.26212,0.124949,0.146768,0.364588
1,0588122-39c89eab36c8e7ffbb186065859a61ea.txt,PC leadership hopefuls bring little to race\n\...,0,0.148575,,,,,,,
2,0716632-aeaed959671fe280ac6d11d3b66f594e.txt,But not because of the reasons you may believe...,0,0.143574,0.062929,0.26701,0.181736,0.328784,0.108916,0.178888,0.432094
3,0899501-3b54556f829125713e427e6509a713e6.txt,Foxtons has lost its second legal case in two ...,0,0.08049,0.047757,0.155986,0.109512,0.261333,0.090906,0.133572,0.373036
4,0000584-6470f33fa1e68138978182728c22fb4d.txt,The suspect in the apparent murder of a 22-yea...,0,0.065222,0.069232,0.244023,0.141029,0.47259,0.142407,0.245892,0.415427


In [4]:
# Score columns compared between the reference table and the classifier.
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'threat']

# Reference side: discard rows with any missing value (original note: "old API
# calls") and the chunk_num column (original note: "large API calls").
true_df = true_df_raw.dropna().drop(columns='chunk_num')

# Keep one row per filename: duplicated() flags every repeat after the first.
# NOTE(review): the earlier comment here read "Remove all chunked data", but
# this keeps the first row of each repeated filename. If files that appear
# more than once should be excluded entirely, keep=False is needed — confirm.
true_df = true_df.loc[~true_df.duplicated(subset='filename')]

# Narrow to the join key, the text and the shared score columns.
true_df = true_df[['filename', 'text', *labels]]

# Prediction side: map the classifier's column names onto the reference
# names, then keep only the join key and the shared score columns.
pred_df = pred_df.rename(columns={
    "toxic": "toxicity",
    "severe_toxic": "severe_toxicity",
    "threat": "threat",
    "insult": "insult",
    "identity_hate": "identity_attack"
})[['filename', *labels]]

# Inner join on filename; overlapping score columns get _true / _pred suffixes.
merged_df = true_df.merge(pred_df, how='inner', on='filename', suffixes=('_true', '_pred'))

# Binarise at 0.5; y_pred keeps the continuous scores for ranking metrics.
y_true = merged_df[[name + '_true' for name in labels]].gt(0.5)
y_pred = merged_df[[name + '_pred' for name in labels]]
y_pred_binary = y_pred.gt(0.5)

# Metrics

In [41]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, multilabel_confusion_matrix

In [31]:
# With multilabel input this is subset accuracy: a row only counts as correct
# when all of its labels match, so rows with no positive label dominate it.
accuracy_score(y_true=y_true, y_pred=y_pred_binary)

0.8957028472918864

In [44]:
# Per-label ROC AUC on the continuous scores (average=None: no averaging).
{name: auc for name, auc in zip(labels, roc_auc_score(y_true, y_pred, average=None))}

{'toxicity': 0.8640192605222321,
 'severe_toxicity': 0.8368362267292841,
 'identity_attack': 0.7510143725259711,
 'insult': 0.8378905527190207,
 'threat': 0.7519223262803563}

In [34]:
# Precision / recall / F1 per label at the 0.5 threshold.
print(classification_report(y_true=y_true, y_pred=y_pred_binary, target_names=labels))

                 precision    recall  f1-score   support

       toxicity       0.65      0.09      0.16     17750
severe_toxicity       1.00      0.00      0.00     11527
identity_attack       1.00      0.00      0.00     44692
         insult       0.96      0.01      0.02     27681
         threat       1.00      0.00      0.00     45259

      micro avg       0.68      0.01      0.03    146909
      macro avg       0.92      0.02      0.04    146909
   weighted avg       0.95      0.01      0.02    146909
    samples avg       0.00      0.00      0.00    146909



In [40]:
# One 2x2 matrix per label, in `labels` order, laid out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(y_true=y_true, y_pred=y_pred_binary)

array([[[795792,    881],
        [ 16137,   1613]],

       [[802896,      0],
        [ 11520,      7]],

       [[769731,      0],
        [ 44658,     34]],

       [[786731,     11],
        [ 27408,    273]],

       [[769164,      0],
        [ 45253,      6]]])