# DistilBERT
Radoslav Evtimov, Martin Falli, Amanda Maiwald

In [0]:
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
from datetime import datetime
import numpy as np
import torch
import sklearn
import scipy

In [0]:
# setup Google Colab
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Directory structure
base_dir = '/content/drive/My Drive/BERT/BERT_Code_Input_Output/'
data_dir = base_dir + 'Data/'
model_dir = base_dir + 'simpletransformers'

In [0]:
# install simpletransformers
!pip install simpletransformers

In [0]:
# setup apex
%%writefile setup.sh

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --no-cache-dir ./

In [0]:
!sh setup.sh

In [0]:
# install transformers
!pip install transformers
!pip install seqeval
!pip install tensorboardx

In [0]:
# Load data
train = pd.read_csv(data_dir + 'train_cleaned_no_punkt.csv') 

In [0]:
# Transform data
train['mal'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1
train['nonmal'] = train['mal'] != 1
train['mal'] = train['mal'].astype(int)
train['nonmal'] = train['nonmal'].astype(int)
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)

In [0]:
train.head(5)

In [0]:
# put data into format for simpletransformers
del train['id']
del train['nonmal']

In [0]:
train.head(15)

In [0]:
# split
train, test = train_test_split(train, test_size = 0.2, stratify = train.mal)

In [0]:
# build DistilBERT
from simpletransformers.classification import ClassificationModel

model = ClassificationModel('distilbert', 'distilbert-base-uncased-distilled-squad', num_labels = 2, args = {'fp16': False, 
                                                                                                                              'train_batch_size': 32, 
                                                                                                                              'eval_batch_size': 32, 
                                                                                                                              'gradient_accumulation_steps': 1,
                                                                                                                              'learning_rate': 3e-5,
                                                                                                                              'num_train_epochs': 3,
                                                                                                                              'max_seq_length': 128,
                                                                                                                              'overwrite_output_dir' : True})

In [0]:
# train model
history_1 = model.train_model(train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


Features loaded from cache at cache_dir/cached_train_distilbert_128_2_127656


HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=3990, style=ProgressStyle(description…

Running loss: 0.534370



Running loss: 0.004512

HBox(children=(IntProgress(value=0, description='Current iteration', max=3990, style=ProgressStyle(description…

Running loss: 0.043346

HBox(children=(IntProgress(value=0, description='Current iteration', max=3990, style=ProgressStyle(description…

Running loss: 0.000242Training of distilbert model complete. Saved to outputs/.


In [0]:
import sklearn
result, model_outputs, wrong_predictions = model.eval_model(test, f1 = sklearn.metrics.f1_score)

In [0]:
model_outputs

array([[ 4.4578333, -4.9872546],
       [ 4.087954 , -4.5425353],
       [ 4.3954716, -4.8844934],
       ...,
       [-2.6226301,  3.5583413],
       [ 4.2153974, -4.664733 ],
       [ 4.347291 , -4.870133 ]], dtype=float32)

In [0]:
from scipy.special import softmax

In [0]:
softmax_1 = softmax(model_outputs, axis=1)

In [0]:
softmax_1 = pd.DataFrame (softmax_1)
softmax_1

Unnamed: 0,0,1
0,0.999921,0.000079
1,0.999822,0.000179
2,0.999907,0.000093
3,0.999922,0.000078
4,0.999876,0.000124
...,...,...
31910,0.091790,0.908210
31911,0.999826,0.000174
31912,0.002064,0.997936
31913,0.999861,0.000139


In [0]:
test_1 = test.copy()
test_1['bla'] = test_1['mal'] != 1

In [0]:
test_1.bla = test_1.bla.astype(int)

In [0]:
del test_1['comment_text']

In [0]:
test_1

Unnamed: 0,bla,mal
48419,1,0
9141,1,0
118158,1,0
99043,1,0
138180,1,0
...,...,...
43332,0,1
90047,1,0
56444,0,1
129458,1,0


In [0]:
test_1 =  test_1[['bla', 'mal']]

In [0]:
auc_pr = sklearn.metrics.average_precision_score(test_1, softmax_1)
auc_pr

0.9550713790138197

In [0]:
auc_roc = sklearn.metrics.roc_auc_score (test_1, softmax_1)
auc_roc

0.9821428394842109