In [1]:
import os
import re
import gc

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling1D, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC
from keras.callbacks import Callback, EarlyStopping
from keras import backend as K

from transformers import DistilBertTokenizer, TFDistilBertModel

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

# Directory layout, relative to the notebook's working directory.
DATA_PATH = './data'  # competition CSVs and cached .npy embeddings / labels are read from here
SOLUTION_PATH = './solutions'  # submission CSVs are written here
WEIGHTS_PATH = './model_weights'  # the RocAucEvaluation callback saves the best model here

# Lower TensorFlow's C++ log verbosity.
# NOTE(review): tensorflow is already imported above by the time this line runs;
# this variable is usually expected to be set before the import — confirm it
# actually has the intended effect in this position.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

In [2]:
# Read the competition train/test tables from DATA_PATH.
train, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
list_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
train['comment_text'].head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [5]:
# Ordered (pattern, replacement) pairs used by clean_text.
# Order matters: longer / more specific patterns must come before the generic
# ones that would otherwise consume part of them ("what's" and "'scuse" before
# "'s", "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a raw comment for tokenisation.

    Steps: lowercase, expand common English contractions, replace every
    non-word character with a space, collapse whitespace runs into a single
    space and trim the ends.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, single-line, lowercase text (may be empty).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings avoid the invalid-escape warnings of '\W' / '\s'.
    text = re.sub(r"\W", " ", text)
    # \s+ already covers newlines, so no separate '\n' substitution is needed.
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [6]:
# Normalise the comment text of both splits with clean_text.
train['comment_text'] = train['comment_text'].map(clean_text)
test['comment_text'] = test['comment_text'].map(clean_text)

In [7]:
train[train.columns[2:]].sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [8]:
# Pretrained checkpoint shared by the tokenizer and the encoder.
model_name = 'distilbert-base-uncased'
# Fixed token length: the tokenizer calls in this notebook truncate/pad every comment to this size.
max_length = 60

tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
# output_hidden_states=True exposes the per-layer hidden states that get_model() pools.
distilbert = TFDistilBertModel.from_pretrained(model_name, output_hidden_states=True, return_dict=True)

2022-04-28 22:47:16.926518: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2022-04-28 22:47:17.062988: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93763584 exceeds 10% of free system memory.
2022-04-28 22:47:17.188644: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93763584 exceeds 10% of free system memory.
2022-04-28 22:47:17.207008: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93763584 exceeds 10% of free system memory.
2022-04-28 22:47:18.150018: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93763584 exceeds 10% of free system memory.
2022-04-28 22:47:18.205887: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 93763584 exceeds 10% of free system memory.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['voc

In [18]:
tokenizer.get_vocab()['cat']

4937

In [19]:
tokenizer.tokenize("fuck you soyboy!")

['fuck', 'you', 'soy', '##boy', '!']

In [15]:
%%time
# Smoke test: encode only the first five cleaned training comments.
x = tokenizer(
    text=train['comment_text'].head(5).tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
    verbose=True,
)

CPU times: user 6.95 ms, sys: 0 ns, total: 6.95 ms
Wall time: 6.63 ms


In [None]:
def get_model():
    """Wrap the pretrained DistilBERT encoder as a fixed-length feature extractor.

    The model takes `input_ids` and `attention_mask` (both int32, length
    `max_length`), average-pools each of the last three entries of the
    encoder's `hidden_states` and concatenates the pooled vectors, last
    layer first.

    Returns
    -------
    Model
        An uncompiled Keras model mapping the two inputs to the concatenated
        pooled embedding.
    """
    inputs = {
        'input_ids': Input(shape=(max_length,), name='input_ids', dtype='int32'),
        'attention_mask': Input(shape=(max_length,), name='attention_mask', dtype='int32'),
    }
    hidden_states = distilbert.distilbert(inputs)['hidden_states']

    # NOTE(review): no mask is handed to the pooling layers, so padded
    # positions presumably contribute to the average — confirm this is intended.
    pooled = []
    for layer_idx in (-1, -2, -3):
        pooled.append(GlobalAveragePooling1D()(hidden_states[layer_idx]))

    return Model(inputs=inputs, outputs=concatenate(pooled))

# Build the embedding extractor and print its layer summary.
model = get_model()
model.summary()

In [34]:
%%time
# Encode the full training text (column 0 of `X`) to fixed-length token ids + attention masks.
# NOTE(review): `X` is not defined in any visible cell of this notebook, so this
# cell fails on a fresh kernel. From its usage here, column 0 holds the comment
# text; the remaining columns are stacked next to the embeddings in later
# cells — define it explicitly before this cell.
x = tokenizer(
    text=list(X.values[:, 0]),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

CPU times: user 1min 16s, sys: 70.7 ms, total: 1min 16s
Wall time: 1min 16s


In [35]:
%%time
# Run the extractor over the encoded training comments to get pooled embeddings.
output = model.predict(
    x={key: x[key] for key in ('input_ids', 'attention_mask')},
    batch_size=16,
)

CPU times: user 3h 12min 22s, sys: 1min 26s, total: 3h 13min 49s
Wall time: 28min 19s


In [37]:
# np.save(os.path.join(DATA_PATH, 'train_data_embeddings3'), np.hstack((output, X.values[:, 1:])))

In [53]:
res = np.load(os.path.join(DATA_PATH, 'train_data_embeddings_250_3072.npy'), allow_pickle=True)

In [54]:
res.shape

(159571, 3072)

In [40]:
np.array_equal(res, np.hstack((output, X.values[:, 1:])))

True

In [45]:
res = np.load(os.path.join(DATA_PATH, 'train_data_embeddings_125.npy'), allow_pickle=True)
y = np.load(os.path.join(DATA_PATH, 'train_label.npy'), allow_pickle=True)

In [46]:
res.shape, y.shape

((159571, 768), (159571, 6))

In [2]:
os.listdir(DATA_PATH)

['test.csv',
 'train.csv',
 'test_labels.csv',
 'train_label.npy',
 'sample_submission.csv',
 'test_data_embeddings_sst_250_2304.npy',
 'train_data_embeddings_sst_250_2304.npy']

In [10]:
# Load the cached train/test embeddings and the training labels from DATA_PATH.
# NOTE(review): allow_pickle=True executes pickled objects on load — only use
# it with files produced by this project. The later .astype('float32') casts
# suggest these arrays were saved with object dtype; confirm before switching
# to allow_pickle=False.
res = np.load(os.path.join(DATA_PATH, 'train_data_embeddings_sst_250_2304.npy'), allow_pickle=True)
y = np.load(os.path.join(DATA_PATH, 'train_label.npy'), allow_pickle=True)
res_test = np.load(os.path.join(DATA_PATH, 'test_data_embeddings_sst_250_2304.npy'), allow_pickle=True)

In [6]:
def get_final_model(input_dim=768*3):
    """Build and compile the small feed-forward head trained on pooled embeddings.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(n_labels, sigmoid),
    where n_labels is ``len(list_classes)``. Compiled with Adam, binary
    cross-entropy and an AUC metric.

    Parameters
    ----------
    input_dim : int, optional
        Width of the input embedding. Defaults to 768*3 (three pooled
        768-dimensional hidden states), matching the original behaviour.

    Returns
    -------
    Model
        The compiled Keras model.
    """
    input_vec = Input(shape=(input_dim,), name='polled_embedding', dtype='float32')

    x = Dense(64, activation='relu', name='hidden')(input_vec)
    x = Dropout(0.2)(x)
    y = Dense(len(list_classes), activation='sigmoid', name='outputs')(x)

    model = Model(inputs=input_vec, outputs=y)

    # Pin the metric name: an unnamed AUC() gets an auto-generated name
    # ('auc', 'auc_1', ...) when several are created in one session, which
    # silently breaks callbacks that monitor 'val_auc'.
    model.compile(loss=BinaryCrossentropy(),
                  optimizer=Adam(),
                  metrics=[AUC(name='auc')])
    return model

# Build the classification head and print its layer summary.
ffn = get_final_model()
ffn.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 polled_embedding (InputLaye  [(None, 2304)]           0         
 r)                                                              
                                                                 
 hidden (Dense)              (None, 64)                147520    
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 outputs (Dense)             (None, 6)                 390       
                                                                 
Total params: 147,910
Trainable params: 147,910
Non-trainable params: 0
_________________________________________________________________


In [106]:
class RocAucEvaluation(Callback):
    """Keras callback: ROC AUC on a validation set with checkpointing and early stopping.

    After every epoch the model predicts on the validation data and the score
    from ``roc_auc_score(y_val, y_pred)`` is printed and stored in
    ``logs['roc_auc']``. Whenever the score improves, the whole model is saved
    to ``save_path``. Training stops once the score has not improved for
    ``patience`` consecutive epochs.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving epochs after which training is
        stopped (same convention as Keras' EarlyStopping).
    validation_data : tuple
        ``(X_val, y_val)`` pair used for scoring.
    save_path : str, optional
        Where the best model is saved. Defaults to
        ``os.path.join(WEIGHTS_PATH, 'ffn')``.

    Raises
    ------
    ValueError
        If ``validation_data`` is not a pair.
    """

    def __init__(self, patience, validation_data=(), save_path=None):
        # super(Callback, self) would skip Callback.__init__ entirely.
        super().__init__()

        if len(validation_data) != 2:
            raise ValueError('validation_data must be an (X_val, y_val) pair')
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0
        self.patience = patience
        self.save_path = (save_path if save_path is not None
                          else os.path.join(WEIGHTS_PATH, 'ffn'))

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data; checkpoint or count towards stopping."""
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)

        print(f'epoch: {epoch}\nroc_auc: {round(score, 4)}')
        # logs defaults to None rather than a shared mutable dict.
        if logs is not None:
            logs['roc_auc'] = score

        if score > self.max_score:
            self.max_score = score
            self.not_better_count = 0
            self.model.save(self.save_path)
        else:
            self.not_better_count += 1
            # '>=' so that training stops after exactly `patience` bad epochs.
            if self.not_better_count >= self.patience:
                self.model.stop_training = True
                print()
                print(f'Best roc_auc score: {round(self.max_score, 4)}')
                print('Early Stopping triggered.')

In [99]:
# Hold out 25% of the cached embeddings; the first 768*3 columns are the features.
X_train, X_test, y_train, y_test = train_test_split(
    res[:, :768*3], y,
    test_size=0.25,
    random_state=42,
)

# Score ROC AUC on the hold-out split after every epoch (patience-based stopping).
rocauc_early_stopping = RocAucEvaluation(
    validation_data=(X_test.astype('float32'), y_test.astype('int8')),
    patience=5,
)

# Short two-epoch run of the head on the training split.
history = ffn.fit(
    x=X_train.astype('float32'),
    y=y_train.astype('int8'),
    epochs=2,
    batch_size=64,
    verbose=1,
    callbacks=[rocauc_early_stopping],
)

Epoch 1/2
Epoch 2/2


In [9]:
# Reset Keras' global state and rebuild the head, as the K-fold cell does.
# Clearing the session while reusing an old `ffn` leaves a previously
# trained model in play, and its AUC metric may carry an auto-generated name
# such as 'auc_1', in which case 'val_auc' is never logged and EarlyStopping
# cannot act. A model built right after clear_session() logs 'auc'/'val_auc'.
K.clear_session()
ffn = get_final_model()

# AUC is better when higher, so state the direction explicitly.
early_stopping = EarlyStopping(monitor='val_auc', mode='max', patience=5, restore_best_weights=True)

# validation_split holds out a fraction of the provided arrays for validation.
history = ffn.fit(x=res[:, :768*3].astype('float32'),
                  y=y.astype('int8'),
                  validation_split=0.2,
                  callbacks=[early_stopping],
                  batch_size=64,
                  epochs=100,
                  verbose=1)

Epoch 1/100


2022-04-27 23:58:11.728850: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1176477696 exceeds 10% of free system memory.




2022-04-27 23:58:17.038325: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 294128640 exceeds 10% of free system memory.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


In [31]:
# history.history['roc_auc']

In [90]:
list_classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [108]:
# Per-label ROC AUC of the current `ffn` on the hold-out split, plus the mean.
# NOTE(review): if `ffn` was last fitted on the full `res` array, the hold-out
# rows were part of its training data and these scores are optimistic.
y_pred = ffn.predict(X_test.astype('float32'))
score = roc_auc_score(y_test.astype('int8'), y_pred, average=None)
score, score.mean()

(array([0.9757575 , 0.99098338, 0.98627091, 0.99345567, 0.98136015,
        0.98507512]),
 0.9854837884159305)

In [69]:
reconstructed_model = tf.keras.models.load_model(os.path.join(WEIGHTS_PATH, 'ffn'))

In [70]:
# Same hold-out evaluation, using the checkpoint reloaded from WEIGHTS_PATH.
y_pred = reconstructed_model.predict(X_test.astype('float32'))
score = roc_auc_score(y_test.astype('int8'), y_pred, average=None)
score, score.mean()

(array([0.97027139, 0.98849069, 0.9812639 , 0.98898447, 0.97704308,
        0.977443  ]),
 0.9805827541562797)

# Test

In [None]:
# Re-read the raw test table and apply the same text normalisation as for training.
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
test['comment_text'] = test['comment_text'].map(clean_text)

In [None]:
# Encode every cleaned test comment with the same settings used for training.
x_test = tokenizer(
    text=test['comment_text'].tolist(),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
    verbose=True,
)

In [None]:
# Embed the *test* comments. The encodings produced by the previous cell are
# in `x_test`; `x` still holds the training encodings, so using it here would
# silently produce training-set embeddings labelled as test data.
output = model.predict(x={'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
                       batch_size=16)

In [None]:
# np.save(os.path.join(DATA_PATH, 'test_data_embeddings'), output)

In [62]:
# Load cached test embeddings.
# NOTE(review): this rebinds `res_test`, which an earlier cell loads from
# 'test_data_embeddings_sst_250_2304.npy', and the file named here does not
# appear in the os.listdir(DATA_PATH) output shown above — confirm which
# embedding file the downstream predictions are meant to use.
res_test = np.load(os.path.join(DATA_PATH, 'test_data_embeddings_250_3072.npy'), allow_pickle=True)

In [11]:
# predict = reconstructed_model.predict(res_test.astype('float32'), batch_size=16)
predict = ffn.predict(res_test[:, :768*3].astype('float32'), batch_size=16)

2022-04-28 00:00:59.909149: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1411559424 exceeds 10% of free system memory.


In [12]:
# Fill the submission template with the predicted label probabilities.
sample_submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
sample_submission[list_classes] = predict
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(SOLUTION_PATH, exist_ok=True)
sample_submission.to_csv(os.path.join(SOLUTION_PATH, 'distilbert_sst_250_2304.csv'), index=False)

# test sst2

In [168]:
num_folds = 5
epochs = 100
batch_size = 64
kf = KFold(n_splits=num_folds, shuffle=True, random_state=30)

# Slice the test embeddings to the head's input width (same slice as the
# single-model prediction cell) and cast once, instead of on every fold.
# Without the slice a wider cached embedding file makes predict() fail.
test_features = res_test[:, :768*3].astype('float32')

# Accumulator for the fold-averaged test predictions.
predict = np.zeros((test_features.shape[0], len(list_classes)))

# NOTE(review): folds are drawn from X_train / y_train (the train_test_split
# output), not from the full `res` / `y` arrays — confirm this is intended.
for train_index, test_index in kf.split(X_train):
    y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]
    X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]

    # Fresh Keras state and a freshly initialised head for every fold.
    K.clear_session()
    ffn = get_final_model()

    rocauc_early_stopping = RocAucEvaluation(patience=5,
                                             validation_data=(X_val_fold.astype('float32'),
                                                              y_val_fold.astype('int8')))
    ffn.fit(x=X_train_fold.astype('float32'),
            y=y_train_fold.astype('int8'),
            callbacks=[rocauc_early_stopping],
            batch_size=batch_size,
            epochs=epochs,
            verbose=0)

    # Reload this fold's best checkpoint written by the callback.
    reconstructed_model = tf.keras.models.load_model(os.path.join(WEIGHTS_PATH, 'ffn'))

    # Each fold contributes an equal share to the averaged prediction.
    predict += reconstructed_model.predict(test_features) / num_folds

epoch: 0
roc_auc: 0.976
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
epoch: 10
roc_auc: 0.9813
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
epoch: 20
roc_auc: 0.9811

Best roc_auc score: 0.9816
Early Sropping triggered.
epoch: 0
roc_auc: 0.9743
INFO:tensorflow:Assets written to: ./model_weights/ffn/assets
INFO:tensorflow:Assets written to: ./mo

In [169]:
# Fill the submission template with the fold-averaged predictions.
sample_submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
sample_submission[list_classes] = predict
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(SOLUTION_PATH, exist_ok=True)
sample_submission.to_csv(os.path.join(SOLUTION_PATH, 'distilbert_125.csv'), index=False)

In [164]:
# Weighted blend: 90% of the earlier blend, 10% of the DistilBERT predictions.
# NOTE(review): rows are combined by index — presumably both frames list the
# test ids in the same order; verify before submitting.
blend = pd.read_csv(os.path.join(SOLUTION_PATH, 'old_blend.csv'))
blend[list_classes] = 0.9 * blend[list_classes] + 0.1 * sample_submission[list_classes]
blend.to_csv(os.path.join(SOLUTION_PATH, 'blend_with_bert.csv'), index=False)