In [1]:
import os
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import numpy as np
from transformers import TFAutoModel
from sklearn import metrics
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate, MaxPool1D
from tensorflow.keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D,MaxPooling1D, Flatten
from tensorflow.keras.layers import LSTM, GRU, Dropout
from tensorflow.keras import backend as K
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Lambda
import warnings
from transformers import BertTokenizer
from transformers import TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')
import gc
np.random.seed(21)

In [2]:
# Run configuration: batch size, maximum token sequence length and the
# pretrained checkpoint used for both the tokenizer and the encoder.
BATCH_SIZE = 32
MAX_LEN = 512
bert_model_name = 'dbmdz/convbert-base-german-europeana-cased'

# NOTE(review): do_lower_case=True is requested although the checkpoint name
# says "cased" -- confirm this is intentional.
tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name,
    do_lower_case=True,
)

In [3]:
# Load the training data and enrich it with 600 sampled toxic rows taken from
# the augmented set, then drop rows whose text is duplicated.
train = pd.read_csv("../data/train_ft.csv")
train_aug = pd.read_csv("../data/itrain_aug.csv")  # Augmentation ...
toxic_aug = train_aug[train_aug["Sub1_Toxic"] == 1].sample(600)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = (
    pd.concat([train, toxic_aug], ignore_index=True)
    .drop_duplicates('text')
    .reset_index(drop=True)
)
test = pd.read_csv("../data/test_ft.csv")

# Hand-crafted feature columns that are fed to the model next to the text.
FEATURES=['readability','!','?',"caps_vs_length","Partizip II","Präteritum_ich","punc","error",'Präsens_ich',"present","future",'words_vs_unique',"pos","neg","num_urls","mod","emoji","certainity","uncertainity","num_words"]
features = train[FEATURES].fillna(0)
test_features = test[FEATURES].fillna(0)

# The scaler is fitted on train and test features stacked together, then
# applied to each set separately.
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

In [4]:
# Function for cutting off the middle part of long texts.
def text_process(text, head_words=500, tail_words=12):
    """Shorten a long text by dropping its middle words.

    The text is split on single spaces. When it has more than
    ``head_words + tail_words`` words, only the first ``head_words`` and the
    last ``tail_words`` words are kept, joined by single spaces. Shorter texts
    are returned unchanged.

    Args:
        text: The input string.
        head_words: Number of leading words to keep.
        tail_words: Number of trailing words to keep.

    Returns:
        The original string, or the shortened head + tail string.
    """
    ws = text.split(' ')
    # Only cut when head and tail do not overlap; otherwise words from the
    # overlap would be emitted twice and the text would grow instead of shrink.
    if len(ws) > head_words + tail_words:
        # Slice from an explicit start index so tail_words=0 yields no tail
        # (ws[-0:] would be the whole list).
        kept = ws[:head_words] + ws[len(ws) - tail_words:]
        text = ' '.join(kept)
    return text

In [5]:
# Targets: one binary column per subtask.
y_train = train[["Sub1_Toxic","Sub2_Engaging","Sub3_FactClaiming"]].values
# Fill missing texts *before* converting to str: str(NaN) is the string "nan",
# so a fillna placed after the conversion can never replace anything.
X_train = train['text'].fillna("etwas").apply(lambda x: text_process(str(x))).values.tolist()
X_test = test['text'].fillna("etwas").apply(lambda x: text_process(str(x))).values.tolist()

In [6]:
def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode every sentence into a list of token ids.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing ``encode(text, truncation=...,
            add_special_tokens=..., max_length=...)``.
        max_seq_len: Maximum number of ids per sentence; longer sentences are
            truncated by the tokenizer.

    Returns:
        A NumPy array of the encoded sentences. When all encodings have the
        same length this is a regular 2-D array; otherwise it is a 1-D object
        array whose elements are the per-sentence id lists.
    """
    encoded = [
        tokenizer.encode(
            sentence,
            truncation=True,
            add_special_tokens=True,
            max_length=max_seq_len,
        )
        for sentence in tqdm(sentences)
    ]

    # Equal-length (or empty) input converts cleanly to a regular array.
    if len({len(ids) for ids in encoded}) <= 1:
        return np.array(encoded)

    # Ragged input: np.array(list_of_ragged_lists) raises ValueError on
    # NumPy >= 1.24, so fill an object array element by element instead.
    ragged = np.empty(len(encoded), dtype=object)
    for position, ids in enumerate(encoded):
        ragged[position] = ids
    return ragged

def create_attention_masks(tokenized_and_padded_sentences):
    """Build attention masks for padded token-id sequences.

    Every position holding a token id greater than zero is marked with 1,
    every other position with 0.

    Args:
        tokenized_and_padded_sentences: Iterable of token-id sequences.

    Returns:
        NumPy array of 0/1 masks, one row per input sequence.
    """
    masks = [
        [1 if token_id > 0 else 0 for token_id in row]
        for row in tokenized_and_padded_sentences
    ]
    return np.asarray(masks)
def regular_encode(texts,tokenizer,maxlen=MAX_LEN):
    """Tokenize texts, pad them to a fixed length and build attention masks.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer forwarded to ``tokenize_sentences``.
        maxlen: Length every sequence is truncated / padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``; ``input_ids`` is the padded id
        matrix and ``attention_masks`` is produced from it by
        ``create_attention_masks``.
    """
    # Honour the caller's maxlen (previously the global MAX_LEN was always used).
    input_ids = tokenize_sentences(texts, tokenizer, maxlen)
    # Pad with 0 at the end; create_attention_masks treats ids > 0 as real tokens.
    input_ids = pad_sequences(input_ids, maxlen=maxlen, dtype="long", value=0, truncating="post", padding="post")
    attention_masks = create_attention_masks(input_ids)
    return input_ids,attention_masks

In [7]:
# Turn the raw test and train texts into padded token ids plus attention masks.
x_test, x_test_att = regular_encode(texts=X_test, tokenizer=tokenizer, maxlen=MAX_LEN)
x_train, x_train_att = regular_encode(texts=X_train, tokenizer=tokenizer, maxlen=MAX_LEN)

  0%|          | 0/944 [00:00<?, ?it/s]

  0%|          | 0/3316 [00:00<?, ?it/s]

In [8]:
class RocAucEvaluation(Callback):
    """Keras callback that tracks validation ROC-AUC, checkpoints and early-stops.

    Every ``interval`` epochs the model predicts on the validation inputs and
    the ROC-AUC against the validation targets is computed. When the score
    beats the best one seen so far, the model weights are saved to
    ``checkpoint_path``. After more than three consecutive evaluations without
    improvement, training is stopped.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring.
        interval: Evaluate on epochs where ``epoch % interval == 0``.
        checkpoint_path: File the best weights are written to.
    """

    def __init__(self, validation_data=(), interval=1,
                 checkpoint_path="../checkpoints/convcnn.h5"):
        # super() must start from this class; super(Callback, self) skipped
        # Callback.__init__ entirely.
        super().__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.checkpoint_path = checkpoint_path
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data and update the best state."""
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=1)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        if score > self.max_score:
            print("*** New High Score (previous: %.6f) \n" % self.max_score)
            # Save the model this callback is attached to, not whatever a
            # global named `model` happens to reference.
            self.model.save_weights(self.checkpoint_path)
            self.max_score = score
            self.not_better_count = 0
            return

        self.not_better_count += 1
        if self.not_better_count > 3:
            print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
            self.model.stop_training = True

In [9]:
def get_model(bert_model, features ,clipvalue=1.,num_filters=40,dropout=0.5,max_len=512):
    """Build and compile the transformer + CNN classifier.

    The last four hidden states of ``bert_model`` are concatenated on the
    feature axis, passed through four parallel Conv1D/MaxPooling1D branches
    (kernel sizes 2-5), merged, flattened and reduced by a dense layer. The
    result is concatenated with the hand-crafted feature vector and mapped to
    three sigmoid outputs.

    Args:
        bert_model: Transformer called as ``bert_model(input_ids,
            attention_mask=...)`` whose output exposes ``hidden_states``.
        features: 2-D array; only ``features.shape[1]`` is read, to size the
            feature input.
        clipvalue: Gradient clip value passed to the Adam optimizer.
        num_filters: Accepted for interface compatibility but not used; the
            convolutions use a fixed 128 filters.
        dropout: Rate of both Dropout layers.
        max_len: Length of the token-id and attention-mask inputs.

    Returns:
        The compiled Keras ``Model`` taking
        ``[input_ids, attention_masks, features]``.
    """
    import tensorflow as tf

    features_input = Input(shape=(features.shape[1],))
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    attention_masks = Input(shape=(max_len,), dtype=tf.int32, name="input_att_masks")

    bert_output = bert_model(input_ids, attention_mask=attention_masks)
    hidden_states = bert_output.hidden_states
    last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
    x = tf.concat(last_four_layers, -1)

    # One convolution + pooling branch per kernel size.
    convs = []
    for filter_size in (2, 3, 4, 5):
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(x)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)

    x = Dropout(dropout)(l_merge)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(dropout)(x)

    conc = concatenate([x, features_input])
    outp = Dense(3, activation="sigmoid")(conc)
    model = Model(inputs=[input_ids, attention_masks, features_input], outputs=outp)

    # Freeze the first three layers *before* compiling: Keras documents that
    # changes to `trainable` made after compile() need a re-compile to be
    # taken into account.
    for layer in model.layers[:3]:
        layer.trainable = False

    adam = tf.optimizers.Adam(clipvalue=clipvalue)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [10]:
# Load the pretrained encoder with hidden states exposed, then mark every
# layer and each of its weights as non-trainable.
transformer_layer = TFAutoModel.from_pretrained(
    bert_model_name,
    output_hidden_states=True,
)
for layer in transformer_layer.layers:
    layer.trainable = False
    for w in layer.weights:
        w._trainable = False
transformer_layer.compile()

# Build the full classifier once to inspect its architecture.
model = get_model(transformer_layer, features)
model.summary()

Some layers from the model checkpoint at dbmdz/convbert-base-german-europeana-cased were not used when initializing TFConvBertModel: ['generator_lm_head', 'generator_predictions']
- This IS expected if you are initializing TFConvBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFConvBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFConvBertModel were initialized from the model checkpoint at dbmdz/convbert-base-german-europeana-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFConvBertModel for predictions without further training.


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_att_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_conv_bert_model (TFConvBertM TFBaseModelOutput(la 106815624   input_word_ids[0][0]             
                                                                 input_att_masks[0][0]            
__________________________________________________________________________________________________
tf_op_layer_concat (TensorFlowO [(None, 512, 3072)]  0           tf_conv_bert_model[0][

In [None]:
# K-fold training: one model per fold, test predictions averaged over folds.
# The frozen `transformer_layer` created in the previous cell is reused here;
# this cell used to repeat that cell verbatim (a second checkpoint load plus a
# throwaway model build) before training.
epochs = 100
num_folds = 10
gc.collect()
K.clear_session()

predict = np.zeros((test.shape[0], 3))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)

# Token ids feed int32 Input layers, so keep them integral rather than
# casting them to float32.
x_train = np.asarray(x_train).astype(np.int32)
x_test = np.asarray(x_test).astype(np.int32)

for i, (train_index, valid_index) in enumerate(kf.split(x_train), start=1):
    print(f"fold: {i}")

    kfold_y_train, kfold_y_valid = y_train[train_index], y_train[valid_index]
    train_inputs = [x_train[train_index], x_train_att[train_index], features[train_index]]
    valid_inputs = [x_train[valid_index], x_train_att[valid_index], features[valid_index]]

    gc.collect()
    K.clear_session()
    # A fresh classifier head per fold on top of the shared transformer.
    model = get_model(transformer_layer, features)
    ra_val = RocAucEvaluation(validation_data=(valid_inputs, kfold_y_valid), interval=1)

    model.fit(train_inputs, kfold_y_train, batch_size=BATCH_SIZE, epochs=epochs, verbose=1,
              callbacks=[ra_val])
    gc.collect()
    # Restore the best weights saved by the callback during this fold.
    model.load_weights("../checkpoints/convcnn.h5")

    # Each fold contributes an equal share to the averaged test prediction.
    predict += model.predict([x_test, x_test_att, test_features], batch_size=BATCH_SIZE, verbose=1) / num_folds
print("Done")

In [None]:
# Write two submission files: raw averaged probabilities ("-p") and the
# same predictions rounded to 0/1 integers.
label_cols = ["1", "2", "3"]

test[label_cols] = predict
test[label_cols].to_csv("../submissions/convcnn-p.csv")

test[label_cols] = predict.round(0).astype(int)
test[label_cols].to_csv("../submissions/convcnn.csv")

In [2]:
# Delete the weights checkpoint file that the training callback writes to.
! rm -rf ../checkpoints/convcnn.h5