In [1]:
import numpy as np 
import pandas as pd 
import os

In [2]:
!pip install transformers



In [3]:
def extract_data(path_in,path_out,col):
    data_df = pd.read_csv(path_in)
    data_df = data_df[col].fillna(0)
    data_df.to_csv(path_out,index=False)

In [4]:
def split_data(data_df):
    n = len(data_df)
    train_df = data_df[:int(0.9*n)]
    validate_df = data_df[int(0.9*n):]
    return train_df,validate_df

In [5]:
aux_columns = ['severe_toxicity','obscene','identity_attack','insult','threat','sexual_explicit']
identity_columns = [ 'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
                      'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
spe_columns = ['black','white','homosexual_gay_or_lesbian','muslim']          

path_out = "/kaggle/input/jigsawnlp/reduced_train.csv"

if  not os.path.exists(path_out):
    path_in = "/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
    #col = ['comment_text','target']+aux_columns+identity_columns
    col = ['target']+aux_columns+identity_columns
    extract_data(path_in,path_out,col)

data_df = pd.read_csv(path_out)
train_df,validate_df = split_data(data_df)

In [6]:
!cp /kaggle/input/jigsawnlp1/extract.py extract.py
!cp /kaggle/input/jigsawnlp1/metrics.py metrics.py
!cp /kaggle/input/jigsawnlp1/warmup.py warmup.py

In [7]:
def adapt_weight(df):
    #The based weight
    weights = np.ones(len(df)) / 4

    # Subgroup  positive  
    temp_index = (df[identity_columns].values>=0.5).max(axis=1)
    weights[temp_index] += 0.25

    # Background Positive, Subgroup Negative
    temp_index = (df['target'].values>=0.5) * ((df[identity_columns].values<0.5).sum(axis=1) == len(identity_columns))
    weights[temp_index] += 0.25

    # Background Negative, Subgroup Positive
    temp_index = (df['target'].values<0.5) * (df[identity_columns].values>=0.5).max(axis=1)
    weights[temp_index] += 0.25

    # Background Positive, special-Subgroup Negative
    temp_index = (df['target'].values>=0.5) * ((df[spe_columns].values<0.5).sum(axis=1) == len(spe_columns))
    weights[temp_index] += 0.125

    # Background Positive, special-Subgroup Positive  
    temp_index = (df['target'].values<0.5) * (df[spe_columns].values>=0.5).max(axis=1)
    weights[temp_index] += 0.125

    return weights

In [8]:
# weights for different training samples, weights_scale is used for normalization
weights = adapt_weight(train_df)
weights_scale = 1./weights.mean()

In [9]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.losses import  binary_crossentropy

from extract import Extract
from warmup import AdamWarmup
import metrics

2023-01-20 13:27:38.454987: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-01-20 13:27:38.455043: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [10]:
# TPU activation
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

Running on TPU grpc://10.0.0.2:8470


2023-01-20 13:27:40.538379: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-01-20 13:27:40.538747: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2023-01-20 13:27:40.538771: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-20 13:27:40.538800: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f66320cd2926): /proc/driver/nvidia/version does not exist
2023-01-20 13:27:40.539375: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

In [11]:
from transformers import pipeline
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#model = TFBertModel.from_pretrained("bert-base-uncased")

In [12]:
lr = 3e-5
weight_decay = 0.01
nb_epochs=1
bsz = 64
max_len=220

decay_steps = int(1.2*nb_epochs*len(train_df)/bsz)
warmup_steps = int(0.05*decay_steps)

In [13]:
def custom_loss(y_true, y_pred):
    # y_true : shape is n*2, cols = ['target', 'weights']
    return binary_crossentropy(K.reshape(y_true[:,0],(-1,1)),y_pred,from_logits=False) * y_true[:,1]

In [14]:
def createmodel(model, weights_scale, max_len= 220):
    adamwarm = AdamWarmup(decay_steps = decay_steps, warmup_steps = warmup_steps, 
                          learning_rate=lr, weight_decay = weight_decay,weight_decay_pattern=["embeddings","kernel","weight"])

    #define 3 types of input
    input_ids = tf.keras.layers.Input(shape = (max_len,),dtype='int32', name = 'input_ids')
    token_type_ids = tf.keras.layers.Input(shape = (max_len,),dtype='int32', name = 'token_type_ids')
    attention_mask = tf.keras.layers.Input(shape = (max_len,),dtype='int32', name = 'attention_mask')

    #
    last_hidden_states = model(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask= attention_mask)[0]
    cls_output = Extract(0)(last_hidden_states)

    pool_output = tf.keras.layers.Dense(units=1, activation='sigmoid',name='real_output')(cls_output)
    aux_output = tf.keras.layers.Dense(units=6, activation='sigmoid',name='aux_output')(cls_output)


    train_model  = tf.keras.models.Model(inputs=[input_ids,token_type_ids,attention_mask], outputs=[pool_output,aux_output])
    train_model.compile(loss=[custom_loss,'binary_crossentropy'],loss_weights=[weights_scale,6.], optimizer=adamwarm)
    return train_model


In [15]:
with strategy.scope():
    model = TFBertModel.from_pretrained("bert-base-uncased")
    train_model = createmodel(model,weights_scale)
train_model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 220)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 220)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 220)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]         

In [16]:
def get_df_input(train_df,max_len):
    train_input = tokenizer(list(train_df), return_tensors='tf',padding="max_length", truncation=True, max_length=max_len)
    input_ids = train_input["input_ids"]
    attention_mask = train_input["attention_mask"]
    token_type_ids = train_input["token_type_ids"]
    return  input_ids, attention_mask, token_type_ids

In [22]:
token = ["train_input_ids","train_attention_mask","train_token_type_ids",
        "validate_input_ids","validate_attention_mask","validate_token_type_ids"]

for t in token:
    temp_path = "/kaggle/input/jigsaw-input-token/"+t+".npy"
    if  os.path.exists(temp_path):
        globals()[t] = np.load(temp_path)

In [18]:
train_aux_target = np.array(train_df[aux_columns])

# y_true : shape is n*2, cols = ['target', 'weights']
y_true = np.concatenate( [np.array(train_df["target"]).reshape(-1,1), weights.reshape(-1,1)],axis = 1)

In [19]:
train_model.fit([train_input_ids, train_attention_mask, train_token_type_ids ],[y_true,train_aux_target],batch_size = 128)

2023-01-20 13:28:24.183754: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1429459680 exceeds 10% of free system memory.
2023-01-20 13:28:27.832880: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1429459680 exceeds 10% of free system memory.
2023-01-20 13:28:30.724452: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1429459680 exceeds 10% of free system memory.




<tensorflow.python.keras.callbacks.History at 0x7f94178c05d0>

In [23]:
y_predict = train_model.predict([validate_input_ids, validate_attention_mask, validate_token_type_ids ])[0]

In [24]:
y_gt = np.array(validate_df['target'])

In [26]:
validate_df["predict"]=y_predict.flatten()
bias_metrics_df = metrics.compute_bias_metrics_for_model(validate_df, identity_columns,"target","predict")
bias_metrics_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
6,black,1614,0.877252,0.948345,0.954751
2,homosexual_gay_or_lesbian,1185,0.899905,0.935644,0.963379
7,white,2801,0.90114,0.954904,0.958379
5,muslim,1845,0.912248,0.96555,0.95146
4,jewish,545,0.919888,0.967089,0.951181
0,male,4811,0.945273,0.965258,0.967306
1,female,6743,0.949975,0.968282,0.967042
3,christian,3381,0.951369,0.974083,0.960203
8,psychiatric_or_mental_illness,490,0.957611,0.950399,0.980646


In [27]:
metrics.get_final_metric(bias_metrics_df, metrics.calculate_overall_auc(validate_df,"target","predict"))

0.9547544681799787