In [9]:
import os
import gc
import numpy as np 
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer,BertConfig,TFBertModel
from tqdm import tqdm

In [10]:
DATA_PATH = '/kaggle/input/tweet-sentiment-extraction/'

# Read the three competition CSVs that live side by side under DATA_PATH.
train_df, test_df, submission_df = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [11]:
class config:
    """Namespace of hyperparameters and file locations shared by the notebook cells."""
    # Fixed sequence length: label vectors and tokenizer outputs are padded to
    # this size, and the model's Input layers are declared with it.
    MAX_LEN = 128
    # Batch size passed to model.fit.
    TRAIN_BATCH_SIZE = 64
    # NOTE(review): VALID_BATCH_SIZE and TEST_BATCH_SIZE are not referenced by
    # any cell visible in this notebook section.
    VALID_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 32
    # Number of epochs passed to model.fit.
    EPOCHS = 5
    # NOTE(review): not referenced by the visible cells; create_model builds a
    # default BertConfig() instead of reading this file.
    BERT_CONFIG = '/kaggle/input/bert-base-uncased-config.json'
    # Directory holding the BERT vocabulary and TF weight files. It already
    # ends with '/', so the '/' added when joining below yields a '//' in paths.
    BERT_PATH = '/kaggle/input/bert-base-uncased-huggingface-transformer/'
    # WordPiece tokenizer, constructed once when this class body executes.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        f'{BERT_PATH}/bert-base-uncased-vocab.txt',
        lowercase=True
    )
    # If this file exists, the training cell loads weights from it instead of
    # fitting. Note the training cell saves to a relative 'finetuned_bert.h5',
    # not to this path.
    SAVEMODEL_PATH = '/kaggle/input/tftweet/finetuned_bert.h5'
    # NOTE(review): not referenced by the visible cells; presumably the cutoff
    # applied to the per-token sigmoid outputs -- confirm against later cells.
    THRESHOLD = 0.4


In [15]:
def process_data(tweet, selected_text, tokenizer):
    """Build a per-token 0/1 label vector marking where ``selected_text`` sits in ``tweet``.

    The first occurrence of ``selected_text`` inside ``tweet`` defines a
    character span. Every token whose character offsets overlap that span is
    labelled 1; every other token is labelled 0.

    Args:
        tweet: Text to tokenize.
        selected_text: Substring to look for in ``tweet``.
        tokenizer: Object whose ``encode(tweet)`` result exposes ``ids`` and
            ``offsets`` (a sequence of ``(start, end)`` character pairs, one
            per id).

    Returns:
        A list of ints with one entry per element of ``ids``. The list is all
        zeros when ``selected_text`` is empty or does not occur in ``tweet``.
    """
    encoding = tokenizer.encode(tweet)
    labels = [0] * len(encoding.ids)

    # str.find returns the first match, like the manual scan it replaces. An
    # empty needle would "match" at index 0 with a zero-width span, so it is
    # treated as "not found" instead of indexing into an empty string.
    span_start = tweet.find(selected_text) if selected_text else -1
    if span_start < 0:
        return labels
    span_end = span_start + len(selected_text)

    for position, (tok_start, tok_end) in enumerate(encoding.offsets):
        # Two half-open ranges overlap iff the later start precedes the
        # earlier end; zero-width offsets therefore never get labelled.
        if max(tok_start, span_start) < min(tok_end, span_end):
            labels[position] = 1

    return labels

In [17]:
def _label_row(row):
    """Return the token-level labels for one row, coercing both text fields to str."""
    return process_data(str(row['text']), str(row['selected_text']), config.TOKENIZER)


# One label vector per training row, computed row-wise.
train_df['targets'] = train_df.apply(_label_row, axis=1)

In [37]:
# Bring every label vector to exactly config.MAX_LEN entries. Padding alone is
# not enough: for a vector longer than MAX_LEN the pad count is negative,
# `[0] * negative` is an empty list, and the over-long vector would later make
# the stacked label array ragged. The trailing slice clips those cases.
train_df['targets'] = train_df['targets'].apply(
    lambda x: (x + [0] * (config.MAX_LEN - len(x)))[:config.MAX_LEN]
)

In [38]:
def _convert_to_transformer_inputs(text, tokenizer, max_sequence_length):
    inputs = tokenizer.encode(text)
    input_ids =  inputs.ids
    input_masks = inputs.attention_mask
    input_segments = inputs.type_ids
    padding_length = max_sequence_length - len(input_ids)
    padding_id = 0
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)
    return [input_ids, input_masks, input_segments]



In [39]:

def compute_input_arrays(df, tokenizer, max_sequence_length):
    """Encode the ``text`` field of every row of ``df`` into three int32 arrays.

    Each row's text is coerced to ``str`` and passed through
    ``_convert_to_transformer_inputs``; the per-row results are then regrouped
    by kind.

    Returns:
        ``[input_ids, input_masks, input_segments]`` as ``np.int32`` arrays,
        each holding one entry per row of ``df``.
    """
    encoded_rows = [
        _convert_to_transformer_inputs(str(row.text), tokenizer, max_sequence_length)
        for _, row in tqdm(df.iterrows())
    ]
    # Regroup from "one triple per row" into "one list per feature kind".
    per_feature = [[triple[slot] for triple in encoded_rows] for slot in range(3)]
    return [np.asarray(feature, dtype=np.int32) for feature in per_feature]


In [40]:

def compute_output_arrays(df, columns):
    """Return the contents of ``df[columns]`` as a NumPy array.

    The selection is first turned into nested Python lists so that a column
    whose cells are themselves lists is stacked into a multi-dimensional array.
    """
    selected = df[columns]
    nested_lists = selected.values.tolist()
    return np.asarray(nested_lists)

In [41]:
# Labels for the training rows.
outputs = compute_output_arrays(df=train_df, columns='targets')

# Model inputs for the training rows first, then for the test rows.
inputs, test_inputs = (
    compute_input_arrays(frame, config.TOKENIZER, config.MAX_LEN)
    for frame in (train_df, test_df)
)

27481it [00:08, 3225.24it/s]
3534it [00:00, 3677.32it/s]


In [42]:
def create_model():
    """Assemble the BERT-based per-token classifier.

    Three int32 inputs of length ``config.MAX_LEN`` (token ids, attention
    mask, token-type ids, in that order) are fed to a pretrained
    ``TFBertModel``. The first element of its output goes through dropout, a
    single-filter kernel-size-1 ``Conv1D``, a ``Flatten`` and a sigmoid.

    Returns:
        An uncompiled ``tf.keras`` ``Model`` with inputs
        ``[ids, mask, token-type ids]``.
    """
    sequence_shape = (config.MAX_LEN, )
    input_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    # Third input is consumed as token_type_ids by the BERT call below.
    token_type_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)

    default_bert_config = BertConfig()
    weights_file = config.BERT_PATH + '/bert-base-uncased-tf_model.h5'
    bert_model = TFBertModel.from_pretrained(weights_file, config=default_bert_config)

    bert_outputs = bert_model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

    # Head: one sigmoid score per sequence position.
    head = tf.keras.layers.Dropout(0.1)(bert_outputs[0])
    head = tf.keras.layers.Conv1D(1, 1)(head)
    head = tf.keras.layers.Flatten()(head)
    head = tf.keras.layers.Activation('sigmoid')(head)

    return tf.keras.models.Model(
        inputs=[input_ids, attention_mask, token_type_ids],
        outputs=head,
    )

In [43]:
# Adam with a small learning rate; binary cross-entropy over the sigmoid outputs.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model = create_model()
model.compile(optimizer=optimizer, loss='binary_crossentropy')

Some layers from the model checkpoint at /kaggle/input/bert-base-uncased-huggingface-transformer//bert-base-uncased-tf_model.h5 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at /kaggle/input/bert-base-uncased-huggingface-transformer//bert-base-uncased-tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Reuse previously fine-tuned weights when they are available at
# config.SAVEMODEL_PATH; otherwise train and write the weights to the current
# working directory.
if os.path.exists(config.SAVEMODEL_PATH):
    model.load_weights(config.SAVEMODEL_PATH)
else:
    model.fit(
        x=inputs,
        y=outputs,
        batch_size=config.TRAIN_BATCH_SIZE,
        epochs=config.EPOCHS,
    )
    model.save_weights('finetuned_bert.h5')

Epoch 1/5
Epoch 2/5
 48/430 [==>...........................] - ETA: 5:32 - loss: 0.0758

In [None]:
# Score the model only when labels for the test rows exist. The cells above
# create a 'targets' column for train_df only, so an unconditional lookup on
# test_df raises KeyError. (The original call was also misspelled `evalute`,
# which raises AttributeError.)
if 'targets' in test_df.columns:
    test_outputs = compute_output_arrays(test_df, 'targets')
    model.evaluate(test_inputs, test_outputs)
else:
    # Nothing to compare against: produce the per-token scores instead.
    test_preds = model.predict(test_inputs)