<a href="https://colab.research.google.com/github/aaposyvanen/emodim/blob/master/sentence-analysis/fine_tune_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load FinBERT from Hugging Face

In [None]:
!pip install transformers

from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import numpy as np
# Model id on the Hugging Face hub and the local download cache directory,
# shared by the tokenizer and the model so the two cannot drift apart.
PRETRAINED_NAME = "TurkuNLP/bert-base-finnish-cased-v1"
PRETRAINED_CACHE = "finBERT/"

# Report the GPU device name TensorFlow detects.
device_name = tf.test.gpu_device_name()
print(f"Device name: {device_name}")

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME, cache_dir=PRETRAINED_CACHE)
model = TFAutoModel.from_pretrained(PRETRAINED_NAME, cache_dir=PRETRAINED_CACHE)
# model.summary()

# Download training data

In [88]:
import requests

# Folder holding the raw training files.
DATA_URL = "https://raw.githubusercontent.com/aaposyvanen/emodim/master/data/tr"
# A file's position in this list is the integer class label of its sentences
# (presumably 0 = negative, 1 = neutral, 2 = positive, judging by the names).
file_names = ['combinedneg.txt', 'combinedneut2.txt', 'combinedpos.txt']
sentences = []
labels = []
for label, file_name in enumerate(file_names):
  response = requests.get(f"{DATA_URL}/{file_name}", timeout=30)
  # Fail loudly on an HTTP error instead of ingesting the error page as data.
  response.raise_for_status()
  # Split once and drop blank lines (e.g. the empty string left after a
  # trailing newline), which would otherwise become labelled empty samples.
  file_sentences = [line for line in response.text.split('\n') if line.strip()]
  sentences.extend(file_sentences)
  labels.extend([label] * len(file_sentences))

# Every sentence must have exactly one label.
assert len(sentences) == len(labels)
# Show the number of samples rather than dumping the whole corpus.
len(sentences)

34140

In [89]:
# Show how the first sentence is tokenized: raw text, word pieces, vocabulary IDs.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  ei mitään muttia.
Tokenized:  ['ei', 'mitään', 'mut', '##tia', '.']
Token IDs:  [193, 642, 1851, 570, 111]


In [91]:
# Encoded length of each sentence, counting the `[CLS]` and `[SEP]` tokens.
encoded_lengths = (
    len(tokenizer.encode(text, add_special_tokens=True)) for text in sentences
)
# Longest encoded sentence; stays 0 when there are no sentences at all.
max_len = max(encoded_lengths, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  87


In [92]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Each list entry is the tensor for one sentence, shaped (1, max_len).
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
      sent,                         # Sentence to encode.
      add_special_tokens = True,    # Add '[CLS]' and '[SEP]'
      max_length = max_len,         # Target length for padding/truncation.
      padding = 'max_length',       # Pad every sentence up to max_length
                                    # (replaces deprecated pad_to_max_length).
      truncation = True,            # Truncate explicitly instead of relying
                                    # on the tokenizer's warned-about default.
      return_attention_mask = True, # Construct attn. masks.
      return_tensors = 'tf'         # Return tensorflow tensors.
      )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  ei mitään muttia.
Token IDs: tf.Tensor(
[[ 102  193  642 1851  570  111  103    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]], shape=(1, 87), dtype=int32)


In [None]:
def create_model(num_labels):
    """Build the recurrent classifier that sits on top of `encoder`.

    The stack is, in order: `encoder`, a 32-dimensional embedding, two
    bidirectional LSTMs (128 units returning sequences, then 64 units),
    dropout, a 64-unit ReLU dense layer, dropout, and a `num_labels`-unit
    dense layer followed by a softmax activation.

    NOTE(review): `encoder` is read from the enclosing scope and is not
    defined in the visible part of this notebook. It must be a Keras layer
    exposing `get_vocabulary()` — confirm where it is created.

    Args:
        num_labels: Number of units in the final dense layer, i.e. the
            number of classes the softmax distributes over.

    Returns:
        The uncompiled `tf.keras.Sequential` model.
    """
    keras_layers = tf.keras.layers
    keras_regularizers = tf.keras.regularizers

    # The embedding table gets one row per entry of the encoder's vocabulary.
    token_embedding = keras_layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        embeddings_regularizer=keras_regularizers.l1_l2(l1=1e-5, l2=1e-4),
        mask_zero=False,
    )
    # The first LSTM keeps the full sequence so the second one can consume it.
    sequence_lstm = keras_layers.Bidirectional(
        keras_layers.LSTM(128, dropout=0.1, return_sequences=True)
    )
    summary_lstm = keras_layers.Bidirectional(keras_layers.LSTM(64, dropout=0.1))
    hidden_dense = keras_layers.Dense(
        64,
        kernel_regularizer=keras_regularizers.l1_l2(l1=1e-5, l2=1e-4),
        activation='relu',
    )

    stack = [
        encoder,
        token_embedding,
        sequence_lstm,
        summary_lstm,
        keras_layers.Dropout(0.10),
        hidden_dense,
        keras_layers.Dropout(0.10),
        keras_layers.Dense(num_labels),
        keras_layers.Activation('softmax'),
    ]
    return tf.keras.Sequential(stack)

In [None]:
# Build the 3-class classifier.
fine_tuning_model = create_model(num_labels=3)
# Early stopping watches validation loss with a patience of one epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)
fine_tuning_model.compile(
    optimizer=tf.keras.optimizers.Adamax(0.005),
    # create_model ends in a softmax layer, so the loss is fed probabilities,
    # not logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
    metrics=['accuracy'])
# Train the classifier that was just compiled. `model` is the raw pretrained
# network loaded earlier, which is not the object compiled above, so fitting
# it here was a bug.
# NOTE(review): `train_dataset` and `test_dataset` are not defined in the
# visible part of this notebook — confirm they are created before this cell.
history = fine_tuning_model.fit(train_dataset, validation_data=test_dataset, callbacks=[early_stop], epochs=30)
fine_tuning_model.summary()