# Argument Mining - Optuna

Hyperparams optimization with Optuna for AM task 1.

In [1]:
!python --version

Python 3.10.12


## Create a Bert-based pre-training model

In [2]:
!pip install optuna



In [3]:
import tensorflow as tf
tf.autograph.set_verbosity(0)
tf.random.set_seed(2)

In [4]:
# !pip install tf_keras
# %env TF_USE_LEGACY_KERAS=1

In [5]:
!pip install --upgrade transformers
import transformers



In [6]:
import keras
print(tf.__version__)
print(keras.__version__)
print(transformers.__version__)

2.15.0
2.15.0
4.39.0


In [7]:
from transformers import TFBertForSequenceClassification, BertTokenizer

In [8]:
hf_model_name = "dccuchile/bert-base-spanish-wwm-cased"
model = TFBertForSequenceClassification.from_pretrained(hf_model_name, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(hf_model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

## Dataset

In [10]:
# Mount Drive files
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd

def load_sst_data(path: str):
  data = pd.read_csv(path)
  return data

drive_path = "drive/My Drive/Datasets/dm-2019/"
dataset_home = drive_path + "data/dm-2019-annotated.csv"
dataset = load_sst_data(dataset_home)
dataset

Unnamed: 0,sent_id,sent_text,sent_label1,sent_label2,sent_label3
0,109-0-0,Limpieza de graffitis y remodelación de aluche,YES,CLAIM,NONE
1,109-0-0,Los vecinos de Aluche vemos día tras día como ...,YES,CLAIM,NONE
2,109-0-0,Los vecinos de Aluche vemos día tras día como ...,YES,CLAIM,NONE
3,109-0-1,"Graffitis y basura por todas partes, aceras ro...",YES,CLAIM,NONE
4,109-17276-0,No a los ruidos.,YES,CLAIM,NONE
...,...,...,...,...,...
3249,89-78258-1,Os recuerdo que España es el país de la OCDE d...,YES,PREMISE,EXPLANATION
3250,89-78258-2,La desigualdad en España avanza a un ritmo que...,YES,PREMISE,EXPLANATION
3251,89-78258-3,La pobreza y la exclusión en España han aument...,NO,SPAM,NONE
3252,89-78258-4,"Por otra lado, estamos a la espera de que nos ...",NO,SPAM,NONE


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def split_data(df, target_column, test_size=0.2, validation_size=0.1, random_state=None):

  # Split the data into a temporary train set and test/validation set
  train, temp = train_test_split(df, test_size=(test_size + validation_size), stratify=df[target_column], random_state=random_state)

  # Split the temp set into validation and test sets
  test, validation = train_test_split(temp, test_size=validation_size / (test_size + validation_size), stratify=temp[target_column], random_state=random_state)

  return train, validation, test

In [13]:
label_column = "sent_label2"
t2_label_dict = {"SPAM": 0, "CLAIM": 1, "PREMISE": 2}
t2_num_labels = len(t2_label_dict)

t2_data = dataset[["sent_text", label_column]]
t2_data = t2_data.replace({label_column: t2_label_dict})

training_set, validation_set, test_set = split_data(t2_data, target_column=label_column, test_size=0.1, validation_size=0.1, random_state=42)

In [14]:
# Shuffle dataset
training_set = shuffle(training_set)
validation_set = shuffle(validation_set)
test_set = shuffle(test_set)

# Obtain text and label vectors
train_texts = training_set["sent_text"]
train_labels = training_set[label_column]

validation_texts = validation_set["sent_text"]
validation_labels = validation_set[label_column]

test_texts = test_set["sent_text"]
test_labels = test_set[label_column]

print("Training size: {}".format(len(training_set)))
print("Validation size: {}".format(len(validation_set)))
print("Test size: {}".format(len(test_set)))

Training size: 2603
Validation size: 326
Test size: 325


In [15]:
from transformers import InputFeatures

def convert_examples_to_features(texts, labels):
  labels = list(labels)
  batch_encoding = tokenizer.batch_encode_plus(texts, max_length=128, padding='longest')

  features = []
  for i in range(len(texts)):
    inputs = {k: batch_encoding[k][i] for k in batch_encoding}
    feature = InputFeatures(**inputs, label=labels[i])
    features.append(feature)

  return features

def convert_features_to_tf_dataset(features):
  def gen():
    for ex in features:
      yield ({
          "input_ids": ex.input_ids,
          "attention_mask": ex.attention_mask,
          "token_type_ids": ex.token_type_ids
          }, ex.label)

  output_types = ({
      "input_ids": tf.int64,
      "attention_mask": tf.int64,
      "token_type_ids": tf.int64
      }, tf.int64)

  output_shapes = ({
      "input_ids": tf.TensorShape([None]),
      "attention_mask": tf.TensorShape([None]),
      "token_type_ids": tf.TensorShape([None])
      }, tf.TensorShape([]))

  dataset = tf.data.Dataset.from_generator(gen, output_types, output_shapes)
  return dataset

In [16]:
batch_size = 32

In [17]:
train_features = convert_examples_to_features(train_texts, train_labels)
train_dataset = convert_features_to_tf_dataset(train_features)

validation_features = convert_examples_to_features(validation_texts, validation_labels)
validation_dataset = convert_features_to_tf_dataset(validation_features)

train_dataset = train_dataset.shuffle(100).batch(batch_size)
validation_dataset = validation_dataset.batch(batch_size)

## Introduce Optuna

In [18]:
import optuna

In [19]:
def objective(trial):
  model = TFBertForSequenceClassification.from_pretrained(hf_model_name, num_labels=t2_num_labels)

  # Adjustable hyperparameters
  hp_learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
  hp_epsilon = trial.suggest_float("epsilon", 1e-9, 1e-5, log=True)
  hp_epochs = trial.suggest_int("epochs", 2, 5, step=1)
  optimizer = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate, epsilon=hp_epsilon, clipnorm=1.0)

  # Fixed hyperparameters
  loss = tf.keras.losses.SparseCategoricalCrossentropy()
  metric1 = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

  model.compile(optimizer=optimizer, loss=loss, metrics=[metric1])

  # Train and evaluate using tf.keras.Model.fit()
  history = model.fit(train_dataset, validation_data=validation_dataset, epochs=hp_epochs)

  return history.history['val_accuracy'][-1]

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

[I 2024-03-21 20:51:45,047] A new study created in memory with name: no-name-e7c6428e-95c7-4aca-85aa-cd0225e774a9
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


Cause: for/else statement not yet supported


Epoch 2/2


[I 2024-03-21 20:57:02,925] Trial 0 finished with value: 0.2822085916996002 and parameters: {'learning_rate': 0.0004379676827514459, 'epsilon': 2.7454366117869085e-08, 'epochs': 2}. Best is trial 0 with value: 0.2822085916996002.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[I 2024-03-21 21:07:29,027] Trial 1 finished with value: 0.16564416885375977 and parameters: {'learning_rate': 0.0006254538230292556, 'epsilon': 1.7500443967191364e-09, 'epochs': 5}. Best is trial 1 with value: 0.16564416885375977.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


[I 2024-03-21 21:15:18,773] Trial 2 finished with value: 0.2822085916996002 and parameters: {'learning_rate': 1.2621499617343738e-05, 'epsilon': 6.413422956994363e-09, 'epochs': 3}. Best is trial 1 with value: 0.16564416885375977.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[I 2024-03-21 21:25:05,001] Trial 3 finished with value: 0.30674847960472107 and parameters: {'learning_rate': 6.5309934023677404e-06, 'epsilon': 1.4908228994088039e-06, 'epochs': 5}. Best is trial 1 with value: 0.16564416885375977.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


[I 2024-03-21 21:29:52,348] Trial 4 finished with value: 0.48159509897232056 and parameters: {'learning_rate': 3.761019225993355e-05, 'epsilon': 1.440772919214087e-07, 'epochs': 2}. Best is trial 1 with value: 0.16564416885375977.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value:", trial.value)
print("  Best params:", study.best_params)
print("  Params:")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))