<a href="https://colab.research.google.com/github/bonschorno/termpapers/blob/master/thesis_code/scripts/ml/first_classification_task_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
pip install transformers datasets



In [25]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/spark_data/text_sent_preprocessed.csv")

In [26]:
df.columns

Index(['Sentence', 'Text_raw', 'Tokens', 'Article_State',
       'Finished_Annotators', 'Curation', 'Onerva', 'Fride', 'Lynn',
       'Sebastian', 'Alisha', 'Fabian', 'Sentence_Start', 'Sentence_Stop',
       'Whereas', 'Front', 'Text_prep', 'unique_tags', 'all_tags',
       'count_tags', 'tag_time_polduration', 'tag_time_monitoring',
       'tag_time_resources', 'tag_time_compliance', 'tag_time_ineffect',
       'tag_authority_default', 'tag_authority_legislative',
       'tag_authority_established', 'tag_authority_monitoring',
       'tag_addressee_default', 'tag_addressee_resource',
       'tag_addressee_monitored', 'tag_addressee_sector',
       'tag_objective_quanttarget', 'tag_objective_qualintention',
       'tag_objective_quanttarget_noccm', 'tag_objective_qualintention_noccm',
       'tag_resource_monspending', 'tag_resource_monrevenues',
       'tag_resource_other', 'tag_form_sanctioning', 'tag_form_monitoring',
       'tag_ref_otherpolicy', 'tag_ref_policyamended',
       't

In [27]:
# dropping whereas and front sections
df = df[(df["Whereas"] == -1) & (df["Front"] == -1)]

# dropping empty strings
df = df[df['Text_prep'].notnull()]

## preparing datasets

In [44]:
from datasets import Features, Value, ClassLabel, Dataset
from sklearn.model_selection import train_test_split

target_var = "tag_form_monitoring"
df.rename(columns = {target_var:'label', "Text_prep": "text"}, inplace = True)
target_var_values = df["label"].values

train_df, valid_df = train_test_split(df, train_size=0.8, stratify=target_var_values, random_state=42)

class_names = ['Absent', 'Present']

ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

train_dataset = Dataset.from_pandas(train_df, features=ft)
valid_dataset = Dataset.from_pandas(valid_df, features=ft)

In [45]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 2105
})

In [46]:
# Construct class weights
import numpy as np
from sklearn.utils import class_weight

class_weights_raw = class_weight.compute_class_weight("balanced", classes=np.unique(target_var_values), y = target_var_values)
#class_weights = dict(enumerate(class_weights))
#class_weights
class_weights_list = class_weights_raw

In [47]:
import torch

class_weights = torch.from_numpy(class_weights_list).float().to("cuda")
class_weights

tensor([0.7235, 1.6187], device='cuda:0')

## model

In [61]:
from transformers import AutoTokenizer

# Standard BERT: bert-base-uncased
# DistilBERT: distilbert-base-uncased
# RoBERTa: roberta-base
# ClimateBERT: climatebert/distilroberta-base-climate-f
# Mini LM

checkpoint = "microsoft/MiniLM-L12-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at /root/.c

In [62]:
def tokenize_function(x): return tokenizer(x["text"], truncation=True, max_length=512)

train_dataset_tok = train_dataset.map(tokenize_function, batched=True)
valid_dataset_tok = valid_dataset.map(tokenize_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [63]:
train_dataset_tok = train_dataset_tok.rename_column("label", "labels")
valid_dataset_tok = valid_dataset_tok.rename_column("label", "labels")

In [64]:
train_dataset_tok

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2105
})

## customize the trainer

In [65]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (with two labels)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

## choose the appropriate model

In [66]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from c

## specify training arguments

## set up data collator



In [67]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## customize metrics

In [68]:
from sklearn.metrics import f1_score

def new_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds)
  return {"F1-Score": f1}

In [69]:
#from datasets import load_metric, list_metrics


#def compute_metrics(eval_pred):
#    accuracy_score = load_metric("accuracy")
#    f1_score = load_metric("f1")
#    precision_score = load_metric("precision")
#    recall_score = load_metric("recall")
#    
#    logits, labels = eval_pred
#    predictions = np.argmax(logits, axis=-1)
#    precision = precision_score.compute(predictions=predictions, references=labels)["precision"]
#    recall = recall_score.compute(predictions=predictions, references=labels)["recall"]
#    accuracy = accuracy_score.compute(predictions=predictions, references=labels)["accuracy"]
#    f1 = accuracy_score.compute(predictions=predictions, references=labels)["f1"]
#    return {"Accuracy": accuracy, "F1": f1, "Precision": precision, "Recall": recall}

## define training arguments

In [70]:
from transformers import TrainingArguments

batch_size = 32
# Log the training loss at each epoch
logging_steps = len(train_dataset_tok) // batch_size
output_dir = "/content/drive/MyDrive/spark_data/"

training_arguments = TrainingArguments(output_dir = output_dir, 
                                       logging_strategy = "epoch",
                                       #logging_steps = logging_steps,
                                       evaluation_strategy = "epoch",
                                       fp16=True, 
                                       report_to="all")

PyTorch: setting up devices


## set up trainer

In [71]:
trainer = CustomTrainer(
    model,
    compute_metrics = new_metrics,
    args = training_arguments,
    train_dataset=train_dataset_tok,
    eval_dataset=valid_dataset_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using amp half precision backend


## model training

In [72]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2105
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 792


Epoch,Training Loss,Validation Loss,F1-score
1,0.5061,0.499668,0.714801
2,0.3458,0.332958,0.829132
3,0.2453,0.337264,0.844828


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 527
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/spark_data/checkpoint-500
Configuration saved in /content/drive/MyDrive/spark_data/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/spark_data/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/spark_data/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/spark_data/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 527
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceCl

TrainOutput(global_step=792, training_loss=0.36573368611961904, metrics={'train_runtime': 456.0548, 'train_samples_per_second': 13.847, 'train_steps_per_second': 1.737, 'total_flos': 118213245862320.0, 'train_loss': 0.36573368611961904, 'epoch': 3.0})