<a href="https://colab.research.google.com/github/advapplab/esgBERT_hf/blob/main/esgBERT_Finetuning_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!pip install transformers
!pip install huggingface_hub
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 01 Load Dataset

In [48]:

! wget -nc https://raw.githubusercontent.com/advapplab/esgBERT_hf/main/data/carbon_emissions.csv
! wget -nc https://raw.githubusercontent.com/advapplab/esgBERT_hf/main/data/community_relations.csv

File ‘carbon_emissions.csv’ already there; not retrieving.

File ‘community_relations.csv’ already there; not retrieving.



In [49]:
import pandas as pd

ce_pd = pd.read_csv('/content/carbon_emissions.csv', header='infer')
cr_pd = pd.read_csv('/content/community_relations.csv', header='infer')

header = ['Label', 'Source', 'Sentence']

ce_pd = ce_pd[header].dropna()
cr_pd = cr_pd[header].dropna()

ce_pd['label'] = 1
cr_pd['label'] = 0

training_pd = ce_pd.append(cr_pd)
training_pd = training_pd.reset_index()

In [50]:
training_pd

Unnamed: 0,index,Label,Source,Sentence,label
0,0,Greenhouse Gas,HOYA,The HOYA Group is globally conducting its envi...,1
1,1,Greenhouse Gas,HOYA,The HOYA Group also sets the following Group-w...,1
2,2,Greenhouse Gas,HOYA,"That is, the HOYA Group aims to achieve a 5-pe...",1
3,3,Greenhouse Gas,HOYA,The CO2 emissions of the HOYA Group in the fis...,1
4,4,Greenhouse Gas,HOYA,The reduction of CO2 emissions was 19.3 percen...,1
...,...,...,...,...,...
2634,1371,Community Relations,2021-DOW-ESG,"Outside of the United States, similar committe...",0
2635,1372,Community Relations,2021-DOW-ESG,Dow’s Government Affairs organization also dev...,0
2636,1373,Community Relations,2021-DOW-ESG,"Additionally, we proactively engage with key p...",0
2637,1374,Social and Relationship Capital,2021-Intel-10K,We are committed to engaging in corporate resp...,0


In [51]:
from datasets import Dataset, DatasetDict


esg_dict = {'train' : Dataset.from_pandas(training_pd),
              'eval' : Dataset.from_pandas(training_pd)}

esg_ds = DatasetDict(esg_dict)

# 02 Tokenizing

In [52]:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_auth_token=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file tokenize

In [53]:

def tokenize_function(examples):
    return tokenizer(examples["Sentence"], 
                     padding="max_length", 
                     truncation=True, 
                     max_length=102)

tokenized_datasets = esg_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/2639 [00:00<?, ? examples/s]

Map:   0%|          | 0/2639 [00:00<?, ? examples/s]

In [54]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['index', 'Label', 'Source', 'Sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2639
    })
    eval: Dataset({
        features: ['index', 'Label', 'Source', 'Sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2639
    })
})

In [55]:
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["eval"]

# 03 Finetuning

In [56]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/pytorch_model.b

In [57]:

from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [58]:

from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=full_train_dataset, eval_dataset=full_eval_dataset
)

In [59]:

from transformers import TrainingArguments

training_args = TrainingArguments(num_train_epochs=2,
                                  output_dir="output")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [60]:
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [61]:

from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=full_train_dataset, 
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)
     

In [62]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Source, Label, index, Sentence. If Source, Label, index, Sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2639
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 660
  Number of trainable parameters = 108311810


Step,Training Loss
500,0.2028


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=660, training_loss=0.17008413257020893, metrics={'train_runtime': 126.8338, 'train_samples_per_second': 41.613, 'train_steps_per_second': 5.204, 'total_flos': 276655108045680.0, 'train_loss': 0.17008413257020893, 'epoch': 2.0})

In [63]:
import numpy as np

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Source, Label, index, Sentence. If Source, Label, index, Sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2639
  Batch size = 8


{'eval_loss': 0.028215287253260612,
 'eval_accuracy': 0.993179234558545,
 'eval_runtime': 17.8004,
 'eval_samples_per_second': 148.255,
 'eval_steps_per_second': 18.539,
 'epoch': 2.0}

# 04 Push to HF

In [None]:

!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 

In [None]:
model_output = "owen198/esgbert"

In [None]:
# tokenizer.save_pretrained(model_output)
# model.save_pretrained(model_output)

In [None]:
tokenizer.push_to_hub(model_output)
model.push_to_hub(model_output)