In [212]:
pip install transformers datasets evaluate



In [213]:
pip install accelerate -U



In [214]:
pip install optuna ray[tune]

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ray[tune]
  Downloading ray-2.6.3-cp310-cp310-manylinux2014_x86_64.whl (56.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.3-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7

In [215]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [235]:
INPUT_EN = '/content/drive/My Drive/Projects/esg/ML-ESG-2_English_Train.json'
INPUT_FR = '/content/drive/My Drive/Projects/esg/ML-ESG-2_French_Train.json'


"""
BestRun(
  run_id='4',
  objective=0.7906976744186046,
  hyperparameters={
    'learning_rate': 1.1484099793559503e-05,
    'num_train_epochs': 4,
    'seed': 8,
    'per_device_train_batch_size': 8
  },
  run_summary=None)
"""
MODEL_CHECKPOINT = 'distilbert-base-uncased'
BATCH_SIZE = 8
LEARNING_RATE = 1.15e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 4
TRAINING_SEED = 8

In [249]:
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments,
                          Trainer,
                          TextClassificationPipeline)
from datasets import (load_dataset, load_metric,
                      Dataset, Features, Value, ClassLabel)
import evaluate
import random
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [237]:
df = pd.read_json(INPUT_EN)
df = df.rename(
    columns={
        'news_content': 'sentence',
        'impact_type': 'label'
    }
)[['sentence', 'label']]
df.head()

Unnamed: 0,sentence,label
0,ESG-focused financial technology company Arabe...,Opportunity
1,The company also announced the appointment of ...,Opportunity
2,Wong said: \n“Personalised portfolios demand ...,Opportunity
3,One of the key themes of the report is the imp...,Opportunity
4,Europe’s three primary financial regulatory ag...,Opportunity


In [238]:
class_names = ['Opportunity', 'Risk']
esg_features = Features({
    'sentence': Value('string'),
    'label': ClassLabel(names=class_names)
})

ds = Dataset.from_pandas(
    df,
    features=esg_features
)
ds = ds.train_test_split(test_size=0.2, stratify_by_column='label', seed=1337)
metric = evaluate.load('f1')
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

In [239]:
ds['train'][0]

{'sentence': 'These same challenges are very familiar to managers in mid-sized financial institutions, where inadequate resources and scarcity of subject matter expertise are often cited as barriers to progress.',
 'label': 1}

In [240]:
sample_content = ds['train'][0]['sentence']
tokenizer(sample_content)

{'input_ids': [101, 2122, 2168, 7860, 2024, 2200, 5220, 2000, 10489, 1999, 3054, 1011, 7451, 3361, 4896, 1010, 2073, 14710, 4219, 1998, 11228, 12972, 1997, 3395, 3043, 11532, 2024, 2411, 6563, 2004, 13500, 2000, 5082, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [241]:
def preprocess_content(examples):
  return tokenizer(examples['sentence'], truncation=True)

preprocess_content(ds['train'][:5])

{'input_ids': [[101, 2122, 2168, 7860, 2024, 2200, 5220, 2000, 10489, 1999, 3054, 1011, 7451, 3361, 4896, 1010, 2073, 14710, 4219, 1998, 11228, 12972, 1997, 3395, 3043, 11532, 2024, 2411, 6563, 2004, 13500, 2000, 5082, 1012, 102], [101, 2750, 3278, 5245, 5963, 2013, 9387, 1998, 2060, 15169, 1011, 4208, 2967, 26239, 1996, 6378, 2004, 25963, 1998, 4675, 21572, 26638, 1010, 2009, 2001, 23575, 2011, 1996, 2079, 2140, 2101, 2008, 2095, 1012, 1999, 1037, 2582, 6271, 2000, 9686, 2290, 1011, 4208, 9387, 1010, 1996, 2079, 2140, 2036, 3843, 3513, 4953, 24540, 6830, 1010, 4254, 2075, 1996, 3754, 1997, 5211, 10489, 2000, 5326, 15169, 3289, 2083, 2037, 10518, 1010, 1998, 9104, 2008, 24540, 6830, 2006, 9686, 2290, 3314, 2003, 2025, 1999, 1996, 5426, 1997, 9387, 1012, 102], [101, 3121, 2024, 1037, 3145, 3120, 1997, 3795, 16635, 3806, 1006, 1043, 25619, 1007, 11768, 1010, 1998, 2036, 2028, 1997, 1996, 18263, 2000, 5672, 1010, 2445, 2037, 2146, 1011, 2744, 3267, 1012, 2429, 2000, 1043, 8873, 1010, 2885

In [242]:
enc_ds = ds.map(preprocess_content, batched=True)

Map:   0%|          | 0/646 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

In [243]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [244]:
model_name = 'distilbert'

args = TrainingArguments(
    f'{model_name}-finetuned-esg1',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    seed=TRAINING_SEED,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False
)

In [245]:
def compute_eval(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  return metric.compute(predictions=predictions, references=labels)

In [246]:
trainer = Trainer(
    model,
    args,
    train_dataset=enc_ds['train'],
    eval_dataset=enc_ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_eval
)

In [247]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.300675,0.0
2,No log,0.193248,0.628571
3,No log,0.187165,0.717949
4,No log,0.176787,0.780488


TrainOutput(global_step=324, training_loss=0.2230552390769676, metrics={'train_runtime': 53.5878, 'train_samples_per_second': 48.22, 'train_steps_per_second': 6.046, 'total_flos': 89825832558168.0, 'train_loss': 0.2230552390769676, 'epoch': 4.0})

In [248]:
trainer.evaluate()

{'eval_loss': 0.17678749561309814,
 'eval_f1': 0.7804878048780488,
 'eval_runtime': 0.7385,
 'eval_samples_per_second': 219.365,
 'eval_steps_per_second': 28.436,
 'epoch': 4.0}

In [257]:
text = ["More than 1 billion car tires reach the end of their life each year, and dealing with the resulting waste is an escalating management headache the world over.",
        "Most large-scale production of wheat relies on synthetic fertilizer, which contributes to climate change, algae blooms, and oceanic “dead zones” when nutrients from these fertilizers run off into the environment.",
        "Trained on data on how species interact with each other, the model could serve to alert conservation managers on which vulnerable species to focus on, the developers say."]
pipe = TextClassificationPipeline(model=model.to('cpu'), tokenizer=tokenizer)
pipe(text)

[{'label': 'LABEL_1', 'score': 0.8178699016571045},
 {'label': 'LABEL_1', 'score': 0.6433500647544861},
 {'label': 'LABEL_0', 'score': 0.9831188321113586}]

In [230]:
# def model_init():
#   return AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

In [231]:
# trainer=Trainer(
#     model_init=model_init,
#     args=args,
#     train_dataset=enc_ds['train'],
#     eval_dataset=enc_ds['test'],
#     tokenizer=tokenizer,
#     compute_metrics=compute_eval
# )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [232]:
# best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

[I 2023-08-23 08:21:33,764] A new study created in memory with name: no-name-ba307b75-47a7-43e9-9c62-804ef07b830d
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.707191,0.260355


[I 2023-08-23 08:21:48,161] Trial 0 finished with value: 0.2603550295857988 and parameters: {'learning_rate': 1.1261073158655119e-06, 'num_train_epochs': 1, 'seed': 14, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.2603550295857988.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.324195,0.466667
2,No log,0.28266,0.685714
3,No log,0.21614,0.717949


[I 2023-08-23 08:22:35,294] Trial 1 finished with value: 0.717948717948718 and parameters: {'learning_rate': 1.355329725349153e-05, 'num_train_epochs': 3, 'seed': 32, 'per_device_train_batch_size': 4}. Best is trial 1 with value: 0.717948717948718.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.198927,0.75
2,No log,0.23248,0.740741
3,No log,0.232256,0.75


[I 2023-08-23 08:23:16,826] Trial 2 finished with value: 0.75 and parameters: {'learning_rate': 8.833178405417355e-05, 'num_train_epochs': 3, 'seed': 18, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 0.75.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.594517,0.0
2,No log,0.562401,0.0
3,No log,0.542095,0.0
4,No log,0.534723,0.0


[I 2023-08-23 08:24:11,960] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 2.3787635824128504e-06, 'num_train_epochs': 4, 'seed': 33, 'per_device_train_batch_size': 64}. Best is trial 2 with value: 0.75.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.282685,0.0
2,No log,0.212216,0.684211
3,No log,0.203579,0.75
4,No log,0.191298,0.790698


[I 2023-08-23 08:25:05,346] Trial 4 finished with value: 0.7906976744186046 and parameters: {'learning_rate': 1.1484099793559503e-05, 'num_train_epochs': 4, 'seed': 8, 'per_device_train_batch_size': 8}. Best is trial 4 with value: 0.7906976744186046.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.275714,0.0


[I 2023-08-23 08:25:16,006] Trial 5 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.667115,0.277778
2,No log,0.608665,0.0
3,No log,0.567612,0.0
4,No log,0.54297,0.0
5,No log,0.534512,0.0


[I 2023-08-23 08:26:21,189] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 1.6818534201311035e-06, 'num_train_epochs': 5, 'seed': 24, 'per_device_train_batch_size': 32}. Best is trial 4 with value: 0.7906976744186046.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.293197,0.0


[I 2023-08-23 08:26:31,938] Trial 7 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.294779,0.0


[I 2023-08-23 08:26:43,130] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.401409,0.0


[I 2023-08-23 08:26:56,592] Trial 9 pruned. 


In [233]:
# best_run

BestRun(run_id='4', objective=0.7906976744186046, hyperparameters={'learning_rate': 1.1484099793559503e-05, 'num_train_epochs': 4, 'seed': 8, 'per_device_train_batch_size': 8}, run_summary=None)

In [234]:
# for n, v in best_run.hyperparameters.items():
#     setattr(trainer.args, n, v)

# trainer.train()

FrozenInstanceError: ignored