[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/carloszan/bert-fine-tuning-tutorial/blob/main/create-model.ipynb)

In [None]:
!pip install transformers
!pip install datasets

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the 'ic-fspml/fpb' dataset. Later cells read its "sentence" and "label"
# columns and its "train" and "test" splits.
dataset = load_dataset('ic-fspml/fpb')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identifier of the pretrained checkpoint; the tokenizer is loaded from it.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Integer class id assigned to each sentiment string of the "label" column.
mapDict = {"positive": 0, "neutral": 1, "negative": 2}


def transform_labels(label):
    """Convert a batch of string sentiment labels into integer class ids.

    Args:
        label: Batch mapping whose "label" entry is an iterable of sentiment
            strings; each one must be a key of ``mapDict``.

    Returns:
        A dict with a single "label" key holding the mapped ids, in the same
        order as the input labels.

    Raises:
        KeyError: If a label is not a key of ``mapDict``.
    """
    return {"label": [mapDict[name] for name in label["label"]]}

def tokenize_function(example):
    """Tokenize the "sentence" field of a batch of examples.

    No padding is applied here. Batches are padded later by the
    ``DataCollatorWithPadding`` that is passed to the ``Trainer``, so each
    batch is padded only to its own longest sequence rather than to the
    longest sequence of the whole ``map`` chunk.

    Args:
        example: Batch mapping whose "sentence" entry holds the raw texts.

    Returns:
        The tokenizer output for the batch, with truncation enabled.
    """
    return tokenizer(example["sentence"], truncation=True)

# Tokenize the sentences, then convert the string labels to class ids.
# Both passes run in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True).map(
    transform_labels, batched=True
)

# Padding collator built from the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
from transformers import TrainingArguments

import inspect

output_dir = "./bert-financial-sentiment-analysis"

# transformers renamed `evaluation_strategy` to `eval_strategy`; the old name
# is deprecated and no longer accepted by newer releases. Because the install
# cell does not pin a version, pick whichever keyword the installed
# TrainingArguments accepts so this cell runs on both old and new releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=3,  # Number of training epochs
  per_device_train_batch_size=8,  # Batch size per GPU
  per_device_eval_batch_size=8,   # Batch size for evaluation per GPU
  weight_decay=0.01,   # Strength of weight decay
  logging_dir="./logs",   # Directory for storing logs
  logging_steps=100,   # Log every N steps
  eval_steps=200,   # Run evaluation every N steps
  save_total_limit=2,   # Only save the last N checkpoints
  save_steps=200,   # Save checkpoint every N steps
  load_best_model_at_end=True,   # Load the best model at the end of training
  metric_for_best_model="accuracy",   # Metric to use for the best model
  **{_eval_strategy_key: "steps"},   # Evaluation strategy during training
)


In [6]:
from transformers import AutoModelForSequenceClassification
# Label-name <-> class-id tables stored in the model config, so the saved and
# pushed model reports "positive"/"neutral"/"negative" instead of generic
# LABEL_0/LABEL_1/LABEL_2 names. Both are derived from `mapDict`, the mapping
# used to encode the dataset labels.
label2id = dict(mapDict)
id2label = {class_id: name for name, class_id in mapDict.items()}

# Reuse `checkpoint` so the model and the tokenizer come from the same pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import Trainer
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        (taken over the last axis of the logits) against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever name the installed
# Trainer accepts so this cell runs on both old and new releases.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# NOTE(review): the "test" split is used for the periodic evaluation that
# selects the best checkpoint and also for the final reported score, so that
# score is optimistic. If the dataset offers a validation split, pass it as
# `eval_dataset` instead — TODO confirm which splits exist.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [8]:
# Fine-tune the model with the arguments, datasets and metric configured on `trainer`.
trainer.train()

  0%|          | 0/1455 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  7%|▋         | 101/1455 [00:17<03:36,  6.26it/s]

{'loss': 0.7072, 'learning_rate': 4.656357388316151e-05, 'epoch': 0.21}


 14%|█▎        | 200/1455 [00:33<03:18,  6.32it/s]

{'loss': 0.5398, 'learning_rate': 4.312714776632303e-05, 'epoch': 0.41}


                                                  
 14%|█▎        | 200/1455 [00:37<03:18,  6.32it/s]

{'eval_loss': 0.5136759877204895, 'eval_accuracy': 0.8072164948453608, 'eval_runtime': 4.1595, 'eval_samples_per_second': 233.202, 'eval_steps_per_second': 29.331, 'epoch': 0.41}


 21%|██        | 301/1455 [00:56<03:13,  5.97it/s]

{'loss': 0.4808, 'learning_rate': 3.9690721649484535e-05, 'epoch': 0.62}


 27%|██▋       | 400/1455 [01:13<02:55,  5.99it/s]

{'loss': 0.4619, 'learning_rate': 3.6254295532646046e-05, 'epoch': 0.82}


                                                  
 27%|██▋       | 400/1455 [01:17<02:55,  5.99it/s]

{'eval_loss': 0.4085661768913269, 'eval_accuracy': 0.831958762886598, 'eval_runtime': 4.4878, 'eval_samples_per_second': 216.143, 'eval_steps_per_second': 27.185, 'epoch': 0.82}


 34%|███▍      | 501/1455 [01:36<02:45,  5.76it/s]

{'loss': 0.408, 'learning_rate': 3.2817869415807564e-05, 'epoch': 1.03}


 41%|████      | 600/1455 [01:53<02:22,  6.02it/s]

{'loss': 0.2698, 'learning_rate': 2.9381443298969075e-05, 'epoch': 1.24}


                                                  
 41%|████      | 600/1455 [01:57<02:22,  6.02it/s]

{'eval_loss': 0.6087841391563416, 'eval_accuracy': 0.8268041237113402, 'eval_runtime': 4.494, 'eval_samples_per_second': 215.845, 'eval_steps_per_second': 27.148, 'epoch': 1.24}


 48%|████▊     | 701/1455 [02:16<02:03,  6.10it/s]

{'loss': 0.2692, 'learning_rate': 2.594501718213059e-05, 'epoch': 1.44}


 55%|█████▍    | 800/1455 [02:33<01:42,  6.37it/s]

{'loss': 0.2799, 'learning_rate': 2.2508591065292097e-05, 'epoch': 1.65}


                                                  
 55%|█████▍    | 800/1455 [02:38<01:42,  6.37it/s]

{'eval_loss': 0.6722472906112671, 'eval_accuracy': 0.8340206185567011, 'eval_runtime': 4.5422, 'eval_samples_per_second': 213.552, 'eval_steps_per_second': 26.859, 'epoch': 1.65}


 62%|██████▏   | 901/1455 [02:56<01:27,  6.37it/s]

{'loss': 0.2827, 'learning_rate': 1.9072164948453608e-05, 'epoch': 1.86}


 69%|██████▊   | 1000/1455 [03:13<01:11,  6.38it/s]

{'loss': 0.2429, 'learning_rate': 1.5635738831615122e-05, 'epoch': 2.06}


                                                   
 69%|██████▊   | 1000/1455 [03:18<01:11,  6.38it/s]

{'eval_loss': 0.7032499313354492, 'eval_accuracy': 0.8391752577319588, 'eval_runtime': 4.5142, 'eval_samples_per_second': 214.878, 'eval_steps_per_second': 27.026, 'epoch': 2.06}


 76%|███████▌  | 1101/1455 [03:36<00:59,  5.93it/s]

{'loss': 0.101, 'learning_rate': 1.2199312714776633e-05, 'epoch': 2.27}


 82%|████████▏ | 1200/1455 [03:53<00:39,  6.40it/s]

{'loss': 0.0831, 'learning_rate': 8.762886597938144e-06, 'epoch': 2.47}


                                                   
 82%|████████▏ | 1200/1455 [03:57<00:39,  6.40it/s]

{'eval_loss': 0.7949932217597961, 'eval_accuracy': 0.8474226804123711, 'eval_runtime': 4.59, 'eval_samples_per_second': 211.328, 'eval_steps_per_second': 26.579, 'epoch': 2.47}


 89%|████████▉ | 1301/1455 [04:16<00:25,  6.10it/s]

{'loss': 0.1449, 'learning_rate': 5.326460481099657e-06, 'epoch': 2.68}


 96%|█████████▌| 1400/1455 [04:33<00:09,  6.09it/s]

{'loss': 0.144, 'learning_rate': 1.8900343642611683e-06, 'epoch': 2.89}


                                                   
 96%|█████████▌| 1400/1455 [04:38<00:09,  6.09it/s]

{'eval_loss': 0.8082786202430725, 'eval_accuracy': 0.8422680412371134, 'eval_runtime': 4.5091, 'eval_samples_per_second': 215.121, 'eval_steps_per_second': 27.056, 'epoch': 2.89}


100%|██████████| 1455/1455 [04:49<00:00,  5.03it/s]

{'train_runtime': 289.4443, 'train_samples_per_second': 40.174, 'train_steps_per_second': 5.027, 'train_loss': 0.3072073579244188, 'epoch': 3.0}





TrainOutput(global_step=1455, training_loss=0.3072073579244188, metrics={'train_runtime': 289.4443, 'train_samples_per_second': 40.174, 'train_steps_per_second': 5.027, 'train_loss': 0.3072073579244188, 'epoch': 3.0})

In [11]:
# Run the trained model over the tokenized "test" split, then print the shapes
# of the returned prediction array and label-id array.
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 122/122 [00:04<00:00, 28.09it/s]

(970, 3) (970,)





In [12]:
# Evaluate with no explicit dataset (presumably the `eval_dataset` given to the
# Trainer is used — verify) and display the resulting metrics dict.
trainer.evaluate()

100%|██████████| 122/122 [00:04<00:00, 27.84it/s]


{'eval_loss': 0.7949932217597961,
 'eval_accuracy': 0.8474226804123711,
 'eval_runtime': 4.4203,
 'eval_samples_per_second': 219.444,
 'eval_steps_per_second': 27.6,
 'epoch': 3.0}

In [13]:
# Save the model. No path is passed, so it presumably goes to
# `training_args.output_dir` — verify against the Trainer documentation.
trainer.save_model()

In [14]:
# Upload the model files to the Hugging Face Hub; the cell output shows the
# resulting repository URL.
# NOTE(review): no Hub login step is visible in this notebook — presumably
# credentials must already be configured before running this cell.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]
model.safetensors:   0%|          | 16.4k/438M [00:00<3:26:17, 35.4kB/s]
training_args.bin: 100%|██████████| 4.54k/4.54k [00:00<00:00, 6.91kB/s] 
model.safetensors: 100%|██████████| 438M/438M [04:59<00:00, 1.46MB/s] 
Upload 2 LFS files: 100%|██████████| 2/2 [05:00<00:00, 150.16s/it]


'https://huggingface.co/carloszansavio/bert-financial-sentiment-analysis/tree/main/'