In [1]:
from datasets import load_dataset, Dataset, DatasetDict 
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd

dataset = load_dataset('pauri32/fiqa-2018')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 961
    })
    validation: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 102
    })
    test: Dataset({
        features: ['sentence', 'snippets', 'target', 'sentiment_score', 'aspects', 'format', 'label'],
        num_rows: 150
    })
})

In [3]:
from datasets import load_dataset, Dataset, DatasetDict 
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd

dataset = load_dataset('pauri32/fiqa-2018')

In [4]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence"], padding=True)

remove_columns = ['snippets', 'target', 'sentiment_score', 'aspects', 'format']
tokenized_datasets = dataset.map(tokenize_function, remove_columns=remove_columns, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
from transformers import TrainingArguments

output_dir = "./bert-financial-sentiment-analysis"
training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=3,  # Number of training epochs
  per_device_train_batch_size=8,  # Batch size per GPU
  per_device_eval_batch_size=8,   # Batch size for evaluation per GPU
  weight_decay=0.01,   # Strength of weight decay
  logging_dir="./logs",   # Directory for storing logs
  logging_steps=100,   # Log every N steps
  evaluation_strategy="steps",   # Evaluation strategy during training
  eval_steps=200,   # Run evaluation every N steps
  save_total_limit=2,   # Only save the last N checkpoints
  save_steps=200,   # Save checkpoint every N steps
  load_best_model_at_end=True,   # Load the best model at the end of training
  metric_for_best_model="accuracy",   # Metric to use for the best model
)


In [6]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import Trainer
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
trainer.train()

  0%|          | 0/363 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 28%|██▊       | 102/363 [00:10<00:25, 10.28it/s]

{'loss': 0.8795, 'learning_rate': 1.25e-05, 'epoch': 0.83}


 55%|█████▌    | 200/363 [00:20<00:16,  9.64it/s]

{'loss': 0.6426, 'learning_rate': 2.5e-05, 'epoch': 1.65}


                                                 
 55%|█████▌    | 200/363 [00:20<00:16,  9.64it/s]

{'eval_loss': 1.2002240419387817, 'eval_accuracy': 0.6372549019607843, 'eval_runtime': 0.213, 'eval_samples_per_second': 478.818, 'eval_steps_per_second': 61.026, 'epoch': 1.65}


 83%|████████▎ | 302/363 [00:33<00:05, 10.17it/s]

{'loss': 0.3942, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.48}


100%|██████████| 363/363 [00:39<00:00,  9.24it/s]


{'train_runtime': 39.3064, 'train_samples_per_second': 73.347, 'train_steps_per_second': 9.235, 'train_loss': 0.6004160573659849, 'epoch': 3.0}


TrainOutput(global_step=363, training_loss=0.6004160573659849, metrics={'train_runtime': 39.3064, 'train_samples_per_second': 73.347, 'train_steps_per_second': 9.235, 'train_loss': 0.6004160573659849, 'epoch': 3.0})

In [9]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 64.98it/s]

(102, 3) (102,)





In [10]:
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 66.72it/s]


{'eval_loss': 1.2002240419387817,
 'eval_accuracy': 0.6372549019607843,
 'eval_runtime': 0.2149,
 'eval_samples_per_second': 474.643,
 'eval_steps_per_second': 60.494,
 'epoch': 3.0}

In [11]:
trainer.save_model()

KeyboardInterrupt: 

In [None]:
trainer.push_to_hub()

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-65501de1-3b6faa621d39682373dbf4d8;d0d6b56a-d48d-472b-b4cc-628c3463236b)

Repository Not Found for url: https://huggingface.co/api/models/output-fiqa.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.