In [None]:
%%capture
# Install the notebook's dependencies.
# `%pip` (instead of `! pip`) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install transformers datasets evaluate huggingface_hub wandb
# accelerate is installed once, with --upgrade, so an already-present copy is brought up to date
# (previously it was installed twice: once plain, once with --upgrade).
%pip install --upgrade accelerate

In [None]:
from huggingface_hub import notebook_login
import wandb
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import load_dataset, ClassLabel, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, get_linear_schedule_with_warmup
from transformers.optimization import AdamW
from df_processing import balance_via_undersampling

In [None]:
# Authenticate this notebook session against the Hugging Face Hub
# (required later because the trainer pushes to a private Hub repository)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Login to WandB
wandb.login()

# Configure the Weights & Biases integration through environment variables.
# WANDB_PROJECT: name of the W&B project the runs of this notebook are filed under.
# WANDB_LOG_MODEL: asks the integration to also log the model to W&B.
# NOTE(review): newer transformers releases appear to document the values
# "end"/"checkpoint"/"false" for WANDB_LOG_MODEL - confirm against the installed version.
# Keep comments on their own lines: text after a `%env` assignment becomes part of the value.
%env WANDB_PROJECT=distilBERT-on-finBERT
%env WANDB_LOG_MODEL=true

[34m[1mwandb[0m: Currently logged in as: [33mchristian-159[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=distilBERT-on-finBERT
env: WANDB_LOG_MODEL=true


In [None]:
# Load the labelled training data from Google Drive
df = pd.read_csv(r"/content/drive/MyDrive/masterProject/av_train.csv")

# Balance dataset via undersampling on the "finBERT" label column and show the class counts
df = balance_via_undersampling(df, "finBERT")
print(df.value_counts(subset="finBERT"))

# Convert to a Hugging Face Dataset and rename the columns to the names used downstream
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column("finBERT", "label")
dataset = dataset.rename_column("summary", "text")

# Define labels once: the ClassLabel feature and both lookup tables are derived from the
# same ordered list, so the id <-> name mappings cannot drift apart.
label_names = ["negative", "neutral", "positive"]
labels = ClassLabel(num_classes=len(label_names), names=label_names)
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Split into 80% training and 20% validation.
# The shuffle is seeded so that every "Restart & Run All" yields the same split and
# validation metrics stay comparable between runs.
SPLIT_SEED = 42
dataset = dataset.train_test_split(train_size=0.8, seed=SPLIT_SEED)

# Tokenizer belonging to the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of examples and encode its string labels as class ids.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping; its "text" and "label" entries are read.

    Returns:
        The tokenizer output for ``examples["text"]`` with an added "label"
        entry containing ``labels.str2int(examples["label"])``.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding handed to the
    # Trainer, which pads each training batch to its own longest sequence instead of
    # to the longest sequence of a whole map batch.
    encoded = tokenizer(examples["text"], truncation=True)

    # Return the encoded labels as part of the result rather than overwriting
    # examples["label"] in place: whether in-place edits of the input batch are kept
    # is an implementation detail of `map`, returned columns always are.
    encoded["label"] = labels.str2int(examples["label"])
    return encoded


# Tokenize both splits batch-wise (train first, then test)
tokenized_train, tokenized_test = (
    dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "test")
)

# Collator that pads the examples of a batch with the tokenizer when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained checkpoint with a 3-class classification head and the label mappings
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

finBERT
negative    1069
neutral     1069
positive    1069
dtype: int64


Map:   0%|          | 0/2565 [00:00<?, ? examples/s]

Map:   0%|          | 0/642 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set hyperparameters
training_args = TrainingArguments(
    output_dir="distilBERT-on-finBERT",
    run_name="AdamW-linear-schedule",
    # Optimisation
    learning_rate=3e-5,
    weight_decay=0,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Training length is controlled by max_steps alone: a positive max_steps takes
    # precedence over num_train_epochs, so the former `num_train_epochs=1` had no effect
    # and was removed to avoid suggesting a one-epoch run.
    max_steps=200,
    # Log, evaluate and checkpoint after every optimisation step
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=1,
    save_steps=1,
    # Keep the checkpoint with the lowest validation loss; stated explicitly because
    # for a loss metric smaller values are the better ones.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Publishing / experiment tracking
    push_to_hub=True,
    hub_private_repo=True,
    report_to="wandb"
)

# Metric function used by the Trainer at every evaluation.
# Loaded metric objects are kept in this cache so each metric is loaded once instead of
# on every evaluation call.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_preds):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_preds: Object exposing ``predictions`` (the logits) and ``label_ids``
            (the reference class ids).

    Returns:
        dict: The merged results of the "accuracy" metric and of the "f1" metric
        computed with ``average="weighted"``.
    """
    logits = eval_preds.predictions
    # Named `references` so the module-level `labels` ClassLabel is not shadowed
    references = eval_preds.label_ids
    # Predicted class = index of the largest logit along the last axis
    preds = np.argmax(logits, axis=-1)

    metrics = dict()
    metrics.update(_get_metric("accuracy").compute(predictions=preds, references=references))
    metrics.update(_get_metric("f1").compute(predictions=preds, references=references, average="weighted"))
    return metrics

# Early stopping callback, configured with a patience of 5 evaluations and an
# improvement threshold of 0.1 on the monitored metric
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.1,
    early_stopping_patience=5,
)

# No custom optimizer/scheduler is passed, so the Trainer creates its own.
# To experiment with a hand-built pair (e.g. AdamW + get_linear_schedule_with_warmup),
# hand it over via `optimizers=(optimizer, scheduler)` in the call below.

# Assemble the Trainer from the model, data, metric function and callback
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

/content/distilBERT-on-finBERT is already a clone of https://huggingface.co/c-b123/distilBERT-on-finBERT. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
1,1.1105,1.098372,0.325545,0.159904
2,1.104,1.088889,0.426791,0.392718
3,1.0951,1.081093,0.528037,0.500203
4,1.0773,1.07317,0.566978,0.536941
5,1.0771,1.063489,0.584112,0.554827
6,1.0687,1.05086,0.604361,0.594297
7,1.0498,1.03486,0.624611,0.626084
8,1.0275,1.015752,0.5919,0.595125
9,1.0231,0.994227,0.588785,0.587744
10,1.0116,0.96988,0.629283,0.632664


/content/distilBERT-on-finBERT is already a clone of https://huggingface.co/c-b123/distilBERT-on-finBERT. Make sure you pull the latest changes with `repo.git_pull()`.


TrainOutput(global_step=55, training_loss=0.6571723634546454, metrics={'train_runtime': 1626.5065, 'train_samples_per_second': 15.739, 'train_steps_per_second': 0.123, 'total_flos': 333940226281380.0, 'train_loss': 0.6571723634546454, 'epoch': 2.62})

In [None]:
# Evaluate the model held by the trainer on the validation split.
# The metrics are captured and printed explicitly: a notebook only displays the value of
# a cell's last expression, so a bare `trainer.evaluate()` here would be silently dropped.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Upload the model to the Hub
trainer.push_to_hub()

To https://huggingface.co/c-b123/distilBERT-on-finBERT
   2e1d830..e0b0ab7  main -> main

   2e1d830..e0b0ab7  main -> main

