In [1]:
%%capture
! pip install transformers datasets evaluate
! pip install accelerate
! pip install --upgrade accelerate
! pip install huggingface_hub
! pip install wandb

In [3]:
from huggingface_hub import notebook_login
import wandb
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import load_dataset, ClassLabel, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, get_linear_schedule_with_warmup, get_constant_schedule
from transformers.optimization import AdamW
from df_processing import balance_via_undersampling, add_relative_return_ordinal

In [4]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# Credentials are needed because the training arguments further down set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Log in to Weights & Biases (asks for an API key when none is stored yet)
wandb.login()

# Environment variables for the W&B run: the project name and the model-logging switch
%env WANDB_PROJECT=distilBERT-on-market
%env WANDB_LOG_MODEL=true

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=distilBERT-on-market
env: WANDB_LOG_MODEL=true


In [6]:
# Load the raw training data (absolute Colab/Drive path — adjust when running elsewhere)
df = pd.read_csv(r"/content/drive/MyDrive/masterProject/av_train.csv")

# Add the "relative_return" class column via the project helper
# (presumably derived from the two S&P percentile columns — confirm in df_processing)
df = add_relative_return_ordinal(df, "sp_25_pct", "sp_75_pct")

# Balance dataset via undersampling
# NOTE(review): if the helper samples randomly it should be seeded too — confirm in df_processing
df = balance_via_undersampling(df, "relative_return")
print(df.value_counts(subset="relative_return"))

# Convert the balanced frame to a Hugging Face Dataset and rename the columns
# to the "label" / "text" names used by the tokenization step below
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column("relative_return", "label")
dataset = dataset.rename_column("summary", "text")

# Define labels; both id mappings are derived from the ClassLabel names so the
# three definitions cannot drift apart
labels = ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])
id2label = dict(enumerate(labels.names))
label2id = {name: idx for idx, name in id2label.items()}

# Split into 80% training and 20% validation.
# A fixed seed keeps the split identical across kernel restarts, so validation
# metrics of different runs are measured on the same examples.
SPLIT_SEED = 42
dataset = dataset.train_test_split(train_size=0.8, seed=SPLIT_SEED)

# Tokenize dataset using distilbert-base-uncased
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of texts and encode its string labels as class ids.

    Args:
        examples: Batch mapping with a "text" list and a "label" list whose
            entries are class names known to the module-level ``labels``.

    Returns:
        The tokenizer output for the batch, extended with a "label" entry
        that holds the integer class ids.
    """
    # No padding here: DataCollatorWithPadding pads every batch to its own
    # longest sequence when batches are assembled, so padding each map() chunk
    # to its longest text would only add wasted tokens.
    encoded = tokenizer(examples["text"], truncation=True)
    # Return the encoded labels explicitly instead of mutating the input batch.
    encoded["label"] = labels.str2int(examples["label"])
    return encoded


# Run the tokenization function batch-wise over both splits
tokenized_train, tokenized_test = (
    dataset[split].map(tokenize_function, batched=True) for split in ("train", "test")
)

# Collator that pads the examples of a batch when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load DistilBERT with a 3-class classification head and readable label names
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

relative_return
negative    1062
neutral     1062
positive    1062
dtype: int64


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2548 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Set hyperparameters
training_args = TrainingArguments(
    output_dir="distilBERT-on-market",
    run_name="Adam",
    learning_rate=3e-05,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # weight_decay=0.25,
    # Training length is fixed by max_steps; a positive max_steps overrides
    # num_train_epochs, so no epoch count is passed.
    max_steps=80,
    # Log, evaluate and checkpoint after every optimizer step
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=1,
    save_steps=1,
    # A checkpoint is written every step; cap how many stay on disk. With
    # load_best_model_at_end the best checkpoint is kept in addition to the latest.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # greater_is_better=True,
    push_to_hub=True,
    hub_private_repo=True,
    report_to="wandb"
)

# Evaluate model
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_preds):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_preds: Object exposing `predictions` (per-class logits) and
            `label_ids` (reference class ids).

    Returns:
        dict merging the results of the accuracy metric and the F1 metric.
    """
    # Metrics are cached: with eval_steps=1 this function runs after every
    # training step, and reloading both metrics each time is pure overhead.
    accuracy_metric = _get_metric("accuracy")
    f1_metric = _get_metric("f1")

    # Predicted class = index of the largest logit along the last axis
    preds = np.argmax(eval_preds.predictions, axis=-1)
    references = eval_preds.label_ids

    metrics = {}
    metrics.update(accuracy_metric.compute(predictions=preds, references=references))
    metrics.update(f1_metric.compute(predictions=preds, references=references, average="weighted"))
    return metrics

# Define early stopping
# early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=15, early_stopping_threshold=0.1)

# Define optimizer
# optimizer = AdamW(model.parameters(), lr=1e-4)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=50)
# scheduler = get_constant_schedule(optimizer)

# Assemble the Trainer from the configuration, model, data splits and metric hook
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    # callbacks=[early_stopping_callback],
    # optimizers=(optimizer, scheduler)
)

Cloning https://huggingface.co/c-b123/distilBERT-on-market into local empty directory.


Download file pytorch_model.bin:   0%|          | 17.4k/255M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [8]:
# Fine-tune the model; evaluation, checkpointing and logging follow training_args
trainer.train()



[34m[1mwandb[0m: Currently logged in as: [33mchristian-159[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
1,1.1131,1.101754,0.319749,0.170074
2,1.0966,1.103503,0.315047,0.157301
3,1.1089,1.101326,0.310345,0.179635
4,1.0957,1.099854,0.327586,0.253234
5,1.0908,1.098841,0.341693,0.280658
6,1.0983,1.09803,0.34953,0.320229
7,1.1011,1.097358,0.352665,0.329133
8,1.0959,1.097089,0.346395,0.305275
9,1.0956,1.097159,0.351097,0.297
10,1.1066,1.097234,0.352665,0.289435


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

/content/distilBERT-on-market is already a clone of https://huggingface.co/c-b123/distilBERT-on-market. Make sure you pull the latest changes with `repo.git_pull()`.


Upload file pytorch_model.bin:   0%|          | 1.00/255M [00:00<?, ?B/s]

To https://huggingface.co/c-b123/distilBERT-on-market
   7ff3f2f..9c97728  main -> main

   7ff3f2f..9c97728  main -> main

To https://huggingface.co/c-b123/distilBERT-on-market
   9c97728..fb0d5fa  main -> main

   9c97728..fb0d5fa  main -> main



TrainOutput(global_step=80, training_loss=1.0572890311479568, metrics={'train_runtime': 1130.1475, 'train_samples_per_second': 9.061, 'train_steps_per_second': 0.071, 'total_flos': 501025474272960.0, 'train_loss': 1.0572890311479568, 'epoch': 4.0})

In [9]:
# Evaluate the model held by the trainer on the validation split.
# The metrics are printed explicitly: a notebook only echoes the last expression
# of a cell, so a bare trainer.evaluate() call here would discard its result.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Upload the model to the Hub
trainer.push_to_hub()

To https://huggingface.co/c-b123/distilBERT-on-market
   fb0d5fa..414c294  main -> main

   fb0d5fa..414c294  main -> main

