In [None]:
%%capture
! pip install transformers datasets evaluate
! pip install accelerate
! pip install --upgrade accelerate
! pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. The TrainingArguments later in this
# notebook set push_to_hub=True, so Hub credentials are presumably required
# before the Trainer is created.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset, ClassLabel, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

from df_processing import add_relative_return_ordinal, balance_via_oversampling, balance_via_undersampling

# Source CSV on the mounted Google Drive.
DATA_PATH = r"/content/drive/MyDrive/masterProject/av_train.csv"
# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

# Load the raw data, add the "relative_return" label column and balance it.
# NOTE(review): the two helpers come from df_processing and are not visible
# here; presumably the label is derived from the two percentile columns and
# balancing samples rows at random (unseeded?) — confirm.
raw_df = pd.read_csv(DATA_PATH)
labelled_df = add_relative_return_ordinal(raw_df, "sp_25_pct", "sp_75_pct")
df = balance_via_undersampling(labelled_df, "relative_return")

# Sanity check: number of rows per class after balancing
print(df.value_counts(subset="relative_return"))

# Label vocabulary. The id<->name mappings are derived from it so the three
# definitions cannot drift apart.
labels = ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])
id2label = dict(enumerate(labels.names))
label2id = {name: idx for idx, name in id2label.items()}

# Build a Hugging Face dataset and rename the columns to "label" and "text"
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_column("relative_return", "label")
dataset = dataset.rename_column("summary", "text")

# Split into 80% training and 20% validation (seeded for reproducibility)
dataset = dataset.train_test_split(train_size=0.8, seed=SPLIT_SEED)

# Tokenizer of the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of texts and attach integer class labels.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``; its
            "text" and "label" columns are read.

    Returns:
        The tokenizer output for ``examples["text"]`` with an added "label"
        entry holding the ids produced by ``labels.str2int``.
    """
    # No padding here: the DataCollatorWithPadding given to the Trainer pads
    # every batch when it is collated, so padding whole map() batches up front
    # only adds pad tokens that are never needed.
    encoded = tokenizer(examples["text"], truncation=True)
    # Return the converted labels explicitly instead of mutating the input
    # batch, so the result does not depend on map() keeping in-place edits.
    encoded["label"] = labels.str2int(examples["label"])
    return encoded


# Tokenize both splits (train first, then test) with the batched map function
tokenized_train, tokenized_test = (
    dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "test")
)

# Collator the Trainer uses to pad the examples of each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained checkpoint with a three-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Hyperparameters and run configuration, grouped by concern
training_args = TrainingArguments(
    # Local output directory
    output_dir="distilBERT_market_finetuning",
    # Optimisation
    learning_rate=3e-5,
    weight_decay=0.5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Training length
    # NOTE(review): a positive max_steps is documented to take precedence over
    # num_train_epochs — confirm which of the two is intended.
    num_train_epochs=1,
    max_steps=200,
    # Evaluation, logging and checkpoint cadence, all expressed in steps
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=1,
    save_steps=0,
    # Hugging Face Hub upload
    push_to_hub=True,
    hub_strategy="end",
    hub_private_repo=True,
)


# Evaluation metrics
# Metric objects that have already been loaded, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    # Named `references` so the module-level ClassLabel `labels` is not shadowed.
    logits, references = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # The metrics are fetched through the cache: this function runs on every
    # evaluation, and reloading both metrics each time is wasted work.
    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=references
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=references, average="weighted"
    )["f1"]

    return {"accuracy": accuracy, "f1": f1}


# Assemble the Trainer from the model, arguments, data and metric function
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

relative_return
negative    883
neutral     883
positive    883
dtype: int64


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2119 [00:00<?, ? examples/s]

Map:   0%|          | 0/530 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/c-b123/distilBERT_market_finetuning into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/255M [00:00<?, ?B/s]

Download file runs/Jul28_13-50-18_6dce4a6b82a9/events.out.tfevents.1690552602.6dce4a6b82a9.380.3: 100%|#######…

Download file runs/Jul28_13-50-18_6dce4a6b82a9/events.out.tfevents.1690552224.6dce4a6b82a9.380.2:  51%|#####1 …

Clean file runs/Jul28_13-50-18_6dce4a6b82a9/events.out.tfevents.1690552602.6dce4a6b82a9.380.3: 100%|##########…

Download file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file runs/Jul28_13-50-18_6dce4a6b82a9/events.out.tfevents.1690552224.6dce4a6b82a9.380.2:   3%|3         …

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [None]:
# Fine-tune model
trainer.train()

# Evaluate model; print the metrics explicitly, because a notebook cell only
# displays the value of its last expression
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Upload the model to the Hub
trainer.push_to_hub()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
10,1.0944,1.097753,0.328302,0.193327
20,1.0748,1.090894,0.381132,0.31565
30,1.081,1.082554,0.390566,0.388976
40,1.0659,1.078584,0.407547,0.40679
50,1.024,1.077019,0.416981,0.375256
60,0.9653,1.085861,0.426415,0.425793
70,0.9507,1.084395,0.426415,0.418724
80,0.9474,1.089023,0.426415,0.426613
90,0.8516,1.107321,0.432075,0.424423
100,0.8052,1.124812,0.430189,0.426162


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 1.00/255M [00:00<?, ?B/s]

Upload file runs/Jul28_14-23-57_da45f4daef29/events.out.tfevents.1690554692.da45f4daef29.381.1:   0%|         …

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

Upload file runs/Jul28_14-23-57_da45f4daef29/events.out.tfevents.1690554318.da45f4daef29.381.0:   0%|         …

To https://huggingface.co/c-b123/distilBERT_market_finetuning
   e4b2454..8c89b48  main -> main

   e4b2454..8c89b48  main -> main

To https://huggingface.co/c-b123/distilBERT_market_finetuning
   8c89b48..2b9a390  main -> main

   8c89b48..2b9a390  main -> main



'https://huggingface.co/c-b123/distilBERT_market_finetuning/commit/8c89b48dbb4ae7aa68e63b24a2ca5f05ac764c31'