In [13]:
from datasets import load_dataset, DatasetDict, load_metric
import pandas as pd
import numpy as np
import os
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification
from transformers import pipeline, Trainer
import pickle
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
# Load the tokenizer and the sequence-classification weights from the same
# 'distilbert-base-uncased-finetuned-sst-2-english' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english'
)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english'
)

# Wrap both in a ready-to-call sentiment-analysis pipeline.
classifier = pipeline(
    task='sentiment-analysis',
    tokenizer=tokenizer,
    model=model,
)

# Maximum sequence length reported by the tokenizer, coerced to a plain int.
max_tokens = int(tokenizer.model_max_length)

In [3]:
# Load the custom dataset used to fine-tune the model.
# NOTE(review): the original author flagged that this dataset's labels differ
# from the pre-trained model's (pos, neg) labels and that the fit still needs
# to be verified.
dataset = load_dataset('financial_phrasebank', 'sentences_66agree')

# Adapt the data to the pre-trained model's format: keep only the rows whose
# label id is even, which drops label 1.
# presumably 0/1/2 mean negative/neutral/positive - confirm against the dataset card
custom_data = dataset['train'].filter(lambda row: row['label'] % 2 == 0)

#remap label 2 to 1 so the remaining labels are 0 and 1
def add_prefix(example):
    """Shift a label of 2 down to 1, leaving every other label untouched.

    Despite its name, this function does not prefix anything: it only rewrites
    the ``'label'`` entry of ``example``.

    Args:
        example: mapping with a ``'label'`` entry; it is modified in place.

    Returns:
        The same ``example`` object that was passed in.
    """
    # Guard clause: nothing to do unless the label is exactly 2.
    if example['label'] != 2:
        return example
    example['label'] -= 1
    return example

# Apply the label remap to every row of the dataset.
# Afterwards 0 is negative and 1 is positive.
custom_data = custom_data.map(function=add_prefix)
        
#tokenizer for processing data
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` field of ``examples`` with the notebook's tokenizer.

    Both padding and truncation are switched on.

    Args:
        examples: mapping that provides the text under the ``"sentence"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded

# Tokenize every sentence (in batches), then hold out 10% of the rows for testing.
tokenized_datasets = custom_data.map(tokenize_function, batched=True)

# Seed the split explicitly: without a seed the rows are shuffled differently on
# every fresh run, so the train/test partition and the metrics computed on it
# would not be reproducible.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=0)

# Collect the two splits under the names used for training ('train') and
# evaluation ('test').
valid_set = DatasetDict({
    'train': tokenized_datasets['train'],
    'test': tokenized_datasets['test']})
#print(valid_set)

# Return only these three columns from the datasets, formatted as torch tensors.
columns_to_return = ['input_ids', 'label', 'attention_mask']
valid_set.set_format('torch', columns=columns_to_return)

# Padding collator built around the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

# Provisional setup for training the model on the new data and reporting accuracy.
# NOTE: training_args is reassigned with the full configuration in a later cell
# before the Trainer is built.
metric = load_metric(path="accuracy")
training_args = TrainingArguments(output_dir="test-trainer")

Reusing dataset financial_phrasebank (C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0\cache-4e0666b22dba4755.arrow
Loading cached processed dataset at C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0\cache-a1ade6443fd22d2a.arrow
Loading cached processed dataset at C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0\cache-49c96cd7749fcc96.arrow
Loading cached split indices for dataset at C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092bead39deb08fbf4bffd7c0a6991febbf0\cache-edbe7759452b8a26.arrow and C:\Users\Tony\.cache\huggingface\datasets\financial_phrasebank\sentences_66agree\1.0.0\a6d468761d4e0c8ae215c77367e1092be

In [4]:
#evaluation metrics: accuracy, precision, recall and F1
def compute_metrics(eval_pred):
    """Score a batch of predictions against their reference labels.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class for each row
            is the index of the largest logit along the last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each holding the value of the matching scikit-learn scorer.
    """
    logits, labels = eval_pred
    pred = np.argmax(logits, axis=-1)
    # Insertion order fixes the key order of the returned dict.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: score(y_true=labels, y_pred=pred) for name, score in scorers.items()}

In [9]:
# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # where the run writes its output
    output_dir="./output",
    # fixed seed for the training run
    seed=0,
    # length of training and batch sizes
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # evaluate every 500 steps and reload the best model once training ends
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)

In [10]:
# Fine-tune the model on the custom dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=valid_set['train'],
    eval_dataset=valid_set['test'],
    compute_metrics=compute_metrics,
)

# Kept as the last expression of the cell so the returned value is displayed.
trainer.train()

#TODO: verify that this fine-tuning is feasible and makes sense with the current pre-trained model

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 1513
  Num Epochs = 8
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 760


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.0571,0.164709,0.970414,0.982906,0.974576,0.978723


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence.
***** Running Evaluation *****
  Num examples = 169
  Batch size = 8
Saving model checkpoint to ./output\checkpoint-500
Configuration saved in ./output\checkpoint-500\config.json
Model weights saved in ./output\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./output\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./output\checkpoint-500\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./output\checkpoint-500 (score: 0.16470938920974731).


TrainOutput(global_step=760, training_loss=0.03764423972093745, metrics={'train_runtime': 1771.7222, 'train_samples_per_second': 6.832, 'train_steps_per_second': 0.429, 'total_flos': 250528967708160.0, 'train_loss': 0.03764423972093745, 'epoch': 8.0})

In [11]:
# Save the model locally. Models can be very large, so pushing them to
# Hugging Face's model hub is the recommended alternative.
trainer.save_model(output_dir='./model/')

Saving model checkpoint to ./model/
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json
