In [10]:
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_model, LoraConfig
import evaluate
import pandas as pd
from datasets import Dataset
import torch

In [3]:
# Reading in the .csv data
dat = pd.read_csv('health.csv')
dat # Inspecting the data

Unnamed: 0,text,labels
0,Broken leg. A broken leg (leg fracture) will b...,49.333333
1,Bulimia. Bulimia is an eating disorder and men...,34.181818
2,Hyperacusis. Hyperacusis is when everyday soun...,53.818182
3,DVT. DVT (deep vein thrombosis) is a blood clo...,12.800000
4,Ectopic pregnancy. An ectopic pregnancy is whe...,31.700000
...,...,...
772,Typhoid fever. Typhoid fever is a bacterial in...,27.900000
773,Ankylosing spondylitis. Ankylosing spondylitis...,30.800000
774,Sleepwalking. Sleepwalking is when someone wal...,71.181818
775,Fits. If you see someone having a seizure or f...,34.111111


In [6]:
# Convert pandas dataframe to HF Dataset
dat = Dataset.from_pandas(dat)
dat

Dataset({
    features: ['text', 'labels'],
    num_rows: 777
})

In [7]:
# Defining model checkpoint
model_ckpt = 'distilbert-base-uncased'

# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [8]:
# Function to tokenize a batch of samples
batch_tokenizer = lambda batch: tokenizer(batch['text'], padding="max_length", truncation=True)

#  Tokenizing the dataset
dat = dat.map(batch_tokenizer, batched=True)
dat[0]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

{'text': 'Broken leg. A broken leg (leg fracture) will be severely painful and may be swollen or bruised. You usually will not be able to walk on it.If it\'s a severe fracture, the leg may be an odd shape and the bone may even be poking out of the skin. There may have been a "crack" sound when the leg was broken, and the shock and pain of breaking your leg may cause you to feel faint, dizzy or sick.',
 'labels': 49.33333333,
 'input_ids': [101,
  3714,
  4190,
  1012,
  1037,
  3714,
  4190,
  1006,
  4190,
  19583,
  1007,
  2097,
  2022,
  8949,
  9145,
  1998,
  2089,
  2022,
  13408,
  2030,
  18618,
  1012,
  2017,
  2788,
  2097,
  2025,
  2022,
  2583,
  2000,
  3328,
  2006,
  2009,
  1012,
  2065,
  2009,
  1005,
  1055,
  1037,
  5729,
  19583,
  1010,
  1996,
  4190,
  2089,
  2022,
  2019,
  5976,
  4338,
  1998,
  1996,
  5923,
  2089,
  2130,
  2022,
  21603,
  2041,
  1997,
  1996,
  3096,
  1012,
  2045,
  2089,
  2031,
  2042,
  1037,
  1000,
  8579,
  1000,
  2614,
  

In [9]:
# Setting to torch format for input to model
dat.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dat

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 777
})

In [11]:
model_ckpt = 'distilbert-base-uncased'

# Loading the model and moving it to the GPU if available
if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # for Apple Metal Performance Sharder (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

In [19]:
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=1) # num_labels=1 for regression
         .to(device))

peft_config = LoraConfig(
    task_type='SEQ_CLS', inference_mode=False, 
    target_modules=['q_lin', 'k_lin', 'v_lin', 'out_lin', 'lin1', 'lin2']
)
peft_model = get_peft_model(model, peft_config)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7b9b2f72-0b36-4140-90d5-01a6ae5230d5)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-uncased/resolve/main/config.json
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

In [21]:
# Splitting the data into train and test sets
dat = dat.train_test_split(test_size=.2, seed=42)
dat

# Setting up training arguments for the trainer
model_name = f"{model_ckpt}-finetuned-health"
batch_size = 8
training_args = TrainingArguments(
    output_dir=model_name,  # output directory to save training checkpoints
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch", # log training metrics at every epoch
    evaluation_strategy="epoch", # evaluate at the end of every epoch
    num_train_epochs=10, # number of times to iterate over the training data
)


def compute_metrics(eval_preds):
    """Computes the coefficient of determination (R2) on the test set"""
    metric = evaluate.load("r_squared")
    preds, labels = eval_preds
    return {"r_squared": metric.compute(predictions=preds, references=labels)}


# Instantiating the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dat['train'],
    eval_dataset=dat['test'],
    compute_metrics=compute_metrics,
)

# Training the model
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 