In [1]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers evaluate accelerate -U

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Change working directory to health
    %cd /content/drive/MyDrive/LLM4BeSci/health

## Processing data

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [3]:
# Reading in the .csv data (`health.csv` is resolved relative to the current working directory)
dat = pd.read_csv('health.csv')
dat 

Unnamed: 0,text,labels
0,Broken leg. A broken leg (leg fracture) will b...,49.333333
1,Bulimia. Bulimia is an eating disorder and men...,34.181818
2,Hyperacusis. Hyperacusis is when everyday soun...,53.818182
3,DVT. DVT (deep vein thrombosis) is a blood clo...,12.800000
4,Ectopic pregnancy. An ectopic pregnancy is whe...,31.700000
...,...,...
772,Typhoid fever. Typhoid fever is a bacterial in...,27.900000
773,Ankylosing spondylitis. Ankylosing spondylitis...,30.800000
774,Sleepwalking. Sleepwalking is when someone wal...,71.181818
775,Fits. If you see someone having a seizure or f...,34.111111


In [4]:
# Convert the pandas dataframe to a Hugging Face (HF) Dataset.
# NOTE: this rebinds `dat`, so the dataframe is no longer reachable under that name.
dat = Dataset.from_pandas(dat)
dat

Dataset({
    features: ['text', 'labels'],
    num_rows: 777
})

In [5]:
# Inspecting the first example of the dataset
dat[0]

{'text': 'Broken leg. A broken leg (leg fracture) will be severely painful and may be swollen or bruised. You usually will not be able to walk on it.If it\'s a severe fracture, the leg may be an odd shape and the bone may even be poking out of the skin. There may have been a "crack" sound when the leg was broken, and the shock and pain of breaking your leg may cause you to feel faint, dizzy or sick.',
 'labels': 49.33333333}

In [6]:
# Defining model checkpoint (reused below when loading the model weights)
model_ckpt = 'distilbert-base-uncased'

# Loading the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Reporting the tokenizer's vocabulary size and the maximum sequence length it reports for the model
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [7]:
def batch_tokenizer(batch):
    """Tokenizes the `text` entries of a batch with padding and truncation enabled."""
    return tokenizer(batch['text'], padding=True, truncation=True)


# Applying the tokenizer to the dataset batch-wise; the tokenizer outputs are added as new columns
dat = dat.map(batch_tokenizer, batched=True)
dat[0]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

{'text': 'Broken leg. A broken leg (leg fracture) will be severely painful and may be swollen or bruised. You usually will not be able to walk on it.If it\'s a severe fracture, the leg may be an odd shape and the bone may even be poking out of the skin. There may have been a "crack" sound when the leg was broken, and the shock and pain of breaking your leg may cause you to feel faint, dizzy or sick.',
 'labels': 49.33333333,
 'input_ids': [101,
  3714,
  4190,
  1012,
  1037,
  3714,
  4190,
  1006,
  4190,
  19583,
  1007,
  2097,
  2022,
  8949,
  9145,
  1998,
  2089,
  2022,
  13408,
  2030,
  18618,
  1012,
  2017,
  2788,
  2097,
  2025,
  2022,
  2583,
  2000,
  3328,
  2006,
  2009,
  1012,
  2065,
  2009,
  1005,
  1055,
  1037,
  5729,
  19583,
  1010,
  1996,
  4190,
  2089,
  2022,
  2019,
  5976,
  4338,
  1998,
  1996,
  5923,
  2089,
  2130,
  2022,
  21603,
  2041,
  1997,
  1996,
  3096,
  1012,
  2045,
  2089,
  2031,
  2042,
  1037,
  1000,
  8579,
  1000,
  2614,
  

In [8]:
# Setting to torch format for input to model: the listed columns are returned as torch tensors.
# The `text` column stays in the dataset (see the feature list below) but is not part of the formatted columns.
dat.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dat

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 777
})

# Loading the model for feature extraction

In [9]:
import torch
torch.manual_seed(42) # Fixing torch's random seed for reproducibility
from transformers import AutoModel

In [10]:
# Selecting the device to run the model on: NVIDIA GPU if available, then Apple GPU, otherwise CPU.
# `torch.backends.mps` is missing from PyTorch builds that predate the MPS backend, so it is
# looked up defensively instead of being accessed directly (which would raise AttributeError).
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():  # for nvidia GPUs
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():  # for Apple Metal Performance Shaders (mps) GPUs
    device = torch.device('mps')
else:
    device = torch.device('cpu')

device

device(type='mps')

In [11]:
# Loading distilbert-base-uncased and moving it to the selected device (GPU if available)
model = AutoModel.from_pretrained(model_ckpt).to(device)
# Displaying the names of the inputs the tokenizer produces for the model
f'Model inputs: {tokenizer.model_input_names}'

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"Model inputs: ['input_ids', 'attention_mask']"

In [None]:
def extract_features(batch):
    """Runs the model on a batch and returns the hidden state at the first ([CLS]) position.

    Args:
        batch: dictionary mapping column names to tensors. Only the entries whose
            key appears in `tokenizer.model_input_names` are passed to the model.

    Returns:
        A dictionary with the single key "hidden_state", holding the last hidden
        state at sequence position 0 for every sample as a numpy array.
    """
    # Keep only the model inputs and move them to the same device as the model
    wanted = tokenizer.model_input_names
    model_inputs = {}
    for name, tensor in batch.items():
        if name in wanted:
            model_inputs[name] = tensor.to(device)

    # Inference only: `torch.no_grad()` stops torch from building the computation graph
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Position 0 of every sequence holds the [CLS] token
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

# Extracting features. Features are extracted in batches of 8 samples to avoid running out of memory.
# The values returned by `extract_features` are stored in a new `hidden_state` column.
dat = dat.map(extract_features, batched=True, batch_size=8)
# Displaying the dimensions of the extracted features
dat['hidden_state'].shape

# Predicting health perception with extracted features

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

In [None]:
# Converting the extracted hidden states to a pandas dataframe for use with scikit-learn
features = pd.DataFrame(dat['hidden_state'])
features

In [None]:
# Splitting the data into train and test sets (20% held out for testing);
# `random_state` is fixed so that the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(features, dat['labels'], test_size=.2, random_state=42)
# Displaying the number of samples in each split
f'Train size: {len(X_train)}, test size: {len(X_test)}'

In [None]:
# Scaling the data. The scaler is fitted on the training set only and then applied
# to both splits, so no statistics of the test set enter the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Initializing ridge regression; the penalty strength is chosen by
# cross-validation among the powers of ten from 10^-5 to 10^6
ridge = RidgeCV(alphas=[10 ** i for i in range(-5, 7)])

# Fitting the model and evaluating performance
ridge.fit(X_train, y_train)
# The built-in `round` is used rather than the `.round()` method: depending on the
# scikit-learn version, `score` returns a plain Python float, which has no `.round`
# method. The built-in works for both Python floats and NumPy scalars.
f'Test R2 = {round(ridge.score(X_test, y_test), 2)}'

# Predicting health perception with fine-tuning

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Splitting the data into train and test sets (20% held out for testing) with a fixed seed.
# NOTE: `dat` is rebound to the result of the split, which is indexed by 'train' and 'test' below.
dat = dat.train_test_split(test_size=.2, seed=42)
dat

In [None]:
# Checking the type of the training split
type(dat['train'])

In [None]:
# Loading distilbert-base-uncased for sequence classification and moving it to the selected device (GPU if available).
# NOTE: this rebinds `model`, replacing the feature-extraction model loaded earlier.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=1) # num_labels=1 for regression
         .to(device))

# Displaying the model architecture
model

In [None]:
# Setting up training arguments for the trainer
import inspect

model_name = f"{model_ckpt}-finetuned-health"
batch_size = 8

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer
# transformers releases, and the old name was eventually removed. The name accepted
# by the installed version is picked here so the cell runs on old and new releases.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,  # output directory to save training checkpoints
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch", # log training metrics at every epoch
    num_train_epochs=10, # number of times to iterate over the training data
    optim='adamw_torch', # optimizer to use
    **{_eval_strategy_arg: "epoch"}, # evaluate at the end of every epoch
)


from functools import lru_cache


@lru_cache(maxsize=1)
def _load_r_squared():
    """Loads the `r_squared` metric once so it is not reloaded at every evaluation."""
    return evaluate.load("r_squared")


def compute_metrics(eval_preds):
    """Computes the coefficient of determination (R2) on the test set.

    Args:
        eval_preds: pair `(predictions, labels)` as passed by the trainer.

    Returns:
        A dictionary with the single key "r_squared".
    """
    import numpy as np

    preds, labels = eval_preds
    # With `num_labels=1` the predictions carry a trailing singleton dimension,
    # i.e. shape (n, 1), whereas the labels have shape (n,). Both are flattened so
    # that they line up element-wise instead of being broadcast against each other.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return {"r_squared": _load_r_squared().compute(predictions=preds, references=labels)}


# Instantiating the trainer
trainer = Trainer(
    model=model,  # model to be fine-tuned
    args=training_args,  # training configuration defined above
    train_dataset=dat['train'],  # split used for training
    eval_dataset=dat['test'],  # split used for evaluation
    compute_metrics=compute_metrics,  # function computing R2 from the evaluation predictions
)

# Training the model
trainer.train()