In [3]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers evaluate
    !pip install accelerate -U

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Change working directory to ex1
    %cd /content/drive/MyDrive/LLM4behavior_workshop/ex1

# Processing data

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [4]:
# Read the health data from the CSV file in the current working directory into a pandas DataFrame
dat = pd.read_csv('health.csv')
dat # Display the DataFrame to inspect its contents

Unnamed: 0,text,labels
0,Broken leg. A broken leg (leg fracture) will b...,49.333333
1,Bulimia. Bulimia is an eating disorder and men...,34.181818
2,Hyperacusis. Hyperacusis is when everyday soun...,53.818182
3,DVT. DVT (deep vein thrombosis) is a blood clo...,12.800000
4,Ectopic pregnancy. An ectopic pregnancy is whe...,31.700000
...,...,...
772,Typhoid fever. Typhoid fever is a bacterial in...,27.900000
773,Ankylosing spondylitis. Ankylosing spondylitis...,30.800000
774,Sleepwalking. Sleepwalking is when someone wal...,71.181818
775,Fits. If you see someone having a seizure or f...,34.111111


In [5]:
# Convert the pandas DataFrame to a Hugging Face Dataset.
# NOTE: this rebinds `dat`, so re-running this cell without first re-running the
# CSV-loading cell would pass a Dataset (not a DataFrame) to `from_pandas`.
dat = Dataset.from_pandas(dat)
dat

Dataset({
    features: ['text', 'labels'],
    num_rows: 777
})

In [6]:
# Inspect the first sample of the dataset
dat[0]

{'text': 'Broken leg. A broken leg (leg fracture) will be severely painful and may be swollen or bruised. You usually will not be able to walk on it.If it\'s a severe fracture, the leg may be an odd shape and the bone may even be poking out of the skin. There may have been a "crack" sound when the leg was broken, and the shock and pain of breaking your leg may cause you to feel faint, dizzy or sick.',
 'labels': 49.33333333}

In [7]:
# Checkpoint name; the same name is used to load the tokenizer and the models
model_ckpt = 'distilbert-base-uncased'

# Load the tokenizer that belongs to the checkpoint and report its vocabulary size and maximum input length
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

Vocabulary size: 30522, max context length: 512


In [8]:
def batch_tokenizer(batch):
    """Tokenize the `text` field of a batch, padding to the maximum length and truncating longer samples."""
    return tokenizer(batch['text'], padding="max_length", truncation=True)

# Apply the tokenizer to the whole dataset in batches, then inspect the first sample
dat = dat.map(batch_tokenizer, batched=True)
dat[0]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

{'text': 'Broken leg. A broken leg (leg fracture) will be severely painful and may be swollen or bruised. You usually will not be able to walk on it.If it\'s a severe fracture, the leg may be an odd shape and the bone may even be poking out of the skin. There may have been a "crack" sound when the leg was broken, and the shock and pain of breaking your leg may cause you to feel faint, dizzy or sick.',
 'labels': 49.33333333,
 'input_ids': [101,
  3714,
  4190,
  1012,
  1037,
  3714,
  4190,
  1006,
  4190,
  19583,
  1007,
  2097,
  2022,
  8949,
  9145,
  1998,
  2089,
  2022,
  13408,
  2030,
  18618,
  1012,
  2017,
  2788,
  2097,
  2025,
  2022,
  2583,
  2000,
  3328,
  2006,
  2009,
  1012,
  2065,
  2009,
  1005,
  1055,
  1037,
  5729,
  19583,
  1010,
  1996,
  4190,
  2089,
  2022,
  2019,
  5976,
  4338,
  1998,
  1996,
  5923,
  2089,
  2130,
  2022,
  21603,
  2041,
  1997,
  1996,
  3096,
  1012,
  2045,
  2089,
  2031,
  2042,
  1037,
  1000,
  8579,
  1000,
  2614,
  

In [9]:
# Format the `input_ids`, `attention_mask` and `labels` columns as torch tensors for input to the model
dat.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dat

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 777
})

# Loading the model for feature extraction

In [10]:
import torch
torch.manual_seed(42) # Seed torch's random number generator for reproducibility
from transformers import AutoModel

In [11]:
# Pick the compute device in order of preference:
# CUDA (NVIDIA GPUs), then MPS (Apple Metal Performance Shaders), otherwise the CPU
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device

device(type='mps')

In [13]:
# Load distilbert-base-uncased via `AutoModel` (used for feature extraction) and move it to the selected device
model = AutoModel.from_pretrained(model_ckpt).to(device)
f'Model inputs: {tokenizer.model_input_names}'

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


"Model inputs: ['input_ids', 'attention_mask']"

In [14]:
def extract_features(batch):
    """Return the last-layer hidden state of the first ([CLS]) token for every sample in `batch`.

    `batch` is a dictionary keyed by column name. Only the entries the model
    accepts (as listed in `tokenizer.model_input_names`) are forwarded to it.
    The result is a dict with a single `hidden_state` entry holding a numpy array.
    """
    # Select the model inputs and move them onto the same device as the model
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Position 0 along the sequence axis is the [CLS] token
    cls_hidden = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_hidden.cpu().numpy()}

# Run feature extraction over the dataset in batches of 8 samples to avoid running out of memory.
# The `hidden_state` entry returned by `extract_features` becomes a new column of `dat`.
dat = dat.map(extract_features, batched=True, batch_size=8)
dat['hidden_state'].shape  # (number of samples, hidden size)

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

torch.Size([777, 768])

# Predicting health perception with extracted features

In [15]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

In [16]:
# Collect the extracted [CLS] hidden states in a DataFrame: one row per sample, one column per hidden dimension
features = pd.DataFrame(dat['hidden_state'])
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.148475,-0.093959,-0.072606,-0.116719,-0.197698,-0.017892,0.583106,0.199220,-0.294927,-0.524650,...,0.132813,-0.284638,0.341142,-0.053502,0.222674,0.264824,-0.127856,0.016527,-0.165402,0.297364
1,-0.188410,-0.126732,-0.108507,-0.293917,-0.218685,0.010479,0.353766,0.414596,-0.126797,-0.443795,...,-0.127295,-0.445322,0.027356,-0.416753,-0.055470,0.358791,-0.196915,-0.088872,-0.025871,0.366596
2,-0.171668,-0.050836,-0.194084,-0.312468,-0.174100,-0.011414,0.494835,0.385607,-0.115327,-0.769145,...,0.038155,-0.398147,0.033521,-0.147881,0.203515,0.511178,-0.293286,-0.232812,-0.022038,0.263111
3,-0.338618,-0.057856,0.033038,-0.357180,-0.202168,0.036766,0.324142,0.252396,-0.024016,-0.509216,...,0.136020,-0.368171,0.198924,-0.247393,0.010923,0.367897,-0.227436,-0.007300,0.000287,0.664291
4,-0.098636,-0.401457,-0.182545,-0.179665,0.064003,0.001181,0.291596,0.541527,-0.020205,-0.360373,...,-0.063380,-0.327838,0.088122,-0.291927,-0.019845,0.092592,0.024323,-0.213633,0.043224,0.738061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,-0.062923,-0.015915,-0.222807,-0.208136,-0.150941,-0.284561,0.377012,0.315895,-0.106533,-0.588246,...,0.030073,-0.439836,0.044907,-0.227704,-0.230707,0.228135,-0.225799,0.025129,0.063843,0.814006
773,-0.279977,-0.197014,0.098390,-0.303245,-0.296704,0.008300,0.288116,0.194376,-0.128856,-0.353388,...,-0.026887,-0.458937,0.206063,-0.471932,0.051058,0.154827,-0.420223,0.196191,0.028353,0.592685
774,-0.073572,-0.369269,0.076614,-0.097476,-0.064454,-0.054622,0.647633,0.325533,-0.363346,-0.434460,...,-0.008730,-0.349660,0.011828,-0.398827,-0.018842,0.212129,0.035592,-0.216679,-0.086199,0.413299
775,-0.181972,-0.083371,0.113803,-0.065226,0.033920,-0.130653,0.366495,0.114534,-0.174858,-0.477767,...,-0.016639,-0.359916,0.018970,-0.175317,0.033933,0.393584,-0.225063,-0.109431,0.165128,0.303420


In [17]:
# Initialize a cross-validated ridge regression with its default settings
regr = RidgeCV()

# Hold out 20% of the samples as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, dat['labels'], test_size=.2, random_state=42)
f'Train size: {len(X_train)}, test size: {len(X_test)}'

'Train size: 621, test size: 156'

In [18]:
# Fit on the training split, then report the score (R2) on the held-out test split, rounded to 2 decimals
regr.fit(X_train, y_train)
f'Test R2 = {regr.score(X_test, y_test).round(2)}'

'Test R2 = 0.54'

# Predicting health perception with fine-tuning

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [13]:
# Split the dataset into train (80%) and test (20%) sets with a fixed seed.
# NOTE(review): this rebinds `dat` from a single Dataset to a DatasetDict with
# `train`/`test` entries, so re-running this cell on the result presumably fails —
# rebuild `dat` from the earlier cells before running it again.
dat = dat.train_test_split(test_size=.2, seed=42)
dat

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 621
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 156
    })
})

In [15]:
# Sanity check: confirm the type of the training split
type(dat['train'])

datasets.arrow_dataset.Dataset

In [21]:
# Load distilbert-base-uncased with a sequence-classification head and move it to the selected device.
# NOTE: this rebinds the global `model` that `extract_features` reads, so calling
# `extract_features` after this cell would run this model instead of the `AutoModel` one.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=1) # num_labels=1 gives a single output, used here for regression
         .to(device))

model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Set up the training arguments for the trainer
model_name = f"{model_ckpt}-finetuned-health"
batch_size = 8  # used for both training and evaluation
training_args = TrainingArguments(
    output_dir=model_name,  # output directory to save training checkpoints
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch", # log training metrics at every epoch
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch", # evaluate at the end of every epoch
    num_train_epochs=10, # number of times to iterate over the training data
)


def compute_metrics(eval_preds):
    """Computes the coefficient of determination (R2) on the evaluation set.

    Args:
        eval_preds: Pair `(predictions, labels)` as handed over by the `Trainer`.
            For a model with a single output the predictions carry a trailing
            singleton dimension, i.e. shape `(n_samples, 1)`, whereas the
            labels have shape `(n_samples,)`.

    Returns:
        dict: `{"r_squared": value}` where `value` is whatever the
        `r_squared` metric returns for the flattened predictions and labels.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    metric = evaluate.load("r_squared")
    preds, labels = eval_preds
    # Flatten both arrays to 1-D so that predictions and references line up
    # element-wise; an (n, 1) prediction array does not match the (n,) labels.
    preds = np.asarray(preds).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return {"r_squared": metric.compute(predictions=preds, references=labels)}


# Instantiate the trainer: fine-tune `model` on the train split and evaluate on the test split,
# reporting the metrics returned by `compute_metrics`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dat['train'],
    eval_dataset=dat['test'],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

