In [None]:
import sys
# Colab-only setup: everything in this branch is skipped when the notebook runs elsewhere.
if 'google.colab' in sys.modules:  # True only when the google.colab module is already loaded
    # Install/upgrade the required packages (-U pulls the newest releases; versions are not pinned)
    !pip install datasets transformers evaluate accelerate -U

    # Mount Google Drive so that the data files stored there are accessible
    from google.colab import drive
    drive.mount('/content/drive')

    # Switch to the project folder on Drive so that relative paths (e.g. the CSV read later) resolve
    %cd /content/drive/MyDrive/LLM4BeSci/health

# Processing data

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
# Read the raw data from health.csv (the path is relative to the current working directory)
dat = pd.read_csv('health.csv')
dat  # last expression of the cell: displays the DataFrame

In [None]:
# Convert the pandas DataFrame into a Hugging Face Dataset.
# NOTE: this rebinds `dat`, so re-running this cell on its own passes the Dataset
# (not the original DataFrame) to `from_pandas`; re-run the CSV cell first.
dat = Dataset.from_pandas(dat)
dat

In [None]:
# Peek at the first record to check which fields the dataset contains
dat[0]

In [None]:
# Defining model checkpoint (reused below for the tokenizer and for both models)
model_ckpt = 'distilbert-base-uncased'

# Load the tokenizer that belongs to the checkpoint (the dataset itself is tokenized in a later cell)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Report the tokenizer's vocabulary size and the maximum sequence length it declares
print(f'Vocabulary size: {tokenizer.vocab_size}, max context length: {tokenizer.model_max_length}')

In [None]:
def batch_tokenizer(batch):
    """Tokenize the 'text' entries of a batch.

    Args:
        batch: Mapping with a 'text' key holding the strings to tokenize.

    Returns:
        The tokenizer output for the batch, padded to the longest sequence
        in the batch and truncated where a sequence is too long.
    """
    return tokenizer(batch['text'], padding=True, truncation=True)


# Tokenizing the dataset.
# batch_size=None passes the whole dataset to the tokenizer as a single batch, so every
# row is padded to the same length. With the default batch size, padding is computed
# per batch and rows from different batches could end up with different lengths, which
# breaks stacking them into one tensor later on.
dat = dat.map(batch_tokenizer, batched=True, batch_size=None)
dat[0]

In [None]:
# Return the listed columns as torch tensors when the dataset is indexed, ready for input to the model.
# NOTE(review): assumes the CSV provides a 'labels' column — confirm against health.csv.
dat.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dat

# Loading the model for feature extraction

In [None]:
import torch
torch.manual_seed(42)  # Fixed seed for torch's random number generation, for reproducibility
from transformers import AutoModel

In [None]:
# Choose the compute device in order of preference:
#   1. 'cuda' - NVIDIA GPUs
#   2. 'mps'  - Apple Metal Performance Shaders (Apple GPUs)
#   3. 'cpu'  - fallback when no GPU backend is available
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device

In [None]:
# Load the pretrained model for the checkpoint via AutoModel and move it to the selected device
model = AutoModel.from_pretrained(model_ckpt).to(device)
# Show the input names the tokenizer declares for the model
f'Model inputs: {tokenizer.model_input_names}'

In [None]:
def extract_features(batch):
    """Compute the [CLS] hidden state for every sample in a batch.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        batch: Dictionary mapping column names to tensors. Only the entries
            whose key is listed in `tokenizer.model_input_names` are passed
            to the model; everything else is ignored.

    Returns:
        A dictionary with a single key, "hidden_state", holding a NumPy array
        with the last-layer hidden state at sequence position 0 for each sample.
    """
    # Keep only the model inputs and place them on the same device as the model
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: `torch.no_grad()` stops torch from building the computation graph
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Position 0 of every sequence is the [CLS] token
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

# Extracting features. Features are extracted in batches of 8 samples to avoid running out of memory.
dat = dat.map(extract_features, batched=True, batch_size=8)

# Slice the rows first, then pick the column: `dat[:]` returns the formatted batch, so this
# is a tensor with a `.shape` whether `dat['hidden_state']` itself yields a tensor or a
# lazy column object (as recent `datasets` releases do).
dat[:]['hidden_state'].shape

# Predicting health perception with extracted features

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

In [None]:
# Converting the extracted hidden states to a pandas DataFrame
# (one row per sample, one column per hidden dimension).
# `dat[:]` materialises the formatted columns as tensors and `.numpy()` hands pandas a plain
# numeric array, so the result does not depend on how pandas or `datasets` treat tensor or
# lazy-column inputs.
features = pd.DataFrame(dat[:]['hidden_state'].numpy())
features

In [None]:
# Splitting the data into train and test sets (80/20, fixed random_state for a repeatable split).
# The labels are taken from the formatted batch `dat[:]` and converted to NumPy so that
# scikit-learn receives a plain array, whatever object `dat['labels']` returns in the
# installed `datasets` version.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    dat[:]['labels'].numpy(),
    test_size=.2,
    random_state=42,
)
f'Train size: {len(X_train)}, test size: {len(X_test)}'

In [None]:
# Scaling the data: the scaler is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initializing ridge regression; the penalty is chosen from the grid 1e-5 ... 1e6
ridge = RidgeCV(alphas=[10 ** i for i in range(-5, 7)])

# Fitting the model and evaluating performance.
# The built-in round() is used instead of the `.round()` method because `score` may return
# a plain Python float, which has no such method; round() works for NumPy scalars too.
ridge.fit(X_train, y_train)
f'Test R2 = {round(ridge.score(X_test, y_test), 2)}'

# Predicting health perception with fine-tuning

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [None]:
# Splitting the data into train and test sets (80/20, fixed seed).
# NOTE: `dat` is rebound to the result of `train_test_split`, which is indexed by
# 'train' / 'test' in the cells below; re-run the earlier cells before re-running this one.
dat = dat.train_test_split(test_size=.2, seed=42)
dat

In [None]:
# Check what kind of object each split is
type(dat['train'])

In [None]:
# Load distilbert-base-uncased wrapped for sequence classification.
# num_labels=1 requests a single output, which is the regression setting.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=1)

# Move the model to the GPU if one is available (otherwise it stays on the CPU)
model = model.to(device)

model

In [None]:
# Setting up training arguments for the trainer
import inspect

model_name = f"{model_ckpt}-finetuned-health"
batch_size = 8

# The `evaluation_strategy` argument was renamed to `eval_strategy` in newer transformers
# releases, and the old name was later removed. Pick whichever name the installed version
# accepts so the cell runs on both old and new versions.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,  # output directory to save training checkpoints
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy="epoch", # log training metrics at every epoch
    num_train_epochs=10, # number of times to iterate over the training data
    optim='adamw_torch', # optimizer to use
    **{eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)


def compute_metrics(eval_preds):
    """Computes the coefficient of determination (R2) on the test set.

    Args:
        eval_preds: Pair `(predictions, labels)` of arrays as passed in by the Trainer.

    Returns:
        Dictionary with a single key, "r_squared", holding the metric value.
    """
    metric = evaluate.load("r_squared")
    preds, labels = eval_preds
    # With a single-output model the predictions carry a trailing singleton dimension
    # (n_samples, 1) while the labels are (n_samples,). Flatten both so they line up
    # element-wise instead of relying on implicit conversion or broadcasting.
    preds = preds.reshape(-1)
    labels = labels.reshape(-1)
    return {"r_squared": metric.compute(predictions=preds, references=labels)}


# Instantiating the trainer with the model, the training arguments, both splits and the metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dat['train'],
    eval_dataset=dat['test'],  # split used for evaluation during training
    compute_metrics=compute_metrics,  # computes R2 from the evaluation predictions and labels
)

# Training the model (number of epochs and batch sizes come from `training_args`)
trainer.train()