Copyright 2025 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.

In [None]:
# system stuff
import sys
import os

# the usual
import pandas as pd

# model stuff 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline

# extra scores
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score

# my stuff (abstracted non-important functions)
# Get the project root (one level up from notebooks)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.prepare_data import get_labels_list, create_train_test_dataframes, tokenize_data, create_hf_dataset
from src.evaluate_data import compute_metrics, classify_batch, classify_batch_llm, all_right, right_plus_extra, added_one, missed_one
from src.config import data_path_rvm, out_folder

In [None]:
# how many cores do I have to play with?
# (os.cpu_count() reports logical CPUs and can return None if the count is undetermined)
os.cpu_count()

In [None]:
import psutil

# snapshot of system memory; the sizes are divided by 1e9 below to print gigabytes
ram = psutil.virtual_memory()
print(f"Total RAM: {ram.total / 1e9:.2f} GB")
print(f"Available RAM: {ram.available / 1e9:.2f} GB")

## Read in Data and Preprocess

* Read in data
* Create a column for use in training
* Create a small 'training' set to mimic a human categorizing a small subset of survey responses
* Convert to hugging face datasets for smoother processing
* Create lists of labels to use in various models

In [None]:
# Load the 'Q07a' sheet and keep everything between the first four columns
# and the last one (both ends are discarded).
df = pd.read_excel(data_path_rvm, sheet_name='Q07a').iloc[:, 4:-1]

# Keep the category names exactly as they appear in the sheet before the
# headers are normalised (everything after the first remaining column).
labels_original = list(df.columns[1:])

# The first remaining column is renamed 'Response'; then every header is
# lower-cased and spaces, slashes and colons are turned into underscores.
df.columns = [
    header.lower().translate(str.maketrans(' /:', '___'))
    for header in ['Response', *df.columns[1:]]
]

In [None]:
# Recode the category columns (everything after the response text):
# an 'X' marks a selected category (-> 1); anything else, including NaN, becomes 0.
flags = df.iloc[:, 1:].eq('X').astype('int64')
df.iloc[:, 1:] = flags

# Add an 'other' column that is 1 when no category was marked on the row.
# Only the category columns are inspected, so a response whose value happens
# to equal 1 (e.g. a numeric cell in the spreadsheet) is not mistaken for a
# marked category.
df['other'] = (~flags.any(axis=1)).astype('int64')
df.head()

In [None]:
# drop rows with a missing response and renumber the index from 0
df = df.dropna(subset=['response']).reset_index(drop=True)

In [None]:
# The LLM classifier is given the original, human-readable labels; everywhere
# else we work with the normalised (easier to handle) column names.
# Add the catch-all 'Other' label only if it is not there yet, so re-running
# this cell does not append it a second time.
if 'Other' not in labels_original:
    labels_original = labels_original + ['Other']
labels_original

In [None]:
# preview the first rows of the cleaned frame
df.head()

In [None]:
# convert to a multi labeled dataset for use with huggingface:
# get_labels_list (from src.prepare_data) is applied to every row and its result is stored in 'label'
# NOTE(review): presumably this is the row's 0/1 category flags as a list - confirm in src/prepare_data.py
df['label'] = df.apply(get_labels_list, axis=1)

In [None]:
# category names: every column except the first (response text) and the last
# (the 'label' column, when the cells are run in order)
labels = df.columns[1:-1].tolist()
display(labels)

# position <-> name lookups handed to the model configuration later on
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in enumerate(labels)}

In [None]:
# preview the frame, now including the 'label' column
df.head()

In [None]:
# let's chunk off the majority of this data to never be used in training
# this most matches what we would like our process to look like
n_train = 1_000  # requested size of the training portion, passed to the project helper below
# NOTE(review): the test-row flagging further down merges df_test back on its index,
# so this assumes the helper keeps df's original index - confirm in src/prepare_data.py
df_train, df_test = create_train_test_dataframes(df, n_train=n_train)

In [None]:
# use DistilBERT to tokenize the data (a smaller version of BERT)
# the same checkpoint name is used when the classification model is created, so the vocabulary matches
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
# convert from pandas to a Hugging Face dataset to best utilize their tools
# this will also tokenize the data
dataset_train = create_hf_dataset(df_train, tokenizer)  # small hand-labelled sample used for fine-tuning
dataset_test = create_hf_dataset(df_test, tokenizer)    # held-out rows, never used in training
dataset = create_hf_dataset(df, tokenizer)              # every row; used for the full-dataset inference runs

In [None]:
# train test split the data for model validation
# the 'test' data from this step will be used as validation data in the modelling step
# after this line dataset_train is no longer a single dataset: it is indexed by 'train' / 'test'
# NOTE(review): no seed is passed, so the split may differ between runs - confirm whether that is intended
dataset_train = dataset_train.train_test_split(test_size = 0.3)

In [None]:
# show the structure of the dataset built from the full frame
dataset

## Set up a model with pre-trained data

* Using hugging face open source models as our starting point
* Set up a trainer where we can play with the number of devices to use during training
* Train the model
* Evaluate accuracy (and test how long this step takes)
* Save the model
* Classify our entire dataset (and test how long this step takes)

In [None]:
# set up a model for multi-label classification
# starts from the pre-trained DistilBERT checkpoint; one output per category in `labels`,
# and the id2label / label2id lookups are stored with the model configuration
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", 
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)

In [None]:
# set up trainer
# model_path is reused below as the save location and as the source for the inference pipeline
model_path = '../models/my_test_model_rvm'
training_args = TrainingArguments(
    
    # save params
    output_dir=model_path, # where checkpoints (and later the final model) are written
    eval_strategy="epoch",        # make evaluations at end of each epoch
    save_strategy='epoch',        # save checkpoints every epoch
    load_best_model_at_end=True,   # reload the best checkpoint once training finishes
    
    # learning params
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    
    # calibrate machine params (small batches to fit the available memory)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    
    # logs
    logging_dir="./logs",
    logging_steps=10
)

# fine-tune on the 'train' part of the small labelled sample and validate on its 'test' part
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics # computes accuracy and f1 score
)
    

In [None]:
# train our model (evaluates and checkpoints every epoch, per training_args)
trainer.train()

In [None]:
# save (if didn't earlier)
# the tokenizer is written to the same folder so the model can be loaded for inference from model_path alone
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# # note: you can reload this model either from a checkpoint or from the final saved model
# # to continue training (back into a new Trainer() instance)

# # examples:

# # 1. CHECKPOINT 
# # load from checkpoint
# checkpoint = f'{model_path}/checkpoint-500'
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # re-initialize trainer
# trainer = Trainer(
#     model=model,
#     args=training_args, # same as above
#     train_dataset=dataset['train'],
#     eval_dataset=dataset['test']
# )

# # resume - this preserves optimizer states, learning rate scheduler, and epoch counters
# trainer.train(resume_from_checkpoint=True)

# # 2. NEW TRAINING FROM SAVED MODEL
# # Load the saved model (not a checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # New training arguments (can modify as needed)
# new_training_args = TrainingArguments(
#     output_dir="./continued_training",
#     # update save params, learning hyperparams, machine params, log params as you want here
# )

# # Re-initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=new_training_args,
#     train_dataset=new_train_dataset,  # Can be the same or new dataset
#     eval_dataset=new_val_dataset
# )

# # Start training from the loaded model
# trainer.train()


In [None]:
# evaluate
# NOTE(review): dataset_train is the 'train'/'test' split at this point, not a single dataset;
# presumably the Trainer evaluates each part separately and prefixes the metric names - confirm
results_train = trainer.evaluate(eval_dataset=dataset_train)
# held-out rows that were never used for training or validation
results_test = trainer.evaluate(eval_dataset=dataset_test)
print(results_train)
print(results_test)

In [None]:
# inference
# build a pipeline from the model + tokenizer saved under model_path
my_classifier = pipeline(
    'text-classification', 
    model=model_path, 
    top_k=None, # needed for multi label (return a score for every label)
    device=-1    # -1 runs on CPU; set to 0 to use the first GPU if one is available
)

# some examples (uncomment one to try a different sentence)
text = 'Parking is too expensive'
#text = "I don't live here"
#text = 'The ferry from vancouver is too long'
#text = "I just don't like the museum"

out = my_classifier(text)
out

In [None]:
# size of model
n_params = my_classifier.model.num_parameters()
# rough size in MiB: 4 bytes per parameter (assumes 32-bit weights), divided by 1024**2
n_mb = n_params * 4 / (1024**2)
print(f'Params:    {n_params:,}\nAppx Size: {n_mb}')

In [None]:
# reminder of what the input frame looks like before running inference
df.head()

In [None]:
# classify across the entire dataset
threshold = 0.5  # passed to classify_batch together with the classifier
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use 

# Apply inference in parallel
# classify_batch (from src.evaluate_data) receives the classifier, threshold and column suffix via fn_kwargs
# NOTE(review): presumably it adds one '<label>_bert' column per category - confirm in src/evaluate_data.py
result_dataset = dataset.map(
    classify_batch,
    fn_kwargs={'classifier': my_classifier, 'threshold': threshold, 'suffix':'_bert'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

In [None]:
# back to dataframe for later analysis
# the text, label list and tokenizer outputs are dropped; they are already in df or not needed for analysis
results_df = result_dataset.to_pandas().drop(['response', 'label', 'input_ids', 'attention_mask'], axis=1)

In [None]:
# attach the predictions to the original rows by index
# NOTE(review): relies on results_df having the same row order / index as df - confirm
df_results = df.merge(results_df, left_index=True, right_index=True)
df_results.head()

## Set up a model with NO pre-trained data

* Uses an LLM to classify data into a given set of categories
* No pre-training required (but probably means worse outputs)
* When setting up the pipeline, the `device` value can (I think) be adjusted to make inference faster and make better use of the available hardware

In [None]:
# next build in the facebook method 
labels_original

# zero shot classifier for non-trained data
llm_classifier = pipeline(
    'zero-shot-classification', 
    model='facebook/bart-large-mnli', 
    num_workers=1, # for debugging crashing in the DSVM (disables multiprocessing)
    device=-1 # -1 runs on CPU; 0 for GPUs I think?
)

# some examples (uncomment one to try a different sentence)
#text = 'Parking is too expensive'
#text = "I don't live here"
#text = 'The ferry from vancouver is too long'
text = "I just don't like the museum"

out = llm_classifier(
    text, 
    # extra inputs for the LLM classifier: the candidate labels, scored independently of each other
    labels_original, 
    multi_label=True
)
out

In [None]:
# size of model
n_params = llm_classifier.model.num_parameters()
# rough size in MiB: 4 bytes per parameter (assumes 32-bit weights), divided by 1024**2
n_mb = n_params * 4 / (1024**2)
print(f'Params:    {n_params:,}\nAppx Size: {n_mb}')

In [None]:
# do across entire dataset
threshold = 0.5  # passed to classify_batch_llm together with the classifier
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use (works on workstation)
#num_proc = 1 # crashes for the basic DSVM, so testing different options

# Apply inference in parallel
# note: this overwrites the result_dataset produced by the fine-tuned model
# classify_batch_llm (from src.evaluate_data) is given the original label names and the '_llm' suffix
result_dataset = dataset.map(
    classify_batch_llm,
    fn_kwargs={'classifier': llm_classifier, 'labels': labels_original, 'threshold': threshold, 'suffix':'_llm'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

In [None]:
# Back to pandas for the analysis; the text, label list and tokenizer outputs
# are not needed here.
results_df = result_dataset.to_pandas().drop(
    columns=['response', 'label', 'input_ids', 'attention_mask']
)

# Apply the same header normalisation as the input frame: lower-case, with
# spaces, slashes and colons replaced by underscores.
results_df.columns = [
    header.lower().translate(str.maketrans(' /:', '___'))
    for header in results_df.columns
]
results_df.head()

In [None]:
# add the zero-shot predictions next to the fine-tuned ones, again matching rows by index
df_results_complete = df_results.merge(results_df, left_index=True, right_index=True)
df_results_complete.head()

## Final Comparison of Outputs

* Look at how well the model did at:
  * Getting the outputs exactly matched to the initial data
  * Getting the outputs exactly matched, but added in an extra category
  * Adding at least one category that wasn't in initial data
  * Missing at least one category that was in the initial data

* More technical scores:
  * Precision
  * Recall
  * F1 Score

These last 3 are calculated at the micro level, to align with the language studio scoring methods.
See [sklearn](https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html) for more details.

In [None]:
labels
# prediction columns use the ground-truth column names plus a model suffix
labels_bert = [name + '_bert' for name in labels]
labels_llm = [name + '_llm' for name in labels]

# row-wise outcome checks (from src.evaluate_data), listed in the order
# their result columns are created
outcome_checks = (
    ('all_right', all_right),
    ('right_plus_extra', right_plus_extra),
    ('added_one', added_one),
    ('missed_one', missed_one),
)

# one boolean/flag column per (outcome, model) pair: fine-tuned model first,
# then the zero-shot model
for model_tag, predicted_labels in (('bert', labels_bert), ('llm', labels_llm)):
    for outcome, check in outcome_checks:
        df_results_complete[f'{outcome}_{model_tag}'] = df_results_complete.apply(
            check, axis=1, args=(labels, predicted_labels)
        )

In [None]:
# Count how many rows fall into each outcome, per model, over the whole dataset.
outcome_columns = [
    'all_right_bert', 'right_plus_extra_bert', 'added_one_bert', 'missed_one_bert',
    'all_right_llm',  'right_plus_extra_llm',  'added_one_llm',  'missed_one_llm'
]
outcome_counts = df_results_complete.loc[:, outcome_columns].sum()

# One row per outcome: its name, the count, and the share of all rows.
df_summary = outcome_counts.rename_axis('metric').reset_index(name='n')
df_summary['pct'] = df_summary['n'].div(len(df_results_complete))
df_summary

In [None]:
# Micro-averaged F1 / precision / recall over the whole dataset, appended to
# the outcome summary. Ground truth and predictions are 0/1 matrices with one
# column per category.
y = df_results_complete[labels].values.astype('float')
y_bert = df_results_complete[labels_bert].values.astype('float')
y_llm = df_results_complete[labels_llm].values.astype('float')

# (metric prefix, sklearn scorer) in the order the rows should appear per model
scorers = (('f1', f1_score), ('prec', precision_score), ('recall', recall_score))
score_rows = [
    (f'{prefix}_{model_tag}', scorer(y, y_pred, average='micro'))
    for model_tag, y_pred in (('bert', y_bert), ('llm', y_llm))
    for prefix, scorer in scorers
]

extra = pd.DataFrame(
    {
        'metric': [metric for metric, _ in score_rows],
        # number of rows the scores were computed on
        'n': [len(df_results_complete)] * len(score_rows),
        # the score itself is stored in the 'pct' column to line up with the summary
        'pct': [score for _, score in score_rows],
    }
)

df_summary = pd.concat([df_summary, extra])
df_summary


In [None]:
# show the combined summary (outcome counts + micro scores) for the whole dataset
df_summary

In [None]:
# Repeat the comparison on held-out rows only: flag every row of the results
# whose index also appears in df_test.
tmp = df_test.assign(test=1)
df_results_complete = df_results_complete.merge(
    tmp[['test']], how='left', left_index=True, right_index=True
)
# rows without a match in df_test get 0 instead of NaN
df_results_complete['test'] = df_results_complete['test'].fillna(0)
df_results_complete.head()

In [None]:
# Outcome counts restricted to the rows flagged as test data.
is_test_row = df_results_complete['test'] == 1
test_outcome_counts = df_results_complete.loc[
    is_test_row,
    [
        'all_right_bert', 'right_plus_extra_bert', 'added_one_bert', 'missed_one_bert',
        'all_right_llm',  'right_plus_extra_llm',  'added_one_llm',  'missed_one_llm'
    ],
].sum()

# One row per outcome; the share is taken over the number of test rows
# (the sum of the 0/1 'test' flag).
df_summary_test = test_outcome_counts.rename_axis('metric').reset_index(name='n')
df_summary_test['pct'] = df_summary_test['n'].div(df_results_complete['test'].sum())
df_summary_test

In [None]:
# prec/f1/recall for test data
# Micro-averaged scores computed only on the rows flagged as test data.
test_rows = df_results_complete[df_results_complete['test']==1]
y = test_rows[labels].values.astype('float')
y_bert = test_rows[labels_bert].values.astype('float')
y_llm = test_rows[labels_llm].values.astype('float')

extra = pd.DataFrame(
    {'metric': ['f1_bert', 'prec_bert', 'recall_bert', 'f1_llm', 'prec_llm', 'recall_llm'], 
     # number of rows the scores are computed on: the test rows only,
     # not the size of the whole results frame
     'n': [len(test_rows)]*6, 
     # the score itself is stored in the 'pct' column to line up with the summary
     'pct': [
         f1_score(y, y_bert, average='micro'),
         precision_score(y, y_bert, average='micro'),
         recall_score(y, y_bert, average='micro'),
         f1_score(y, y_llm, average='micro'),
         precision_score(y, y_llm, average='micro'),
         recall_score(y, y_llm, average='micro')
     ]
    }
     )

df_summary_test = pd.concat([df_summary_test, extra])
df_summary_test

In [None]:
# easy to read results 
def create_legible(row, labels):
    """Return the flagged categories of a row as one comma-separated string.

    Args:
        row: a record indexed by column name (e.g. one DataFrame row).
        labels: the column names to inspect; a value equal to 1 means the
            category applies to this row.

    Returns:
        The flagged column names joined by ', ' with any '_bert' / '_llm'
        suffix removed, or "Other" when the joined text is empty.
    """
    candidates = row[labels]
    flagged = candidates.index[(candidates == 1).to_numpy()]
    readable = [name.removesuffix('_bert').removesuffix('_llm') for name in flagged]
    text = ', '.join(readable)
    return text if text else "Other"

# One readable column per source: the hand-coded answer, the fine-tuned model
# and the zero-shot model.
for readable_column, source_labels in (
    ('actual', labels),
    ('bert', labels_bert),
    ('llm', labels_llm),
):
    df_results_complete[readable_column] = df_results_complete.apply(
        create_legible, axis=1, args=(source_labels,)
    )

# Put the readable columns right after the response text. The [1:-3] slice
# skips the first column and the three columns that were just appended.
leading_columns = ['response', 'actual', 'bert', 'llm']
remaining_columns = df_results_complete.columns[1:-3].tolist()
df_results_complete = df_results_complete[leading_columns + remaining_columns]

df_results_complete.head()

In [None]:
# save results and summary for later consumption
# create the output folder first so the exports do not fail when it is missing
os.makedirs(out_folder, exist_ok=True)
# build the paths with os.path.join so a trailing separator on out_folder is harmless
df_results_complete.to_csv(os.path.join(out_folder, 'rbcm_q7_results.csv'), index=False)
df_summary.to_csv(os.path.join(out_folder, 'rbcm_q7_summary.csv'), index=False)
df_summary_test.to_csv(os.path.join(out_folder, 'rbcm_q7_summary_test.csv'), index=False)