In [1]:
# system stuff
import sys
import os

# the usual
import pandas as pd

# model stuff 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline

# my stuff (abstracted non-important functions)
# Get the project root (one level up from notebooks)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.prepare_data import get_labels_list, create_train_test_dataframes, tokenize_data, create_hf_dataset
from src.evaluate_data import compute_metrics, classify_batch, classify_batch_llm, all_right, right_plus_extra, added_one, missed_one
from src.config import data_path

In [2]:
# how many cores do I have to play with?
os.cpu_count()

12

In [3]:
import psutil

ram = psutil.virtual_memory()
print(f"Total RAM: {ram.total / 1e9:.2f} GB")
print(f"Available RAM: {ram.available / 1e9:.2f} GB")

Total RAM: 34.07 GB
Available RAM: 15.30 GB


## Read in Data and Preprocess

* Read in data
* Create a column for use in training
* Create a small 'training' set to mimic a human categorizing a small subset of survey responses
* Convert to hugging face datasets for smoother processing
* Create lists of labels to use in various models

In [4]:
df = pd.read_csv(data_path)
labels_original = list(df.columns)[1:]
df.columns = [x.lower().replace(' ','_') for x in df.columns]

In [5]:
# we'll use the original labels for the LLM classifier, but mostly we want nice to work with column names 
labels_original

['Response',
 'Data Scientist',
 'Data Engineer',
 'Data Analyst',
 'ML Engineer',
 'Other']

In [6]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0
2,System Administrator,0,0,0,0,1
3,IT Support Specialist,0,0,0,0,1
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0


In [7]:
# convert to a multi labeled dataset for use with huggingface
df['label'] = df.apply(get_labels_list, axis=1)

In [8]:
# create list of categories
labels = list(df.columns[1:6])
display(labels)

# create id2labels to use in the modelling step
id2label = {ii: label for ii, label in enumerate(labels)}
label2id = {label: idx for idx, label in id2label.items()}

['data_scientist', 'data_engineer', 'data_analyst', 'ml_engineer', 'other']

In [9]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]"
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]"


In [10]:
# let's chunk off the majority of this data to never be used in training
# this most matches what we would like our process to look like
n_train = 1_000
df_train, df_test = create_train_test_dataframes(df, n_train=n_train)

In [11]:
# use distillbert to tokenize the data (a smaller version of bert)
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [12]:
# convert from pandas to a huggine face dataset to best utilize their tools
# this will also tokenize the data
dataset_train = create_hf_dataset(df_train, tokenizer)
dataset_test = create_hf_dataset(df_test, tokenizer)
dataset = create_hf_dataset(df, tokenizer)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [13]:
# train test split the data for model validation
# the 'test' data from this step will be used as validation data in the modelling step
dataset_train = dataset_train.train_test_split(test_size = 0.3)

In [14]:
dataset

Dataset({
    features: ['response', 'label', 'input_ids', 'attention_mask'],
    num_rows: 50000
})

## Set up a model with pre-trained data

* Using hugging face open source models as our starting point
* Set up a trainer where we can play with the number of devices to use during training
* Train the model
* Evaluate accuracy (and test how long this step takes)
* Save the model
* Classify our entire dataset (and test how long this step takes)

In [28]:
# set up a model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", 
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# set up trainer
model_path = '../models/my_test_model'
training_args = TrainingArguments(
    
    # save params
    output_dir=model_path, # save final model
    eval_strategy="epoch",        # make evaluations at end of each epoch
    save_strategy='epoch',        # save checkpoints every epoch
    load_best_model_at_end=True,   # save best model at end
    
    # learning params
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    
    # calibrate machine params 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    
    # logs
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics # computes accuracy and f1 score
)
    

In [30]:
# train our model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3811,0.330092,0.516667,0.8463
2,0.2097,0.188021,0.886667,0.963602
3,0.1535,0.1426,0.97,0.991288


TrainOutput(global_step=525, training_loss=0.32123843147641135, metrics={'train_runtime': 1507.6411, 'train_samples_per_second': 1.393, 'train_steps_per_second': 0.348, 'total_flos': 278196420096000.0, 'train_loss': 0.32123843147641135, 'epoch': 3.0})

In [31]:
# save (if didn't earlier)
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('../models/my_test_model\\tokenizer_config.json',
 '../models/my_test_model\\special_tokens_map.json',
 '../models/my_test_model\\vocab.txt',
 '../models/my_test_model\\added_tokens.json',
 '../models/my_test_model\\tokenizer.json')

In [None]:
# # note: you can reload this model either from a checkpoint or from the final saved model
# # to continue training (back into a new Trainer() instance)

# # examples:

# # 1. CHECKPOINT 
# # load from checkpoint
# checkpoint = f'{model_path}/checkpoint-500'
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # re-initialize trainer
# trainer = Trainer(
#     model=model,
#     args=training_args, # same as above
#     train_dataset=dataset['train'],
#     eval_dataset=dataset['test']
# )

# # resume - this preservers optimizer states, learning rate scheduler, and epoch counters
# trainer.train(resume_from_checkpoint=True)

# # 2. NEW TRAINING FROM SAVED MODEL
# # Load the saved model (not a checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # New training arguments (can modify as needed)
# new_training_args = TrainingArguments(
#     output_dir="./continued_training",
#     # update save params, learning hyperparams, machine params, log params as you want here
# )

# # Re-initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=new_training_args,
#     train_dataset=new_train_dataset,  # Can be the same or new dataset
#     eval_dataset=new_val_dataset
# )

# # Start training from the loaded model
# trainer.train()


In [32]:
# evaluate
results_train = trainer.evaluate(eval_dataset=dataset_train)
results_test = trainer.evaluate(eval_dataset=dataset_test)
print(results_train)
print(results_test)

{'eval_train_loss': 0.14308708906173706, 'eval_train_accuracy': 0.9742857142857143, 'eval_train_f1': 0.9921063564603241, 'eval_train_runtime': 93.8062, 'eval_train_samples_per_second': 7.462, 'eval_train_steps_per_second': 1.866, 'epoch': 3.0, 'eval_test_loss': 0.14260007441043854, 'eval_test_accuracy': 0.97, 'eval_test_f1': 0.9912875121006777, 'eval_test_runtime': 40.1353, 'eval_test_samples_per_second': 7.475, 'eval_test_steps_per_second': 1.869}
{'eval_loss': 0.14651787281036377, 'eval_accuracy': 0.9650816326530612, 'eval_f1': 0.9893526582684842, 'eval_runtime': 6775.7546, 'eval_samples_per_second': 7.232, 'eval_steps_per_second': 1.808, 'epoch': 3.0}


In [33]:
# inference
my_classifier = pipeline(
    'text-classification', 
    model=model_path, 
    top_k=None, # needed for multi label 
    device=-1    # will use a GPU if available and set to 0
)

# note this did poorly on 'not', but there are options for dealing with that
text = 'I am not a data scientist'

# directly trained on
text = 'ML Ops Specialist, ML Engineer'

# unrelated 
text = "I'm a librarian. Why'd I get this survey?"

out = my_classifier(text)
out

Device set to use cpu


[[{'label': 'other', 'score': 0.8851000070571899},
  {'label': 'data_scientist', 'score': 0.08201117068529129},
  {'label': 'ml_engineer', 'score': 0.050856105983257294},
  {'label': 'data_analyst', 'score': 0.04202040284872055},
  {'label': 'data_engineer', 'score': 0.03888040408492088}]]

In [34]:
# size of model
n_params = my_classifier.model.num_parameters()
n_mb = n_params * 4 / (1024**2)
print(f'Params:    {n_params:,}\nAppx Size: {n_mb}')

Params:    66,957,317
Appx Size: 255.42189407348633


In [35]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]"
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]"


In [36]:
# classify across the entire dataset
threshold = 0.5
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use 

# Apply inference in parallel
result_dataset = dataset.map(
    classify_batch,
    fn_kwargs={'classifier': my_classifier, 'threshold': threshold, 'suffix':'_bert'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

Map (num_proc=4):   0%|          | 0/50000 [00:00<?, ? examples/s]

In [37]:
# back to dataframe for later analysis
results_df = result_dataset.to_pandas().drop(['response', 'label', 'input_ids', 'attention_mask'], axis=1)

In [38]:
df_results = df.merge(results_df, left_index=True, right_index=True)
df_results.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label,data_scientist_bert,data_engineer_bert,data_analyst_bert,ml_engineer_bert,other_bert
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]",1,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]",1,1,0,1,0


## Set up a model with NO pre-trained data

* Uses an LLM to classify data into a given set of categories
* No pre-training required (but probably means worse outputs)
* When setting up the pipeline can play with device value I think to make it faster/optimize our space a bit better

In [17]:
# next build in the facebook method 
labels_original

# zero shot classifier for non-trained data
llm_classifier = pipeline(
    'zero-shot-classification', 
    model='facebook/bart-large-mnli', 
    num_workers=0, # for debugging crashing in the DSVM (disables multiprocessing)
    device=-1 # 0 for GPUs I think?
)

# a not option
text = 'I am not a data scientist'

# directly trained on
# text = 'ML Ops Specialist, ML Engineer'

# unrelated 
# text = "I'm a librarian. Why'd I get this survey?"

out = llm_classifier(
    text, 
    # extra inputs for the LLM classifier
    labels_original, 
    multi_label=True
)
out

Device set to use cpu


{'sequence': 'I am not a data scientist',
 'labels': ['Other',
  'Response',
  'ML Engineer',
  'Data Scientist',
  'Data Engineer',
  'Data Analyst'],
 'scores': [0.9456850290298462,
  0.45185422897338867,
  0.002592480042949319,
  0.001965784467756748,
  0.001021318370476365,
  0.0006961253238841891]}

In [40]:
# size of model
n_params = llm_classifier.model.num_parameters()
n_mb = n_params * 4 / (1024**2)
print(f'Params:    {n_params:,}\nAppx Size: {n_mb}')

Params:    407,344,131
Appx Size: 1553.8945426940918


In [20]:
# do across entire dataset
threshold = 0.5
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use (works on workstation)
num_proc = 1 # crashes for the basic DSVM, so testing different options

# Apply inference in parallel
result_dataset = dataset.map(
    classify_batch_llm,
    fn_kwargs={'classifier': llm_classifier, 'labels': labels_original, 'threshold': threshold, 'suffix':'_llm'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [21]:
# back to dataframe for later analysis
results_df = result_dataset.to_pandas().drop(['response', 'label', 'input_ids', 'attention_mask'], axis=1)
results_df.head()

Unnamed: 0,response_llm,data_scientist_llm,data_engineer_llm,data_analyst_llm,ml_engineer_llm,other_llm
0,1,0,0,0,1,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,1,1,1,0,1,0


In [None]:
df_results_complete = df_results.merge(results_df, left_index=True, right_index=True)
df_results_complete.head()

## Final Comparison of Outputs

* Look at how well the model did at:
  * Getting the outputs exactly matched to the initial data
  * Getting the outputs exactly matched, but added in an extra category
  * Adding at least one category that wasn't in initial data
  * Missing at least one category that wasn't in initial data

In [46]:
labels
labels_bert = [x+'_bert' for x in labels]

df_results_complete['all_right_bert'] = df_results_complete.apply(all_right, axis=1, args=(labels, labels_bert))
df_results_complete['right_plus_extra_bert'] = df_results_complete.apply(right_plus_extra, axis=1, args=(labels, labels_bert))
df_results_complete['added_one_bert'] = df_results_complete.apply(added_one, axis=1, args=(labels, labels_bert))
df_results_complete['missed_one_bert'] = df_results_complete.apply(missed_one, axis=1, args=(labels, labels_bert))

#labels_llm = [x+'_llm' for x in labels]
#df_results_complete['all_right_llm'] = df_results_complete.apply(all_right, axis=1, args=(labels, labels_llm))
#df_results_complete['right_plus_extra_llm'] = df_results_complete.apply(right_plus_extra, axis=1, args=(labels, labels_llm))
#df_results_complete['added_one_llm'] = df_results_complete.apply(added_one, axis=1, args=(labels, labels_llm))
#df_results_complete['missed_one_llm'] = df_results_complete.apply(missed_one, axis=1, args=(labels, labels_llm))

NameError: name 'df_results_complete' is not defined

In [None]:
df_summary = df_results_complete[
    [
        'all_right_bert', 'right_plus_extra_bert', 'added_one_bert', 'missed_one_bert',
        'all_right_llm',  'right_plus_extra_llm',  'added_one_llm',  'missed_one_llm'
    ]
].sum().reset_index()
df_summary.columns = ['metric', 'n']
df_summary['pct'] = df_summary['n']/len(df_results_complete)
df_summary

In [None]:
suffix = 'bert'
df_results_complete[~df_results_complete[f'all_right_{suffix}']][['response']+labels+[x+'_'+suffix for x in labels]]

In [None]:
suffix = 'llm'
df_results_complete[~df_results_complete[f'all_right_{suffix}']][['response']+labels+[x+'_'+suffix for x in labels]]