In [1]:
# system stuff
import sys
import os

# the usual
import pandas as pd

# model stuff 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline

# my stuff (abstracted non-important functions)
# Get the project root (one level up from notebooks)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.prepare_data import get_labels_list, create_train_test_dataframes, tokenize_data, create_hf_dataset
from src.evaluate_data import compute_metrics, classify_batch, classify_batch_llm, all_right, right_plus_extra, added_one, missed_one
from src.config import data_path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(data_path)
labels_original = list(df.columns)
df.columns = [x.lower().replace(' ','_') for x in df.columns]

In [3]:
labels_original

['Response',
 'Data Scientist',
 'Data Engineer',
 'Data Analyst',
 'ML Engineer',
 'Other']

In [4]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0
2,System Administrator,0,0,0,0,1
3,IT Support Specialist,0,0,0,0,1
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0


In [5]:
# convert to a multi labeled dataset for use with huggingface
df['label'] = df.apply(get_labels_list, axis=1)

In [6]:
# create list of categories
labels = list(df.columns[1:6])
display(labels)

# create id2labels to use in the modelling step
id2label = {ii: label for ii, label in enumerate(labels)}
label2id = {label: idx for idx, label in id2label.items()}

['data_scientist', 'data_engineer', 'data_analyst', 'ml_engineer', 'other']

In [7]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]"
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]"


In [8]:
# let's chunk off the majority of this data to never be used in training
# this most matches what we would like our process to look like
n_train = 1_000
df_train, df_test = create_train_test_dataframes(df, n_train=n_train)

In [9]:
# use distillbert to tokenize the data (a smaller version of bert)
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [10]:
# convert from pandas to a huggine face dataset to best utilize their tools
# this will also tokenize the data
dataset_train = create_hf_dataset(df_train, tokenizer)
dataset_test = create_hf_dataset(df_test, tokenizer)
dataset = create_hf_dataset(df, tokenizer)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 5503.36 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49000/49000 [00:08<00:00, 5766.77 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:08<00:00, 5870.18 examples/s]


In [11]:
# train test split the data for model validation
# the 'test' data from this step will be used as validation data in the modelling step
dataset_train = dataset_train.train_test_split(test_size = 0.3)

In [12]:
dataset

Dataset({
    features: ['response', 'label', 'input_ids', 'attention_mask'],
    num_rows: 50000
})

In [13]:
# set up a model for multi-label classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", 
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# set up trainer
model_path = '../models/my_test_model'
training_args = TrainingArguments(
    
    # save params
    output_dir=model_path, # save final model
    eval_strategy="epoch",        # make evaluations at end of each epoch
    save_strategy='epoch',        # save checkpoints every epoch
    load_best_model_at_end=True,   # save best model at end
    
    # learning params
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    
    # calibrate machine params 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    
    # logs
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics # computes accuracy and f1 score
)
    

In [16]:
# train our model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3852,0.383795,0.356667,0.804196
2,0.2418,0.228149,0.803333,0.931734
3,0.1854,0.179069,0.883333,0.962547


TrainOutput(global_step=525, training_loss=0.33888329040436516, metrics={'train_runtime': 1646.458, 'train_samples_per_second': 1.275, 'train_steps_per_second': 0.319, 'total_flos': 278196420096000.0, 'train_loss': 0.33888329040436516, 'epoch': 3.0})

In [17]:
# save (if didn't earlier)
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('../models/my_test_model\\tokenizer_config.json',
 '../models/my_test_model\\special_tokens_map.json',
 '../models/my_test_model\\vocab.txt',
 '../models/my_test_model\\added_tokens.json',
 '../models/my_test_model\\tokenizer.json')

In [None]:
# # note: you can reload this model either from a checkpoint or from the final saved model
# # to continue training (back into a new Trainer() instance)

# # examples:

# # 1. CHECKPOINT 
# # load from checkpoint
# checkpoint = f'{model_path}/checkpoint-500'
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # re-initialize trainer
# trainer = Trainer(
#     model=model,
#     args=training_args, # same as above
#     train_dataset=dataset['train'],
#     eval_dataset=dataset['test']
# )

# # resume - this preservers optimizer states, learning rate scheduler, and epoch counters
# trainer.train(resume_from_checkpoint=True)

# # 2. NEW TRAINING FROM SAVED MODEL
# # Load the saved model (not a checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)

# # New training arguments (can modify as needed)
# new_training_args = TrainingArguments(
#     output_dir="./continued_training",
#     # update save params, learning hyperparams, machine params, log params as you want here
# )

# # Re-initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=new_training_args,
#     train_dataset=new_train_dataset,  # Can be the same or new dataset
#     eval_dataset=new_val_dataset
# )

# # Start training from the loaded model
# trainer.train()


In [18]:
# evaluate
results_train = trainer.evaluate(eval_dataset=dataset_train)
results_test = trainer.evaluate(eval_dataset=dataset_test)
print(results_train)
print(results_test)

{'eval_train_loss': 0.16505472362041473, 'eval_train_accuracy': 0.9342857142857143, 'eval_train_f1': 0.9805383022774327, 'eval_train_runtime': 126.4466, 'eval_train_samples_per_second': 5.536, 'eval_train_steps_per_second': 1.384, 'epoch': 3.0, 'eval_test_loss': 0.17906904220581055, 'eval_test_accuracy': 0.8833333333333333, 'eval_test_f1': 0.9625468164794008, 'eval_test_runtime': 54.4846, 'eval_test_samples_per_second': 5.506, 'eval_test_steps_per_second': 1.377}
{'eval_loss': 0.17288607358932495, 'eval_accuracy': 0.8989183673469388, 'eval_f1': 0.9692760350729123, 'eval_runtime': 8942.8914, 'eval_samples_per_second': 5.479, 'eval_steps_per_second': 1.37, 'epoch': 3.0}


In [19]:
# inference
my_classifier = pipeline(
    'text-classification', 
    model=model_path, 
    top_k=None, # needed for multi label 
    device=-1    # will use a GPU if available and set to 0
)

# note this did poorly on 'not', but there are options for dealing with that
text = 'I am not a data scientist'

# directly trained on
text = 'ML Ops Specialist, ML Engineer'

# unrelated 
text = "I'm a librarian. Why'd I get this survey?"

out = my_classifier(text)
out

Device set to use cpu


[[{'label': 'other', 'score': 0.9139354825019836},
  {'label': 'data_scientist', 'score': 0.04686310514807701},
  {'label': 'data_engineer', 'score': 0.04504388943314552},
  {'label': 'ml_engineer', 'score': 0.03771831467747688},
  {'label': 'data_analyst', 'score': 0.03612220659852028}]]

In [20]:
df.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]"
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]"
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]"
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]"


In [21]:
# classify across the entire dataset
threshold = 0.5
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use 

# Apply inference in parallel
result_dataset = dataset.map(
    classify_batch,
    fn_kwargs={'classifier': my_classifier, 'threshold': threshold, 'suffix':'_bert'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

Map (num_proc=4): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [10:23<00:00, 80.23 examples/s]


In [22]:
# back to dataframe
results_df = result_dataset.to_pandas().drop(['response', 'label', 'input_ids', 'attention_mask'], axis=1)

In [23]:
df_results = df.merge(results_df, left_index=True, right_index=True)
df_results.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label,data_scientist_bert,data_engineer_bert,data_analyst_bert,ml_engineer_bert,other_bert
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]",1,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]",1,1,0,1,0


In [24]:
# next build in the facebook method 
labels_original

# zero shot classifier for non-trained data
llm_classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

# a not option
text = 'I am not a data scientist'

# directly trained on
# text = 'ML Ops Specialist, ML Engineer'

# unrelated 
# text = "I'm a librarian. Why'd I get this survey?"

out = llm_classifier(
    text, 
    # extra inputs for the LLM classifier
    labels_original, 
    multi_label=True
)
out

Device set to use cpu


{'sequence': 'I am not a data scientist',
 'labels': ['Other',
  'Response',
  'ML Engineer',
  'Data Scientist',
  'Data Engineer',
  'Data Analyst'],
 'scores': [0.9456849694252014,
  0.45185229182243347,
  0.0025925145018845797,
  0.0019657453522086143,
  0.0010213007917627692,
  0.0006961148465052247]}

In [25]:
# do across entire dataset
threshold = 0.5
batch_size = 256 # adjust based on GPU/CPU capacity 
num_proc = 4  # number of CPU cores to use 

# Apply inference in parallel
result_dataset = dataset.map(
    classify_batch_llm,
    fn_kwargs={'classifier': llm_classifier, 'labels': labels_original, 'threshold': threshold, 'suffix':'_llm'},
    batched=True,             
    batch_size=batch_size,           
    num_proc=num_proc
)

Map (num_proc=4): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [7:35:24<00:00,  1.83 examples/s]


In [26]:
# back to dataframe
results_df = result_dataset.to_pandas().drop(['response', 'label', 'input_ids', 'attention_mask'], axis=1)
results_df.head()

Unnamed: 0,response_llm,data_scientist_llm,data_engineer_llm,data_analyst_llm,ml_engineer_llm,other_llm
0,1,0,0,0,1,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,1,1,1,0,1,0


In [27]:
df_results_complete = df_results.merge(results_df, left_index=True, right_index=True)
df_results_complete.head()

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,label,data_scientist_bert,data_engineer_bert,data_analyst_bert,ml_engineer_bert,other_bert,response_llm,data_scientist_llm,data_engineer_llm,data_analyst_llm,ml_engineer_llm,other_llm
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,"[1.0, 0.0, 0.0, 1.0, 0.0]",1,0,0,1,0,1,0,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,"[0.0, 0.0, 1.0, 0.0, 0.0]",0,0,1,0,0,0,0,0,0,0,1
2,System Administrator,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1,0,0,0,0,0,1
3,IT Support Specialist,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0]",0,0,0,0,1,0,0,0,0,0,1
4,"ML Engineer, ML Ops Specialist, Big Data Engin...",1,1,0,1,0,"[1.0, 1.0, 0.0, 1.0, 0.0]",1,1,0,1,0,1,1,1,0,1,0


In [28]:
labels
labels_bert = [x+'_bert' for x in labels]

df_results_complete['all_right_bert'] = df_results_complete.apply(all_right, axis=1, args=(labels, labels_bert))
df_results_complete['right_plus_extra_bert'] = df_results_complete.apply(right_plus_extra, axis=1, args=(labels, labels_bert))
df_results_complete['added_one_bert'] = df_results_complete.apply(added_one, axis=1, args=(labels, labels_bert))
df_results_complete['missed_one_bert'] = df_results_complete.apply(missed_one, axis=1, args=(labels, labels_bert))

labels_llm = [x+'_llm' for x in labels]
df_results_complete['all_right_llm'] = df_results_complete.apply(all_right, axis=1, args=(labels, labels_llm))
df_results_complete['right_plus_extra_llm'] = df_results_complete.apply(right_plus_extra, axis=1, args=(labels, labels_llm))
df_results_complete['added_one_llm'] = df_results_complete.apply(added_one, axis=1, args=(labels, labels_llm))
df_results_complete['missed_one_llm'] = df_results_complete.apply(missed_one, axis=1, args=(labels, labels_llm))

In [29]:
df_summary = df_results_complete[
    ['all_right_bert', 'right_plus_extra_bert', 'added_one_bert', 'missed_one_bert',
     'all_right_llm',  'right_plus_extra_llm',  'added_one_llm',  'missed_one_llm']
].sum().reset_index()
df_summary.columns = ['metric', 'n']
df_summary['pct'] = df_summary['n']/len(df_results_complete)
df_summary

Unnamed: 0,metric,n,pct
0,all_right_bert,45017,0.90034
1,right_plus_extra_bert,292,0.00584
2,added_one_bert,4913,0.09826
3,missed_one_bert,652,0.01304
4,all_right_llm,17578,0.35156
5,right_plus_extra_llm,8788,0.17576
6,added_one_llm,20799,0.41598
7,missed_one_llm,25426,0.50852


In [30]:
suffix = 'bert'
df_results_complete[~df_results_complete[f'all_right_{suffix}']][['response']+labels+[x+'_'+suffix for x in labels]]

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,data_scientist_bert,data_engineer_bert,data_analyst_bert,ml_engineer_bert,other_bert
7,"As a Big Data Engineer, Business Intelligence ...",0,1,1,1,0,1,1,1,1,0
15,"Machine Learning Engineer, Data Engineer",0,1,0,1,0,1,1,0,1,0
23,"I work as a ETL Developer, Analytics Consultan...",0,1,1,1,0,1,1,1,1,0
39,"AI Developer, Data Pipeline Specialist, Data A...",0,1,1,1,0,1,1,1,1,0
50,"Reporting Specialist, ML Ops Specialist, Data ...",0,1,1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
49944,Wouldn't trade my job as a Machine Learning En...,0,1,1,1,0,1,1,1,1,0
49967,I take a lot of pride in my work as a I guess ...,0,1,0,1,0,1,1,0,1,0
49976,"ETL Developer, Data Analyst, Machine Learning ...",0,1,1,1,0,1,1,1,1,0
49978,"Data Analyst, Big Data Engineer, Deep Learning...",0,1,1,1,0,1,1,1,1,0


In [31]:
suffix = 'llm'
df_results_complete[~df_results_complete[f'all_right_{suffix}']][['response']+labels+[x+'_'+suffix for x in labels]]

Unnamed: 0,response,data_scientist,data_engineer,data_analyst,ml_engineer,other,data_scientist_llm,data_engineer_llm,data_analyst_llm,ml_engineer_llm,other_llm
0,"ML Ops Specialist, ML Engineer",1,0,0,1,0,0,0,0,1,0
1,Analytics Consultant (at least that's what my ...,0,0,1,0,0,0,0,0,0,1
5,"Analytics Consultant, Data Pipeline Specialist",0,1,1,0,0,1,0,1,0,0
7,"As a Big Data Engineer, Business Intelligence ...",0,1,1,1,0,1,1,1,1,0
8,"Unfortunately, I'm stuck as a ML Engineer, Big...",1,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
49994,"Title says Machine Learning Engineer, ETL Deve...",1,1,0,1,0,0,0,0,1,0
49995,"Big Data Engineer, Data Analyst. Though, I'm t...",0,1,1,0,0,1,1,1,0,0
49996,"AI Researcher, Machine Learning Engineer. Kind...",1,0,0,1,0,0,0,0,1,0
49998,"Reporting Specialist, AI Developer",0,0,1,1,0,0,0,0,0,1
