In [1]:
# Prompt whose LLM-generated labels are used. It keys into label_maps.yaml,
# filters the SQL query below, and is part of the checkpoint directory name.
PROMPT_ID = 'labor_market_v1'
# Label key to train a classifier for. It is matched against the `model` field
# of the YAML entries and against `label_key` in the SQL query.
CLASSIFY_TARGET = 'job_search_status'

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly

# The model and every input batch are moved to this device; as written the
# notebook requires a CUDA GPU.
device = torch.device('cuda')

## Get Data

In [2]:
# Get label encodings
import yaml
with open('label_maps.yaml', 'r') as file:
    label_maps = yaml.safe_load(file)

# Select the entry for this prompt whose `model` field matches the target.
# Fail with a descriptive message rather than an opaque IndexError when the
# YAML has no such entry (e.g. after a typo in CLASSIFY_TARGET).
matching_entries = [x for x in label_maps[PROMPT_ID] if x['model'] == CLASSIFY_TARGET]
if not matching_entries:
    raise ValueError(
        f"No entry with model == '{CLASSIFY_TARGET}' found under '{PROMPT_ID}' in label_maps.yaml"
    )
yaml_dict = matching_entries[0]

# `map` is turned into a lookup table that is later merged onto the posts by `value`
label_encoding_map = pd.DataFrame.from_dict(yaml_dict['map'])
# Hand-written examples (dicts with `text` and `label`) used for quick spot checks
test_examples = yaml_dict['examples']

In [3]:
# Get data
from helpers.db import get_postgres_query

# Pull every LLM-scored post for this prompt / label key together with its text,
# then attach the label-map columns.
# Notes on the query string:
#   - It is a regular (non-raw) f-string, so Python resolves escapes before the
#     SQL is sent: '\n' and the characters inside '[\t\n\r]' reach Postgres as
#     literal newline / tab / carriage-return characters, and '\\s+' arrives as
#     the regex \s+.
#   - input_text is the trimmed title, a newline, then the trimmed body with
#     tabs / newlines / carriage returns replaced by spaces.
#   - Rows are kept only when (whitespace-separated word count * 1.5) <= 512,
#     which the query uses as an estimate of the token count.
#   - PROMPT_ID and CLASSIFY_TARGET are interpolated straight into the SQL text.
#     NOTE(review): fine for the notebook-level constants; do not pass
#     untrusted values through this path.
# The result is used as a DataFrame (presumably what get_postgres_query
# returns - confirm in helpers.db). The inner merge on `value` keeps only rows
# whose label value has an entry in label_encoding_map.
input_data = get_postgres_query(
    f"""
    WITH t0 AS (
        SELECT 
            a.post_id, a.label_value AS value, a.label_rationale AS rationale,
            CONCAT(TRIM(b.title), '\n', REGEXP_REPLACE(TRIM(b.selftext), '[\t\n\r]', ' ', 'g')) AS input_text
        FROM text_scraper_reddit_llm_scores_v2 a
        INNER JOIN text_scraper_reddit_scrapes b
            ON a.scrape_id = b.scrape_id
        WHERE a.PROMPT_ID = '{PROMPT_ID}' AND a.label_key = '{CLASSIFY_TARGET}'
    )
    -- Select where expected token count <= 512
    SELECT * FROM t0
    WHERE ARRAY_LENGTH(REGEXP_SPLIT_TO_ARRAY(input_text, '\\s+'), 1) * 1.5 <= 512
    """
    ).merge(label_encoding_map, how = 'inner', on = 'value')

input_data

Unnamed: 0,post_id,value,rationale,input_text,label_encode
0,t3_118xr9s,searching/considering search,The user is considering going back to the same...,Would you go back to a company you had already...,1
1,t3_rwpsfy,searching/considering search,The user expresses frustration with talent acq...,"MY RESUME OUTLINES MY EXPERIENCE, SO DO NOT SE...",1
2,t3_10p8ox3,searching/considering search,The user is considering leaving their current ...,Wait to be fired or quit?\nLong story short—I ...,1
3,t3_17japq3,searching/considering search,The user is considering leaving their current ...,"My boss is always shouting at me\nHello, I jus...",1
4,t3_vh2pl2,searching/considering search,The user mentions struggling to find a part-ti...,How the hell do I get a job as a 15 year old w...,1
...,...,...,...,...,...
12124,b5577o,received offer/started new job,The user mentions receiving two job offers wit...,Careers working in environmental conservation/...,0
12125,dguen7,received offer/started new job,The user mentions accepting a new job offer an...,"Just accepted a job offer, and will be giving ...",0
12126,t3_qzwqah,received offer/started new job,"The user mentions receiving a job offer with ""...","After 2 months, I finally got an offer!!\nFor ...",0
12127,t3_xgmjf0,received offer/started new job,The user mentions receiving a job offer.,Recruiter is pressuring me to accept the job b...,0


## Get Base Model

In [4]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification
# from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Pretrained ALBERT tokenizer, plus a classification model with one output per
# row of the label map, placed on the training device
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels = label_encoding_map.shape[0]
)
model = model.to(device)

# Num params
sum(param.numel() for param in model.parameters())

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11685891

## Create Datasets & Loader

In [5]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Batch size used for the train and validation DataLoaders created at the end of this cell
B = 16

class TextDataset(Dataset):
    """
    Map-style dataset that tokenizes all texts once, at construction time, and
    serves the per-example tensors (plus an optional label) by index.

    Args:
        tokenizer: Callable invoked as `tokenizer(texts, ...)` with the keyword
            arguments listed in `tokenize_and_encode`. It must return a mapping
            whose values can be indexed by example position.
        texts: List of input strings.
        labels: Optional sequence of integer class ids, one per text. When
            omitted, items carry no `labels` entry.
        max_length: Length every example is truncated / padded to. Defaults to
            512, the value that used to be hard-coded.

    Raises:
        ValueError: If `labels` is given but its length differs from `texts`.
    """

    def __init__(self, tokenizer, texts, labels = None, max_length = 512):
        # A mismatch would otherwise go unnoticed until an index error (or,
        # worse, a silent misalignment) during training.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f'Got {len(texts)} texts but {len(labels)} labels; they must have the same length'
            )

        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length
        self.inputs = self.tokenize_and_encode()

        if labels is not None:
            self.labels = torch.tensor(labels, dtype = torch.long)
        else:
            self.labels = None

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def tokenize_and_encode(self):
        """Tokenize every text, truncating and padding each one to `max_length`."""
        return self.tokenizer(
            self.texts,
            add_special_tokens = True,
            max_length = self.max_length,
            truncation = True,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example `idx`, plus `labels` when labels were given."""
        item = {key: vals[idx] for key, vals in self.inputs.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

# Hold out 20% of the labelled posts for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(input_data, test_size = .2, random_state = 1)
train_ds, val_ds = [TextDataset(tokenizer, x['input_text'].tolist(), x['label_encode'].tolist()) for x in [train_df, val_df]]
# Reshuffle the training examples every epoch so the model does not see the
# same batch composition and order each time. Validation order stays fixed.
train_dl = DataLoader(train_ds, batch_size = B, shuffle = True)
val_dl = DataLoader(val_ds, batch_size = B)

In [6]:
# 0 = received offer/started new job
# 1 = searching/considering search
# 2 = not/unknown
# print(val_ds.texts[6], val_ds.labels[6])

In [7]:
# Peek at the first row of the held-out validation split
val_df.head(1)

Unnamed: 0,post_id,value,rationale,input_text,label_encode
3780,t3_y33u61,searching/considering search,The user is considering how future employers w...,Recently Fired for poor performance. Will futu...,1


## Create Eval Functions

In [8]:
# Ask PyTorch to release cached GPU memory it is not currently using
torch.cuda.empty_cache()

@torch.no_grad()
def eval_performance_on_ds(model, ds, batch_size = 16):
    """
    Test model performance on evaluation set.

    Args:
        model: Classifier called as `model(input_ids, attention_mask)`; its
            output must expose `logits` by key. It is switched to eval mode and
            left there, so callers that keep training must call `model.train()`.
        ds: Dataset whose items are dicts with `input_ids`, `attention_mask`
            and `labels`.
        batch_size: Number of examples per forward pass.

    Returns:
        Dict with `mean_nll` (cross-entropy averaged over examples), `accuracy`
        (fraction of examples whose arg-max class equals the label) and `count`
        (number of examples scored). For an empty dataset both metrics are NaN.

    Relies on the notebook-level `device`.
    """
    model.eval()
    total_obs = 0
    total_correct = 0
    total_nll = 0.0

    # Aggregate metrics do not depend on order, so there is no need to shuffle
    dl = DataLoader(ds, batch_size = batch_size, shuffle = False)

    # Iterating the loader directly lets tqdm pick up its length for the progress bar
    for b in tqdm(dl):
        outputs = model(b['input_ids'].to(device), b['attention_mask'].to(device))
        logits = outputs['logits'].cpu()
        label_ids = b['labels'].cpu()

        total_obs += len(label_ids)
        total_correct += (logits.argmax(dim = 1) == label_ids).sum().item()
        # Sum (not mean) per batch so that a short final batch is not over-weighted
        total_nll += F.cross_entropy(logits, label_ids, reduction = 'sum').item()

    if total_obs == 0:
        return {'mean_nll': float('nan'), 'accuracy': float('nan'), 'count': 0}

    res = {'mean_nll': total_nll/total_obs, 'accuracy': total_correct/total_obs, 'count': total_obs}
    return res

# Score the model as it stands at this point in the notebook on the validation set, one example per batch
eval_performance_on_ds(model, val_ds, batch_size = 1)

2426it [00:52, 46.28it/s]


{'mean_nll': 0.8204346, 'accuracy': 0.7765869744435284, 'count': 2426}

In [9]:
@torch.no_grad()
def eval_performance_on_examples(model, examples):
    """
    Run inference on a handful of predefined examples. Returns a printable string.

    Args:
        model: Classifier called as `model(input_ids, attention_mask)`; its
            output must expose `logits` by key. It is switched to eval mode and
            left there, so callers that keep training must call `model.train()`.
        examples: Iterable of dicts, each with a `text` string and an integer
            `label` (the expected class index).

    Returns:
        A string whose first line is `Correct: <hits>/<total>`, followed by one
        entry per example: a ✅/❌ marker, the probability the model assigns to
        the expected class, and the example text.

    Relies on the notebook-level `tokenizer`, `device` and `TextDataset`.
    """
    model.eval()

    # Use the examples that were passed in (not a notebook global), and
    # materialize them because they are traversed twice.
    examples = list(examples)
    inference_examples = [x['text'] for x in examples]
    labels = [x['label'] for x in examples]
    if len(inference_examples) == 0:
        # Nothing to tokenize or score
        return "Correct: 0/0"

    # One example per batch, in the given order: the report below looks the text
    # up by batch index, so shuffling would pair scores with the wrong texts.
    inference_dl = DataLoader(TextDataset(tokenizer, inference_examples, labels), 1, shuffle = False)

    total_correct = 0
    report = ''
    for i, b in enumerate(inference_dl):
        out = model(b['input_ids'].to(device), b['attention_mask'].to(device))
        softmax = F.softmax(out['logits'].detach().cpu().flatten(), dim = 0)
        label = b['labels'].cpu()

        is_correct = 1 if int(softmax.argmax()) == int(label) else 0
        total_correct = total_correct + is_correct
        report += (f'\n{"✅" if is_correct == 1 else "❌"} {softmax[label].numpy().round(2)} - {inference_examples[i]}')

    return f"Correct: {total_correct}/{len(inference_examples)}" + report
    
# Spot-check the current model on the examples loaded from label_maps.yaml
print(eval_performance_on_examples(model, test_examples))

Correct: 2/6
✅ [0.57] - Started searching for jobs and struggling
❌ [0.21] - Just received a new job offer!
❌ [0.25] - My job is paying my coworker more than me. What do I do?
❌ [0.21] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
❌ [0.26] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.49] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss

In [10]:
# GPU memory figures from mem_get_info, scaled by 1e9: printed before, and
# displayed after, releasing PyTorch's cached blocks
print([n_bytes/1e9 for n_bytes in torch.cuda.mem_get_info()])
torch.cuda.empty_cache()
[n_bytes/1e9 for n_bytes in torch.cuda.mem_get_info()]

[6.5011712, 8.589410304]


[6.555697152, 8.589410304]

## Train

In [11]:
# Saves directory
from datetime import date
import os

save_dir = f"saves/{str(PROMPT_ID).rjust(2, '0')}-{CLASSIFY_TARGET}-{date.today().strftime('%Y%m%d')}"
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

In [12]:
EPOCHS = 10

# TRY LOWERING TO 1e-5 or 2e-5
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-5) # 5e-5
# Halve the learning rate after every epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.5)
# One (epoch, step, loss) tuple per training batch / one metrics dict per epoch
train_losses = []
val_losses = []

# Live chart of the rolling training loss, refreshed every 100 steps
f = plotly.graph_objects.FigureWidget().add_scatter(x = [], y = [])
display(f)

model.train()
for epoch in range(EPOCHS):
    print(f'***** Epoch {epoch} ')
    torch.cuda.empty_cache()
    epoch_loss = 0

    # enumerate() hides the loader's length, so pass it explicitly to get a
    # real progress bar / ETA from tqdm
    for step, batch in tqdm(enumerate(train_dl), total = len(train_dl)):
        optimizer.zero_grad()

        outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        loss = F.cross_entropy(outputs['logits'], batch['labels'].to(device))
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Logging and eval
        epoch_loss = epoch_loss + loss.item()
        train_losses.append((epoch, step, loss.item()))

        if step % 100 == 0 and (step > 0 or epoch > 0):
            print(
                f"Step {step}/{len(train_dl)} | " +
                f"Last 50 batch train NLL: {np.mean([t[2] for t in train_losses][-50:])} | " +
                f"LR: {optimizer.param_groups[0]['lr']} | " +
                f"Mem: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB"
            )

            # The eval helper switches the model to eval mode; switch back afterwards
            examples_res = eval_performance_on_examples(model, test_examples)
            print(examples_res)
            model.train()

            # Refresh the chart with the 50-step rolling mean of the training loss
            losses_df =\
                pd.DataFrame(train_losses, columns = ['epoch', 'step', 'train_loss'])\
                .assign(i = lambda df: range(len(df)), train_loss_roll = lambda df: df['train_loss'].rolling(window = 50).mean())\
                .dropna(axis = 0)
            f.data[0].x = losses_df['i'].tolist()
            f.data[0].y = losses_df['train_loss_roll'].tolist()

    scheduler.step()

    # Logging and eval
    print(f'Epoch loss: {epoch_loss}')
    eval_res = eval_performance_on_ds(model, val_ds)
    model.train()

    val_losses.append({**{'epoch': epoch}, **eval_res})

    # Save ts and ckpt
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses
    }, f"{save_dir}/epoch_{str(epoch).rjust(3, '0')}.ckpt")

    # Export a TorchScript trace next to the checkpoint. A single validation
    # batch is enough as example input, so take one instead of iterating the
    # whole loader. Trace in eval mode: mode-dependent ops such as dropout are
    # recorded in whichever mode is active while tracing, and the export is
    # meant for inference.
    trace_batch = next(iter(val_dl))
    model.eval()
    model_scripted = torch.jit.trace(model, (trace_batch['input_ids'].to(device), trace_batch['attention_mask'].to(device)), strict = False)
    model.train()
    model_scripted.save(f"{save_dir}/epoch_{str(epoch).rjust(3, '0')}.pt") # Save


FigureWidget({
    'data': [{'type': 'scatter', 'uid': '16021d60-83f4-4ef4-adf7-b745e1b2347f', 'x': [], 'y': []}],
    'layout': {'template': '...'}
})

***** Epoch 0 


100it [11:11,  6.70s/it]

Step 100/607 | Last 50 batch train NLL: 0.6348839536309242 | LR: 3e-05 | Mem: 0.2 GB


101it [11:18,  6.77s/it]

Correct: 2/6
✅ [0.75] - Started searching for jobs and struggling
✅ [0.72] - Just received a new job offer!
❌ [0.06] - My job is paying my coworker more than me. What do I do?
❌ [0.12] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
❌ [0.03] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
❌ [0.17] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss

200it [22:26,  7.25s/it]

Step 200/607 | Last 50 batch train NLL: 0.6055583009123802 | LR: 3e-05 | Mem: 0.2 GB


201it [22:34,  7.53s/it]

Correct: 2/6
❌ [0.26] - Started searching for jobs and struggling
❌ [0.01] - Just received a new job offer!
✅ [0.77] - My job is paying my coworker more than me. What do I do?
✅ [0.69] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
❌ [0.2] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
❌ [0.04] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss 

300it [36:21, 14.86s/it]

Step 300/607 | Last 50 batch train NLL: 0.6209905362129211 | LR: 3e-05 | Mem: 0.2 GB


301it [36:38, 15.46s/it]

Correct: 2/6
✅ [0.78] - Started searching for jobs and struggling
❌ [0.03] - Just received a new job offer!
❌ [0.02] - My job is paying my coworker more than me. What do I do?
❌ [0.3] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
❌ [0.2] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.76] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss j

400it [1:01:39, 16.79s/it]

Step 400/607 | Last 50 batch train NLL: 0.5944710719585419 | LR: 3e-05 | Mem: 0.2 GB


401it [1:02:00, 17.93s/it]

Correct: 2/6
❌ [0.03] - Started searching for jobs and struggling
✅ [0.78] - Just received a new job offer!
❌ [0.19] - My job is paying my coworker more than me. What do I do?
❌ [0.2] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
✅ [0.77] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
❌ [0.02] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss 

500it [1:28:03, 14.08s/it]

Step 500/607 | Last 50 batch train NLL: 0.5819393742084503 | LR: 3e-05 | Mem: 0.2 GB


501it [1:28:21, 15.10s/it]

Correct: 2/6
❌ [0.03] - Started searching for jobs and struggling
❌ [0.19] - Just received a new job offer!
✅ [0.78] - My job is paying my coworker more than me. What do I do?
❌ [0.19] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
❌ [0.01] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.78] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss

600it [1:52:03, 14.66s/it]

Step 600/607 | Last 50 batch train NLL: 0.6541057634353638 | LR: 3e-05 | Mem: 0.2 GB


601it [1:52:21, 15.60s/it]

Correct: 2/6
✅ [0.75] - Started searching for jobs and struggling
❌ [0.21] - Just received a new job offer!
❌ [0.07] - My job is paying my coworker more than me. What do I do?
❌ [0.21] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
✅ [0.71] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
❌ [0.03] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss

607it [1:53:42, 11.24s/it]


Epoch loss: 383.15702894330025


137it [22:43,  9.95s/it]


KeyboardInterrupt: 

In [None]:
# Per-epoch validation metrics collected by the training loop
val_losses

In [None]:
# Inspect the `input_ids` entry of the first validation item
val_ds[0]['input_ids']

In [None]:
# Print performance log
# import plotly.express as px

# losses_df =\
#     pd.DataFrame(train_losses, columns = ['epoch', 'iter', 'train_loss'])\
#     .assign(train_loss_roll = lambda df: df['train_loss'].rolling(window = 2).mean())\
#     .melt(value_vars = ['train_loss_roll'], id_vars = 'iter')\
#     .dropna(axis = 0)\
#     .reset_index(drop = True)
#     .merge(pd.DataFrame(test_losses, columns = ['iter', 'test_loss']), how = 'left', on = 'iter')\

# display(px.scatter(losses_df, x = 'iter', y = 'value', color = 'variable'))

In [None]:
# from peft import LoraConfig, TaskType, get_peft_model

# lora_config = LoraConfig(
#     task_type = TaskType.SEQ_CLS, r = 4, lora_alpha = 1, lora_dropout = 0.1
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# print(model.roberta.encoder.layer[0].attention.self)