In [1]:
# Label key to train a classifier for: selects the matching entry in label_maps.yaml,
# filters the training query on label_key, and is embedded in the checkpoint file names.
CLASSIFY_TARGET = 'job_search_status'
# Prompt version: selects the 'v<N>' section of label_maps.yaml.
PROMPT_VERSION = 1

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
import plotly

# Prefer the GPU, but fall back to the CPU so that model/tensor placement via
# `.to(device)` does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Get Data

In [2]:
# Get label encodings
import yaml
with open('label_maps.yaml', 'r') as file:
    # The YAML file is keyed by prompt version ('v1', 'v2', ...); each version holds a list of label specs.
    label_specs = yaml.safe_load(file)['v' + str(PROMPT_VERSION)]

# Pick the spec for the target being classified. Fail with a readable message instead of a
# bare "list index out of range" when the target is not configured for this prompt version.
matching_specs = [x for x in label_specs if x['model'] == CLASSIFY_TARGET]
if not matching_specs:
    raise IndexError(
        f"No entry with model == {CLASSIFY_TARGET!r} under 'v{PROMPT_VERSION}' in label_maps.yaml"
    )
yaml_dict = matching_specs[0]

# Mapping between label values and their integer encodings (later joined to the data on 'value').
label_encoding_map = pd.DataFrame.from_dict(yaml_dict['map'])
# Example records used for quick qualitative checks; read later via their 'text' and 'label' keys.
test_examples = yaml_dict['examples']

In [3]:
# Get data
import psycopg2
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from pathlib import Path

# Load environment variables from the .env file in the current working directory
# (presumably the DB_* settings read by get_query). override=True lets values from
# .env replace variables that are already set in the process environment.
load_dotenv(Path().absolute() / '.env', override = True)

def get_query(query: str) -> pd.DataFrame:
    """
    Run a SQL query against the Postgres database and return the result as a DataFrame.

    Connection settings are read from the environment variables DB_DATABASE,
    DB_USERNAME, DB_PASSWORD and DB_SERVER.

    Args:
        query: SQL text to execute.

    Returns:
        A DataFrame with one row per result row.

    Raises:
        Whatever SQLAlchemy / pandas raise on connection or query failure. The
        connection and the engine's pool are released in that case as well.
    """
    # NOTE(review): credentials are interpolated into the URL verbatim, so a password
    # containing characters such as '@' or '/' must already be URL-encoded in .env.
    engine = create_engine(
        "postgresql+psycopg2://{user}:{password}@{host}/{dbname}".format(
           dbname = os.getenv('DB_DATABASE'),
           user = os.getenv('DB_USERNAME'),
           password = os.getenv('DB_PASSWORD'),
           host = os.getenv('DB_SERVER')
        )
    )

    try:
        # The context manager closes the connection even if read_sql raises.
        with engine.connect() as pg:
            return pd.read_sql(query, con = pg)
    finally:
        # A fresh engine is built on every call, so drop its pool instead of leaking it.
        engine.dispose()

# Pull the labelled posts for CLASSIFY_TARGET / PROMPT_VERSION and attach the integer label
# encoding. The inner join on 'value' drops rows whose label value is not in label_encoding_map.
# NOTE: the SQL is assembled with an f-string; the interpolated values are notebook-level
# constants, not user input. PROMPT_VERSION is used here (instead of a literal 1) so that the
# query stays in sync with the label map selected from label_maps.yaml.
input_data = get_query(
    f"""
    WITH t0 AS (
        SELECT 
            a.post_id, a.label_value AS value, a.label_rationale AS rationale,
            CONCAT(TRIM(b.title), '\n', REGEXP_REPLACE(TRIM(b.selftext), '[\t\n\r]', ' ', 'g')) AS input_text
        FROM text_scraper_reddit_llm_scores a
        INNER JOIN text_scraper_reddit_scrape b
            ON a.scrape_id = b.id
        WHERE a.prompt_version = {int(PROMPT_VERSION)} AND a.label_key = '{CLASSIFY_TARGET}'
    )
    -- Select where expected token count <= 512
    SELECT * FROM t0
    WHERE ARRAY_LENGTH(REGEXP_SPLIT_TO_ARRAY(input_text, '\\s+'), 1) * 1.5 <= 512
    """
    ).merge(label_encoding_map, how = 'inner', on = 'value')

input_data

Unnamed: 0,post_id,value,rationale,input_text,label_encode
0,t3_118xr9s,searching/considering search,The user is considering going back to the same...,Would you go back to a company you had already...,1
1,t3_rwpsfy,searching/considering search,The user expresses frustration with talent acq...,"MY RESUME OUTLINES MY EXPERIENCE, SO DO NOT SE...",1
2,t3_10p8ox3,searching/considering search,The user is considering leaving their current ...,Wait to be fired or quit?\nLong story short—I ...,1
3,t3_17japq3,searching/considering search,The user is considering leaving their current ...,"My boss is always shouting at me\nHello, I jus...",1
4,t3_vh2pl2,searching/considering search,The user mentions struggling to find a part-ti...,How the hell do I get a job as a 15 year old w...,1
...,...,...,...,...,...
8210,t3_pjb07w,received offer/started new job,The user has accepted a full-time position at ...,What advice do you have on healing from a toxi...,0
8211,t3_ypqi89,received offer/started new job,The user mentions accepting a new job within t...,Got a new job and agreed to start January 2nd ...,0
8212,t3_oyjw7m,received offer/started new job,The user mentions receiving a job offer.,"Job search finally over!\nAfter many months, m...",0
8213,t3_zedntw,received offer/started new job,The user has received a job offer.,Should I accept a job if the pay is $14k less ...,0


## Get Base Model

In [4]:
# from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AlbertTokenizer, AlbertForSequenceClassification

# Pretrained ALBERT tokenizer plus a sequence-classification model with one output per label
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels = label_encoding_map.shape[0],
)
model = model.to(device)

# Total number of parameters in the model
sum(map(torch.numel, model.parameters()))

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11685891

## Create Datasets & Loader

In [5]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Batch size used by the train and validation DataLoaders
B = 16

class TextDataset(Dataset):
    """
    Map-style dataset over a list of texts, with optional integer labels.

    All texts are tokenized once, at construction time, and padded/truncated to
    512 tokens. Each item is a dict of the tokenizer's outputs for that row,
    plus a 'labels' entry when labels were supplied.
    """

    def __init__(self, tokenizer, texts, labels = None):
        self.tokenizer = tokenizer
        self.texts = texts
        # Eager tokenization: every text is encoded up front
        self.inputs = self.tokenize_and_encode()
        # Labels are optional so the same class can serve unlabeled inference data
        self.labels = None if labels is None else torch.tensor(labels, dtype = torch.long)

    def tokenize_and_encode(self):
        """Encode self.texts into fixed-length (512) PyTorch tensors."""
        encode_options = dict(
            add_special_tokens = True,
            max_length = 512,
            truncation = True,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        return self.tokenizer(self.texts, **encode_options)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Slice row `idx` out of every encoded field
        item = {}
        for key, vals in self.inputs.items():
            item[key] = vals[idx]
        if self.labels is None:
            return item
        item['labels'] = self.labels[idx]
        return item

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(input_data, test_size = .2, random_state = 1)
train_ds, val_ds = [TextDataset(tokenizer, x['input_text'].tolist(), x['label_encode'].tolist()) for x in [train_df, val_df]]
# Reshuffle the training batches every epoch (seeded generator keeps runs repeatable);
# the validation loader keeps a fixed order.
train_dl = DataLoader(train_ds, batch_size = B, shuffle = True, generator = torch.Generator().manual_seed(1))
val_dl = DataLoader(val_ds, batch_size = B)

In [6]:
# 0 = received offer/started new job
# 1 = searching/considering search
# 2 = not/unknown
# print(val_ds.texts[6], val_ds.labels[6])

In [7]:
# Peek at the first row of the validation split
val_df.head(1)

Unnamed: 0,post_id,value,rationale,input_text,label_encode
74,t3_171ap78,searching/considering search,The user mentions feeling unhappy in their cur...,I f-ed up so bad by relocating\nIt's been a mo...,1


## Create Eval Functions

In [8]:
# Release cached, unused GPU memory held by PyTorch before running evaluation
torch.cuda.empty_cache()

@torch.no_grad()
def eval_performance_on_ds(model, ds, batch_size = 16):
    """
    Test model performance on evaluation set.

    Args:
        model: Callable as model(input_ids, attention_mask) returning a mapping with a
            'logits' entry. It is switched to eval mode and left in eval mode.
        ds: Dataset whose items are dicts with 'input_ids', 'attention_mask' and 'labels'.
        batch_size: Evaluation batch size.

    Returns:
        Dict with 'mean_nll' (mean per-example cross-entropy), 'accuracy' and 'count'.
        For an empty dataset 'mean_nll' and 'accuracy' are NaN and 'count' is 0.
    """
    model.eval()

    # Run on whatever device the model lives on; only fall back to the notebook-level
    # `device` for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    total_obs = 0
    total_correct = 0
    total_nll = 0.0

    dl = DataLoader(ds, batch_size = batch_size)

    for b in tqdm(dl):
        outputs = model(b['input_ids'].to(run_device), b['attention_mask'].to(run_device))
        logits = outputs['logits'].cpu()
        label_ids = b['labels'].cpu()

        total_obs += len(label_ids)
        total_correct += (logits.argmax(dim = 1) == label_ids).sum().item()
        # Accumulate the summed NLL so the final mean is per example; averaging
        # per-batch means would over-weight a smaller last batch.
        total_nll += F.cross_entropy(logits, label_ids, reduction = 'sum').item()

    if total_obs == 0:
        return {'mean_nll': float('nan'), 'accuracy': float('nan'), 'count': 0}

    res = {'mean_nll': total_nll / total_obs, 'accuracy': total_correct / total_obs, 'count': total_obs}
    return res

# Baseline: score the model on the validation set before any fine-tuning in this notebook
eval_performance_on_ds(model, val_ds, batch_size = 1)

1643it [00:36, 45.16it/s]


{'mean_nll': 0.9460146, 'accuracy': 0.751065124771759, 'count': 1643}

In [9]:
@torch.no_grad()
def eval_performance_on_examples(model, examples):
    """
    Run inference on a handful of predefined examples. Returns a printable string.

    Args:
        model: Callable as model(input_ids, attention_mask) returning a mapping with a
            'logits' entry. It is switched to eval mode and left in eval mode.
        examples: Sequence of dicts with a 'text' entry and an integer class id under 'label'.

    Returns:
        A summary line "Correct: k/n" followed by one line per example showing a
        pass/fail mark, the probability assigned to the true label, and the text.
    """
    model.eval()
    total_correct = 0
    report_lines = []

    # Use the examples that were passed in (not the notebook-level test_examples)
    inference_examples = [x['text'] for x in examples]
    labels = [x['label'] for x in examples]

    if len(inference_examples) == 0:
        # Nothing to tokenize or score
        return "Correct: 0/0"

    inference_dl = DataLoader(TextDataset(tokenizer, inference_examples, labels), 1)

    for i, b in enumerate(inference_dl):
        out = model(b['input_ids'].to(device), b['attention_mask'].to(device))
        softmax = F.softmax(out['logits'].detach().cpu().flatten(), dim = 0)
        # Batch size is 1, so `label` is a one-element tensor
        label = b['labels'].cpu()

        is_correct = 1 if int(softmax.argmax()) == int(label) else 0
        total_correct = total_correct + is_correct
        # softmax[label] is a one-element array, hence the "[0.59]" style rendering
        report_lines.append(f'\n{"✅" if is_correct == 1 else "❌"} {softmax[label].numpy().round(2)} - {inference_examples[i]}')

    return f"Correct: {total_correct}/{len(inference_examples)}" + ''.join(report_lines)
    
# Qualitative check on the examples loaded from label_maps.yaml, before the training cell runs
print(eval_performance_on_examples(model, test_examples))

Correct: 2/6
✅ [0.59] - Started searching for jobs and struggling
❌ [0.24] - Just received a new job offer!
❌ [0.21] - My job is paying my coworker more than me. What do I do?
❌ [0.14] - Recruiters really tick me off
Its 9am on a weekend and I'm already ticked off. I came across a post on LinkedIn that said recruiters should dress up as ghosts because they ghost candidates. Do recruiters really think this kind of stuff is funny?
✅ [0.6] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
❌ [0.31] - Mind blowing "counter" from employer
So I'm officially employed as a sales rep on $47k/year, but I've been doing the responsibilities and tasks of the sales manager AND operations manager all year. Both of these official positions have technically been available, but my boss 

In [10]:
# GPU memory figures from mem_get_info, scaled by 1e9 (bytes -> GB): once before and once after
# releasing PyTorch's cached blocks
print(list(map(lambda n_bytes: n_bytes / 1e9, torch.cuda.mem_get_info())))
torch.cuda.empty_cache()
list(map(lambda n_bytes: n_bytes / 1e9, torch.cuda.mem_get_info()))

[6.5011712, 8.589410304]


[6.555697152, 8.589410304]

## Train

In [11]:
EPOCHS = 20
# TRY LOWERING TO 1e-5 or 2e-5
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-5) # 5e-5
# step_size = 1 with gamma = 0.5 halves the learning rate after every epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.5)
train_losses = []  # one (epoch, step, loss) tuple per training batch
val_losses = []    # one dict of validation metrics per epoch
# Live-updating plot of the rolling training loss
f = plotly.graph_objects.FigureWidget().add_scatter(x = [], y = [])
display(f)

# Checkpoints are written under saves/; make sure it exists before the first export
os.makedirs('saves', exist_ok = True)

# One validation batch, fetched once, used as the example input for TorchScript tracing
ex = next(iter(val_dl))

model.train()
for epoch in range(EPOCHS):
    print(f'***** Epoch {epoch} ')
    torch.cuda.empty_cache()
    epoch_loss = 0

    for step, batch in tqdm(enumerate(train_dl), total = len(train_dl)):
        optimizer.zero_grad()

        outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        loss = F.cross_entropy(outputs['logits'], batch['labels'].to(device))
        loss.backward()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Logging and eval
        epoch_loss = epoch_loss + loss.item()
        train_losses.append((epoch, step, loss.item()))

        # Every 100 steps (skipping the very first step of the run): log, spot-check, refresh plot
        if step % 100 == 0 and (step > 0 or epoch > 0):
            print(
                f"Step {step}/{len(train_dl)} | " +
                f"Last 50 batch train NLL: {np.mean([t[2] for t in train_losses][-50:])} | " +
                f"LR: {optimizer.param_groups[0]['lr']} | " +
                f"Mem: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB"
            )

            model.eval()
            examples_res = eval_performance_on_examples(model, test_examples)
            print(examples_res)
            model.train()

            # 50-batch rolling mean of the training loss, indexed by global batch number
            losses_df =\
                pd.DataFrame(train_losses, columns = ['epoch', 'step', 'train_loss'])\
                .assign(i = lambda df: range(len(df)), train_loss_roll = lambda df: df['train_loss'].rolling(window = 50).mean())\
                .dropna(axis = 0)
            f.data[0].x = losses_df['i'].tolist()
            f.data[0].y = losses_df['train_loss_roll'].tolist()

    scheduler.step()

    # Logging and eval
    print(f'Epoch loss: {epoch_loss}')
    model.eval()
    eval_res = eval_performance_on_ds(model, val_ds)
    print(eval_res)
    val_losses.append({**{'epoch': epoch}, **eval_res})

    # Export to TorchScript.
    # Trace while the model is still in eval mode: a trace freezes mode-dependent ops
    # (e.g. dropout) in whatever mode was active, so tracing in train mode would bake
    # dropout into the saved checkpoint.
    with torch.no_grad():
        model_scripted = torch.jit.trace(model, (ex['input_ids'].to(device), ex['attention_mask'].to(device)), strict = False)
    model_scripted.save(f"saves/model_{CLASSIFY_TARGET}_epoch_{str(epoch).rjust(3, '0')}.pt") # Save

    # Back to training mode for the next epoch
    model.train()


FigureWidget({
    'data': [{'type': 'scatter', 'uid': 'db036611-2782-41e8-ae72-000e5447a1a2', 'x': [], 'y': []}],
    'layout': {'template': '...'}
})

***** Epoch 0 


11it [01:26,  7.84s/it]


KeyboardInterrupt: 

In [None]:
# Export to TorchScript
# One validation batch serves as the example input for tracing
ex = next(iter(val_dl))

# Trace in eval mode: a trace freezes mode-dependent ops (e.g. dropout) in whatever mode
# was active, so tracing a model left in train mode would bake dropout into the export.
was_training = model.training
model.eval()
with torch.no_grad():
    model_scripted = torch.jit.trace(model, (ex['input_ids'].to(device), ex['attention_mask'].to(device)), strict = False)
model.train(was_training)  # restore the previous mode

os.makedirs('saves', exist_ok = True)
model_scripted.save(f"saves/model_{CLASSIFY_TARGET}_epoch_{str(epoch).rjust(3, '0')}.pt") # Save

In [None]:
# Inspect the attention mask of the first validation example
val_ds[0]['attention_mask']

In [None]:
# Inspect the token ids of the first validation example
val_ds[0]['input_ids']

In [None]:
# Reload the most recently exported checkpoint and run it on the example batch as a smoke test.
# The path follows the naming used by the export step (saves/model_<target>_epoch_<NNN>.pt);
# nothing in this notebook writes a saves/save_<target>.pt file.
loaded = torch.jit.load(f"saves/model_{CLASSIFY_TARGET}_epoch_{str(epoch).rjust(3, '0')}.pt")
loaded.eval()
loaded(ex['input_ids'].to(device), ex['attention_mask'].to(device))

In [None]:
# Print performance log
# import plotly.express as px

# losses_df =\
#     pd.DataFrame(train_losses, columns = ['epoch', 'iter', 'train_loss'])\
#     .assign(train_loss_roll = lambda df: df['train_loss'].rolling(window = 2).mean())\
#     .melt(value_vars = ['train_loss_roll'], id_vars = 'iter')\
#     .dropna(axis = 0)\
#     .reset_index(drop = True)
#     .merge(pd.DataFrame(test_losses, columns = ['iter', 'test_loss']), how = 'left', on = 'iter')\

# display(px.scatter(losses_df, x = 'iter', y = 'value', color = 'variable'))

In [None]:
# from peft import LoraConfig, TaskType, get_peft_model

# lora_config = LoraConfig(
#     task_type = TaskType.SEQ_CLS, r = 4, lora_alpha = 1, lora_dropout = 0.1
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# print(model.roberta.encoder.layer[0].attention.self)