In [1]:
PROMPT_ID = 'financial_health_v1'
LABEL_KEY = 'financial_sentiment'

from helpers.loaders import load_config
import torch
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import pandas as pd

device = torch.device('cuda')
label_encoding_map, test_examples = load_config('label_maps.yaml', prompt_id = PROMPT_ID, label_key = LABEL_KEY) # Get config

## Get Base Model

In [2]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification
# from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels = len(label_encoding_map)).to(device)

# Num params
sum(p.numel() for p in model.parameters())

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11685891

## Get Data & Configs

In [3]:
# Get data
from helpers.db import get_postgres_query

raw_data = get_postgres_query(
    f"""
    WITH t0 AS (
        SELECT 
            a.post_id, a.label_value AS value, a.label_rationale AS rationale,
            CONCAT(TRIM(b.title), '\n', REGEXP_REPLACE(TRIM(b.selftext), '[\t\n\r]', ' ', 'g')) AS input_text
        FROM text_scraper_reddit_llm_scores_v2 a
        INNER JOIN text_scraper_reddit_scrapes b
            ON a.scrape_id = b.scrape_id
        WHERE 
            a.PROMPT_ID = '{PROMPT_ID}' 
            AND a.label_key = '{LABEL_KEY}'
    )
    -- Select where expected token count <= 512
    SELECT * 
    FROM t0
    WHERE ARRAY_LENGTH(REGEXP_SPLIT_TO_ARRAY(input_text, '\\s+'), 1) * 1.5 <= 512
    """
    ).merge(label_encoding_map, how = 'inner', on = 'value')

# 20k total, 16k train. Overweight low-frequency label to the maximum extent possible.
input_data =\
    raw_data\
    .sample(n = int(16000 * 1/.8), weights = 1/raw_data.groupby('value')['value'].transform('count')) 

In [4]:
input_data.groupby('label_encode').agg(count = ('post_id', 'count'))

Unnamed: 0_level_0,count
label_encode,Unnamed: 1_level_1
0,67
1,67
2,66


In [5]:
# Prep Datasets & Dataloader
from helpers.loaders import TextDataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
B = 16

train_df, test_df = train_test_split(input_data, test_size = .2, random_state = 1234)
train_ds, test_ds = [TextDataset(tokenizer, x['input_text'].tolist(), x['label_encode'].tolist()) for x in [train_df, test_df]]
train_dl, test_dl = [DataLoader(x, batch_size = B) for x in [train_ds, test_ds]]

examples_ds = TextDataset(tokenizer, [x['text'] for x in test_examples], [x['label'] for x in test_examples])
test_df.head(5)

Unnamed: 0,post_id,value,rationale,input_text,label_encode
30158,t3_pyb8qc,strong,The user is financially independent and consid...,Retire now or work another ten years?\nMight n...,1
30072,94g9rr,strong,The user is in a very stable financial situati...,How to invest $100k\nHi. We are in a very st...,1
13462,t3_hyl7v3,neutral,The user is seeking advice on a career change ...,Should I leave my current job for a state job ...,2
25450,t3_wggfim,weak,The user is facing financial uncertainty and p...,What do we do when our only source of income d...,0
29982,t3_innhjv,strong,The user received a significant pay raise by s...,Question about credit cards.\nI have a quick q...,1


## Test Initial Eval

In [6]:
from helpers.loaders import eval_performance, eval_performance_as_str

print(eval_performance(model, test_ds, device, batch_size = 16, verbose = True))
print(eval_performance_as_str(model, examples_ds, device))

3it [00:02,  1.26it/s]


{'mean_nll': 1.0935607, 'accuracy': 0.275, 'count': 40}
Correct: 4/9
❌ [0.35] - Started searching for jobs and struggling
❌ [0.26] - Just received a new job offer with a big pay raise!
✅ [0.38] - My job is paying my coworker more than me. What do I do?
✅ [0.38] - Struggling to manage my credit card debt. I'm in $10k debt and they just keep piling up.
❌ [0.36] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.41] - Best budgeting app to use? Looking for a replacement for Mint.
❌ [0.37] - As a recent college graduate, I thought it wouldn't be that hard to find a job that paid $20 an hour that but I was wrong. There are no entry level jobs in my field or any job that could potentially get my foot in the door pay so little. What is the point of having a college degre

## Train

In [7]:
# Set train confs
conf = {
    'epochs': 5,
    'optim_lr': 2e-5,
    'sched_gamma': .5,
    'sched_steps': 400 # Drops every 400 * 16 iters
}

optimizer = torch.optim.Adam(model.parameters(), lr = conf['optim_lr'])
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = conf['sched_steps'], gamma = conf['sched_gamma'])                                                                                                  

In [15]:
# Create save dir
from helpers.storage import create_save_dir
import json

save_dir = create_save_dir(prompt_id = PROMPT_ID, label_key = LABEL_KEY)
with open(f'{save_dir}/conf.json', "w") as outfile:
    outfile.write(json.dumps(conf, indent = 4))

print([x/1e9 for x in torch.cuda.mem_get_info()])
torch.cuda.empty_cache()
[x/1e9 for x in torch.cuda.mem_get_info()]

[0.0, 8.589410304]


[6.352273408, 8.589410304]

In [16]:
# Known to work well: 3e-5 with no clipping; 2e-5 with clipping, gamma=.5
train_losses = []
test_losses = []

f = plotly.graph_objects.FigureWidget().add_scatter(name = 'train', x = [], y = []).add_scatter(name = 'test', x = [], y = [])
for e in range(conf['epochs']): f.add_vline(x = e * len(train_dl))
display(f)

for epoch in range(conf['epochs']):
    print(f'***** Epoch {epoch + 1} ')
    torch.cuda.empty_cache() 
    model.train()
    epoch_loss = 0
    
    for step, batch in tqdm(enumerate(train_dl)):
        
        optimizer.zero_grad()
        outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        loss = F.cross_entropy(outputs['logits'], batch['labels'].to(device))
        
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()

        # Logging and eval
        epoch_loss = epoch_loss + loss.item()
        train_losses.append({'epoch': epoch, 'step': step, 'loss': loss.item()})
        if step % 50 == 0 and (step > 0 or epoch > 0): 
            test_eval_res = eval_performance(model, test_ds, device, batch_size = 16, verbose = False)
            test_losses.append({'epoch': epoch, 'step': step, 'loss': test_eval_res['mean_nll']})
            print(
                f"Step {step}/{len(train_dl)} | " +
                f"Train Loss: {np.mean([t['loss'] for t in train_losses][-50:])} | " +
                f"Test Loss: {test_losses[-1]['loss']} | " +
                f"LR: {optimizer.param_groups[0]['lr']} | " + 
                f"RAM: {round(torch.cuda.memory_allocated(0)/1024**3,1)} GB"
            )
            print(eval_performance_as_str(model, examples_ds, device))
            # Update graph
            losses_df = pd.concat([
                pd.DataFrame(train_losses).assign(type = 'train', loss = lambda df: df['loss'].rolling(window = 50).mean()).dropna(axis = 0),
                pd.DataFrame(test_losses).assign(type = 'test')
            ]).assign(i = lambda df: (df['epoch'] * len(train_dl)) + df['step'] + 1)     
            f.data[0].x, f.data[1].x = [losses_df[losses_df['type'] == j]['i'].tolist() for j in ['train', 'test']]
            f.data[0].y, f.data[1].y = [losses_df[losses_df['type'] == j]['loss'].tolist() for j in ['train', 'test']]

    # Save ts and ckpt 
    torch.save({
        'epoch': epoch, 'step': step, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(),
        'train_losses': train_losses, 'test_losses': test_losses
    }, f"{save_dir}/epoch_{str(epoch + 1).rjust(3, '0')}.ckpt")
    
    ex = next(iter(test_dl))
    model_scripted = torch.jit.trace(model, (ex['input_ids'].to(device), ex['attention_mask'].to(device)), strict = False)
    model_scripted.save(f"{save_dir}/epoch_{str(epoch + 1).rjust(3, '0')}.pt")


FigureWidget({
    'data': [{'name': 'train', 'type': 'scatter', 'uid': 'cac16314-a498-4acc-a653-b98434b172ff', 'x': [], 'y': []},
             {'name': 'test', 'type': 'scatter', 'uid': '9aa01bab-2822-4bab-9c0a-239847edce93', 'x': [], 'y': []}],
    'layout': {'shapes': [{'type': 'line', 'x0': 0, 'x1': 0, 'xref': 'x', 'y0': 0, 'y1': 1, 'yref': 'y domain'},
                          {'type': 'line', 'x0': 10, 'x1': 10, 'xref': 'x', 'y0': 0, 'y1': 1, 'yref': 'y domain'},
                          {'type': 'line', 'x0': 20, 'x1': 20, 'xref': 'x', 'y0': 0, 'y1': 1, 'yref': 'y domain'},
                          {'type': 'line', 'x0': 30, 'x1': 30, 'xref': 'x', 'y0': 0, 'y1': 1, 'yref': 'y domain'},
                          {'type': 'line', 'x0': 40, 'x1': 40, 'xref': 'x', 'y0': 0, 'y1': 1, 'yref': 'y domain'}],
               'template': '...'}
})

***** Epoch 0 


10it [01:12,  7.22s/it]


***** Epoch 1 


0it [00:00, ?it/s]

Step 0/10 | Train Loss: 0.9848301952535455 | Test Loss: 0.9924437999725342 | LR: 2e-05 | RAM: 0.2 GB



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.

1it [00:09,  9.82s/it]

Correct: 8/9
✅ [0.42] - Started searching for jobs and struggling
✅ [0.38] - Just received a new job offer with a big pay raise!
❌ [0.27] - My job is paying my coworker more than me. What do I do?
✅ [0.45] - Struggling to manage my credit card debt. I'm in $10k debt and they just keep piling up.
✅ [0.43] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.43] - Best budgeting app to use? Looking for a replacement for Mint.
✅ [0.49] - As a recent college graduate, I thought it wouldn't be that hard to find a job that paid $20 an hour that but I was wrong. There are no entry level jobs in my field or any job that could potentially get my foot in the door pay so little. What is the point of having a college degree if full time jobs won't even allow me to move out and 

10it [01:13,  7.33s/it]


***** Epoch 2 


0it [00:00, ?it/s]

Step 0/10 | Train Loss: 0.8880707962172372 | Test Loss: 0.9721834659576416 | LR: 2e-05 | RAM: 0.2 GB



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.

1it [00:09,  9.74s/it]

Correct: 7/9
✅ [0.42] - Started searching for jobs and struggling
✅ [0.55] - Just received a new job offer with a big pay raise!
❌ [0.2] - My job is paying my coworker more than me. What do I do?
✅ [0.47] - Struggling to manage my credit card debt. I'm in $10k debt and they just keep piling up.
❌ [0.38] - Emotionally exhausted from this BS market
I just need to rant. I never cried so much within a week until now. I have bills to pay, rent is over 2k for a studio, there are no jobs here. I am so tired because this job market is exhausting and everyone is saying its fine.
✅ [0.43] - Best budgeting app to use? Looking for a replacement for Mint.
✅ [0.56] - As a recent college graduate, I thought it wouldn't be that hard to find a job that paid $20 an hour that but I was wrong. There are no entry level jobs in my field or any job that could potentially get my foot in the door pay so little. What is the point of having a college degree if full time jobs won't even allow me to move out and r

10it [01:14,  7.43s/it]


***** Epoch 3 


0it [00:00, ?it/s]

Step 0/10 | Train Loss: 0.7730229391205695 | Test Loss: 0.9322282671928406 | LR: 2e-05 | RAM: 0.2 GB


0it [00:09, ?it/s]


KeyboardInterrupt: 

In [None]:
# from peft import LoraConfig, TaskType, get_peft_model

# lora_config = LoraConfig(
#     task_type = TaskType.SEQ_CLS, r = 4, lora_alpha = 1, lora_dropout = 0.1
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()
# print(model.roberta.encoder.layer[0].attention.self)