In [2]:
import logging
import math
import os
import sys
import warnings
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
from datasets import load_dataset
from datasets import load_from_disk

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForPermutationLanguageModeling,
    DataCollatorWithPadding,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    XLNetConfig,
    XLNetLMHeadModel,
    set_seed,
)



from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch.utils.data import DataLoader

## Upstream Dataloading

In [7]:
dataset = load_from_disk("bookcorpus_train.hf")
dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2687711
})

In [8]:
type(dataset)

datasets.arrow_dataset.Dataset

In [16]:
DataLoader

torch.utils.data.dataloader.DataLoader

In [3]:
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [20]:
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1/6,
    max_span_length=5)

In [21]:
DataLoader(dataset.with_format("torch"), collate_fn=collator, batch_size=8)

<torch.utils.data.dataloader.DataLoader at 0x7f98e5e43450>

In [18]:
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

In [14]:
dict(XLNetLMHeadModel.from_pretrained("xlnet-base-cased").named_modules())['lm_loss']

Linear(in_features=768, out_features=32000, bias=True)

In [29]:
import torch
isinstance(model.base_model, torch.nn.Module)

True

## Correcting misspecified encoder

When I wrote the initial code for the pre-training of the model, I messed up and specified the entire XLNetLMHeadModel as the encoder, even though we just wanted the base model. This code is one-off; just adjusts the object so that it's correct!

In [37]:
# Run this to patch up inconsistency between 
from BayeTrans import priorBox
import sys

sys.modules['priorBox'] = priorBox

In [2]:
new_swag = torch.load("./231201_130955/swag_model1.pt")

NameError: name 'torch' is not defined

In [1]:
new_swag.base_model = new_swag.base_model.base_model

NameError: name 'new_swag' is not defined

In [44]:
torch.save(new_swag, "./231201_130955/swag_model_saved.pt")

## Scrap for Downstream Dataloader

Now, I need to rewrite the `prepare_data` method from solo_learn. Honestly all of solo_learn is so annoyingly written, and carries so much overhead that it's not worth dealing with. This is essentially a from-scratch rewrite that utilizes Huggingface to make things a lot easier for us.

In [3]:
dataset = load_dataset("ag_news")

Downloading builder script: 100%|██████████| 4.06k/4.06k [00:00<00:00, 4.06MB/s]
Downloading metadata: 100%|██████████| 2.65k/2.65k [00:00<?, ?B/s]
Downloading readme: 100%|██████████| 7.95k/7.95k [00:00<?, ?B/s]
Downloading data: 29.5MB [00:00, 36.4MB/s]                            
Downloading data: 1.86MB [00:00, 13.8MB/s]                  
Generating train split: 100%|██████████| 120000/120000 [00:03<00:00, 36353.54 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<00:00, 33370.82 examples/s]


In [4]:
dataset.save_to_disk("./data/ag_news")

Saving the dataset (1/1 shards): 100%|██████████| 120000/120000 [00:00<00:00, 2392825.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7600/7600 [00:00<00:00, 1224520.22 examples/s]


In [6]:
dataset = load_from_disk("./data/ag_news")
split_train_dataset = dataset['train'].train_test_split(test_size=0.1)
dataset['train'] = split_train_dataset['train']
dataset['eval'] = split_train_dataset['test']

In [84]:
dataset['train'][0]

{'text': 'Fossil of Great Ape Ancestor Found A new fossil found in Spain may be the closest yet to the common ancestor of all great apes, perhaps including humans, researchers say.',
 'label': 3}

In [85]:
tokenizer.model_max_length

1000000000000000019884624838656

In [7]:
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_agnews = dataset.map(preprocess_function, remove_columns=['text'], batched=True)
print("GOT HERE!")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(tokenized_agnews['train'], collate_fn=data_collator, batch_size=3)
val_dataloader = DataLoader(tokenized_agnews['eval'], collate_fn=data_collator, batch_size=3)

Map:   0%|          | 0/108000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 108000/108000 [00:10<00:00, 9912.61 examples/s] 
Map: 100%|██████████| 12000/12000 [00:01<00:00, 9839.18 examples/s] 

GOT HERE!





In [94]:
tokenized_agnews['train'][0].keys()

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [95]:
next(iter(train_dataloader)).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

## Understanding the Posterior Parameter Torch Objects

These dumbasses didn't specify what the mean.pt, variance.pt, or covmat.pt mean. Are these just tensors?
- mean.pt is just a tensor of means
- covmat.pt is just a tensor of nondiags; happens to be quite big!
- the model is obviously just the model and its weights
- variance.pt is the diagonal

In [6]:
mean = torch.load("./pretrain-res/resnet101_torchvision_mean.pt")

In [4]:
cov = torch.load("./pretrain-res/resnet101_torchvision_covmat.pt")

In [5]:
cov.shape

torch.Size([7, 42500160])

In [7]:
mean.shape

torch.Size([42500160])

## Connecting the dots: How to extract posterior parameter from our saved pre-train model

Now that we know that the mean, cov, and variance are just tensors, how can we pull these out of our model?

In [13]:
# Run this to patch up inconsistency between 
from BayeTrans import priorBox
import sys

sys.modules['priorBox'] = priorBox

new_swag = torch.load("./swag_model_saved.pt")

KeyboardInterrupt: 

## Attempting to use my own PC for training

In [116]:
torch.cuda.empty_cache()

In [8]:
device = torch.device('cuda')
batch = next(iter(train_dataloader)).to(device)
batch

You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    5,     5,     5,     5,  1534,    19,   768,   521, 11239,  2679,
            22,  7650,  5230,    31,   276,    32,   561,   933,    19,   768,
            19,  2714,    21,   298,    86,   452,   282,   339,  2831,  3522,
          2679,    22,  7650, 15872,  1604,    70,   100,    17, 17666,  4165,
          1464,   173,    24,   119,    31,  6670,    40,    18,   187,     4,
             3],
        [    5,     5,     5,     5,     5,     5, 31452, 23510,    23,  6589,
          1134,   359,   102, 17551,  1069,    64,    39,   102,  8691,   697,
            31,    81,  1108,    27,    24,  8152,     9,   130,  2163,    18,
           158,   313,   510, 17917,    64,    39,  3270,    24,  1081,    37,
         19170, 31452, 29832,   490,    97,    23,  2059,  3883,     9,     4,
             3],
        [ 2491,   947,   109,   987,  2012, 12977,  2491,    47,   618,    18,
           281,   164,    24, 14489,    20,    58,  2827,    28,   106,   399,
    

In [11]:

outputs = new_swag.base_model.forward(input_ids=batch['input_ids'], 
                                       token_type_ids=batch['token_type_ids'], 
                                       attention_mask=batch['attention_mask'])[0]

In [15]:
outputs[:, -1].shape

torch.Size([3, 768])

In [114]:
outputs.shape

torch.Size([3, 50, 768])

In [5]:
torch.cuda.is_available()

True

In [4]:
from BayeTrans.priorBox.sghmc.utils import run_and_log_bma
from BayeTrans.priorBox.sghmc.sghmc_model import SGLDModel

2023-12-09 13:28:38,645	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-12-09 13:28:38,849	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [5]:
checkpoint = torch.load("./epoch=2-step=20249.ckpt")
# new_model = SGLDModel()

In [6]:
checkpoint.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'callbacks', 'optimizer_states', 'lr_schedulers'])

In [9]:
my_state = torch.load("./epoch=2-step=20249.ckpt")['state_dict']

OrderedDict([('backbone.mask_emb',
              tensor([[[-3.8372e-03,  5.2487e-03, -6.4034e-03, -1.9146e-03,  5.0661e-03,
                        -5.7861e-03, -7.6627e-03,  8.1751e-03,  4.7334e-03, -4.4069e-03,
                        -6.8051e-03,  9.4366e-03, -1.0112e-03, -7.6539e-03, -2.2896e-03,
                         1.0482e-04,  1.5159e-02, -5.7098e-03,  2.7168e-03,  6.4857e-03,
                         6.3692e-03, -6.1706e-03, -7.1384e-03, -3.8927e-02,  1.7723e-02,
                         9.3109e-03,  2.5638e-03,  4.6703e-03,  5.6972e-03, -9.3661e-04,
                         7.5768e-04,  8.9215e-03, -9.5132e-04, -7.0269e-03,  2.4826e-03,
                        -8.3182e-03,  1.6540e-03, -5.0280e-03, -3.8653e-03,  3.8405e-03,
                         2.1969e-02, -3.3092e-03,  5.1259e-03, -4.8100e-03, -1.4904e-02,
                        -4.6103e-03, -1.0634e-03,  4.6946e-04, -8.2055e-03,  7.8151e-04,
                         6.4620e-03, -1.4167e-02,  5.0862e-04,  1.8549e-03,

In [7]:
with open("./logs/231208/bayes-transfer-nlp/trail/checkpoints/trail/checkpoints/epoch=1-step=1.ckpt", "r") as f:
    print("hello")


FileNotFoundError: [Errno 2] No such file or directory: './logs/231208/bayes-transfer-nlp/trail/checkpoints/trail/checkpoints/epoch=1-step=1.ckpt'

In [None]:
C:\Users\alanj\projects\bayes-nlp-transfer\logs\231208\bayes-transfer-nlp
C:\Users\alanj\projects\bayes-nlp-transfer\logs\231208\bayes-transfer-nlp\trail\checkpoints\epoch=2-step=20249.ckpt