In [2]:
from models.hypformer import HypFormer
from manifolds.layer import Optimizer
from models.hubert import HUBERT, HUBERTForLM
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger
import torchmetrics.functional as FM
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
from datasets import Dataset, load_dataset
import lightning as L
from utils import get_loaders
import wandb
from transformers import AutoModelForMaskedLM, AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Args:
    def __init__(self):
        self.k_in = 1.0
        self.k_out = 1.0
        self.decoder_type = 'euc'
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu' # If using HUBERTLightning, will be overwritten
        self.add_positional_encoding = True
        self.attention_type = 'full'
        self.power_k = 2
        self.trans_heads_concat = False
        self.optimizer_type = 'adam'
        self.hyp_optimizer_type = 'radam'
        self.weight_decay = 0.0
        self.hyp_weight_decay = 0.005
        self.lr = 1e-3
        self.hyp_lr = 1e-3

class Cfg:
    def __init__(self):
        self.save_path = '/n/netscratch/sham_lab/Everyone/cbrownpinilla/hyperfilter/models/hyperbolic_bert'
        self.tokenizer = 'allenai/eleuther-ai-gpt-neox-20b-pii-special'
        self.batch_size = 1
        self.model_max_len = 512
        self.hidden_dim = 768
        self.n_layers = 1
        self.n_heads = 1
        self.dropout = 0.1
        self.max_duration = 100
        self.validation_interval = 10

        self.make_tokenizer()
        self.vocab_size = len(self.tokenizer)
        self.padding_idx = 0

    def make_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, return_special_tokens_mask=True)
        tokenizer.model_max_length = self.model_max_len
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.add_special_tokens({'mask_token': '[MASK]'})
        self.tokenizer = tokenizer
        return 

In [4]:
class HUBERTLightning(L.LightningModule):
    """
    Wrapper around Hubert models for distributed training via Pytorch Lightning
    """
    def __init__(self, model_cfg, hyp_args):
        super().__init__()
        self.args = hyp_args

        self.hubert = HUBERT(self.args, model_cfg)
        self.model = HUBERTForLM(self.hubert)
        self.opts = Optimizer(self.model, self.args)
        self.automatic_optimization = False # We use 2 optimizers for euc/hyp params 
        self.criterion = nn.CrossEntropyLoss()

        self.save_hyperparameters(ignore=['model'])
        self.callbacks = [
            ModelCheckpoint(dirpath='/n/netscratch/sham_lab/Everyone/cbrownpinilla/models/hubert',
                            monitor='val_acc',
                            mode='max',
                            filename='hubert-{step}-{val_loss:.2f}',
                            every_n_train_steps=model_cfg.validation_interval,
                            save_last=True)
        ]
        self.total_throughput = 0 # Monitor how many tokens our model sees


    def iteration_step(self, batch):
        x, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        if self.training:
            self.total_throughput += batch['input_ids'].view(-1).shape[0] # process (bsz * seq_len) tokens
        logits = self.model(x, mask=mask).transpose(1, 2)
        preds = torch.argmax(logits, dim=1)
        acc = FM.accuracy(preds, 
                          labels, 
                          task='multiclass',
                          num_classes=self.model.hubert.vocab_size,
                          ignore_index=-100)
        loss = self.criterion(logits, labels)
        return loss, acc
    
    def training_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        opts = self.opts
        opts.zero_grad()
        self.manual_backward(loss)
        # log gradient norm
        grad_norm = torch.sqrt(
            sum(p.grad.pow(2).sum() for p in self.model.parameters() if p.grad is not None)
        )
        self.log('grad_norm', grad_norm)
        # log learning rate(s)
        for opt in self.opts.optimizer:
            for i, pg in enumerate(opt.param_groups):
                self.log(f'lr_group_{i}', pg['lr'])
        opts.step()

        self.log('train_loss', loss)
        self.log('train_acc', acc)
        self.log('throughput', self.total_throughput)

    def validation_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        self.log("val_loss", loss, sync_dist=True)
        self.log("val_acc", acc, sync_dist=True)

    def test_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        self.log("test_loss", loss)
        self.log("test_acc", acc)
        
    
    def configure_optimizers(self):
        return self.opts.optimizer

In [5]:
class BERTLightning(L.LightningModule):
    """
    Wrapper around Hubert models for distributed training via Pytorch Lightning
    """
    def __init__(self, model, model_cfg, hyp_args):
        super().__init__()
        self.args = hyp_args
        self.cfg = model_cfg

        self.model = model
        self.opts = Optimizer(self.model, self.args)
        self.automatic_optimization = False # We use 2 optimizers for euc/hyp params 
        self.criterion = nn.CrossEntropyLoss()

        self.save_hyperparameters(ignore=['model'])
        self.callbacks = [
            ModelCheckpoint(dirpath='/n/netscratch/sham_lab/Everyone/cbrownpinilla/models/hubert',
                            monitor='val_acc',
                            mode='max',
                            filename='hubert-{step}-{val_loss:.2f}',
                            every_n_train_steps=model_cfg.validation_interval,
                            save_last=True)
        ]
        self.total_throughput = 0 # Monitor how many tokens our model sees


    def iteration_step(self, batch):
        x, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        btch = {'input_ids': x, 'attention_mask': mask, 'labels': labels}
        if self.training:
            self.total_throughput += batch['input_ids'].view(-1).shape[0] # process (bsz * seq_len) tokens
        logits = self.model(**btch).logits.transpose(1, 2)
        preds = torch.argmax(logits, dim=1)
        acc = FM.accuracy(preds, 
                          labels, 
                          task='multiclass',
                          num_classes=self.cfg.vocab_size,
                          ignore_index=-100)
        loss = self.criterion(logits, labels)
        return loss, acc
    
    def training_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        opts = self.opts
        opts.zero_grad()
        self.manual_backward(loss)
        # log gradient norm
        grad_norm = torch.sqrt(
            sum(p.grad.pow(2).sum() for p in self.model.parameters() if p.grad is not None)
        )
        self.log('grad_norm', grad_norm)
        # log learning rate(s)
        for opt in self.opts.optimizer:
            for i, pg in enumerate(opt.param_groups):
                self.log(f'lr_group_{i}', pg['lr'])
        opts.step()

        self.log('train_loss', loss)
        self.log('train_acc', acc)
        self.log('throughput', self.total_throughput)

    def validation_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        self.log("val_loss", loss, sync_dist=True)
        self.log("val_acc", acc, sync_dist=True)

    def test_step(self, batch, batch_idx):
        loss, acc = self.iteration_step(batch)

        self.log("test_loss", loss)
        self.log("test_acc", acc)
        
    
    def configure_optimizers(self):
        return self.opts.optimizer

In [6]:
def get_loaders(cfg):
    train, test, val = _get_loaders('train', cfg), _get_loaders('test', cfg), _get_loaders('validation', cfg)
    collator = DataCollatorForLanguageModeling(tokenizer=cfg.tokenizer)
    train_loader = DataLoader(train, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1)
    test_loader = DataLoader(test, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1)
    val_loader = DataLoader(val, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1)
    return train_loader, test_loader, val_loader

def _get_loaders(split: str, cfg):
    assert split in ['train', 'test', 'validation'], "Choose one of 'train', 'test', 'validation'"
    key = f'-{split}' if split in ['test', 'validation'] else ''
    ds = load_dataset('m-a-p/FineFineWeb'+key, streaming=True, split='train').remove_columns(['language_score', 'date', 'url', 'file_path', 'dump', 'global_id', 'lang', 'domain', 'token_count'])
    ds = ds.with_format(type='torch')
    ds = ds.map(lambda ex: cfg.tokenizer(ex['text'], 
                                                   truncation=True, 
                                                   padding=True,
                                                   add_special_tokens=True), 
                            batched=True, remove_columns=['text'])
    return ds

def get_toy_loaders(cfg):
    train, test, val = _get_toy_loaders('train', cfg), _get_toy_loaders('test', cfg), _get_toy_loaders('validation', cfg)
    collator = DataCollatorForLanguageModeling(tokenizer=cfg.tokenizer)
    train_loader = DataLoader(train, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1, shuffle=False)
    test_loader = DataLoader(test, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1, shuffle=False)
    val_loader = DataLoader(val, batch_size=cfg.batch_size, collate_fn=collator, num_workers=1, shuffle=False)
    return train_loader, test_loader, val_loader

def _get_toy_loaders(split: str, cfg):
    assert split in ['train', 'test', 'validation'], "Choose one of 'train', 'test', 'validation'"
    key = f'-{split}' if split in ['test', 'validation'] else ''
    slicemap = {'train': 320, 'test': 32, 'validation': 32}
    ds = load_dataset('m-a-p/FineFineWeb'+key, streaming=True, split='train').remove_columns(['language_score', 'date', 'url', 'file_path', 'dump', 'global_id', 'lang', 'domain', 'token_count']).take(slicemap[split])
    ds = ds.with_format(type='torch')
    ds = ds.map(lambda ex: cfg.tokenizer(ex['text'], 
                                                   truncation=True, 
                                                   padding=True,
                                                   add_special_tokens=True), 
                            batched=True, remove_columns=['text'])
    return ds

In [7]:
hyp_args, model_config = Args(), Cfg()
train_loader, test_loader, val_loader = get_loaders(model_config)
model = HUBERTLightning(model_config, hyp_args)

>> Total trainable parameters: 120039532
>> Total non-embedding Euclidean parameters: 42807916
>> Number of Hyperbolic parameters: 0


In [None]:
wandb.login()
wandb_logger = WandbLogger(project='HUBERT', log_model='all')

hyp_args, model_config = Args(), Cfg()
train_loader, test_loader, val_loader = get_loaders(model_config)
model = HUBERTLightning(model_config, hyp_args)

trainer = L.Trainer(
    devices='auto',
    num_nodes=1,
    strategy='auto',
    max_steps=model_config.max_duration,
    val_check_interval=model_config.validation_interval,
    logger=wandb_logger,
    default_root_dir=model_config.save_path,
    callbacks=model.callbacks,
)
wandb_logger.watch(model)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

## Test that it is only model arch. failing by training BERT with everyting else equal

In [7]:
class Cfg:
    def __init__(self):
        self.save_path = '/n/netscratch/sham_lab/Everyone/cbrownpinilla/hyperfilter/models/hyperbolic_bert'
        self.tokenizer = 'distilbert/distilroberta-base'
        self.batch_size = 1
        self.model_max_len = 512
        self.hidden_dim = 768
        self.n_layers = 1
        self.n_heads = 1
        self.dropout = 0.1
        self.max_duration = 100
        self.validation_interval = 10

        self.make_tokenizer()
        self.vocab_size = len(self.tokenizer)
        self.padding_idx = 0

    def make_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, return_special_tokens_mask=True)
        tokenizer.model_max_length = self.model_max_len
        tokenizer.pad_token = tokenizer.eos_token
        # tokenizer.add_special_tokens({'mask_token': '[MASK]'})
        self.tokenizer = tokenizer
        return 

In [8]:
model_config, hyp_args = Cfg(), Args()
train_loader, test_loader, val_loader = get_toy_loaders(model_config)
bert_config = AutoConfig.from_pretrained('distilbert/distilroberta-base')
bert = AutoModelForMaskedLM.from_config(bert_config)

In [9]:
model = BERTLightning(bert, model_config, hyp_args)
wandb.login()
wandb_logger = WandbLogger(project='HUBERT', log_model='all', name='roberta-overfit', entity='harvardml')

trainer = L.Trainer(
    devices='auto',
    num_nodes=1,
    strategy='auto',
    max_steps=model_config.max_duration,
    val_check_interval=model_config.validation_interval,
    logger=wandb_logger,
    default_root_dir=model_config.save_path,
    callbacks=model.callbacks,
)
wandb_logger.watch(model)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


>> Total trainable parameters: 121170777
>> Total non-embedding Euclidean parameters: 43169625
>> Number of Hyperbolic parameters: 0


[34m[1mwandb[0m: Currently logged in as: [33mcbrownpinilla[0m ([33mcolor-exps[0m). Use [1m`wandb login --relogin`[0m to force relogin
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/ ...
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Currently logged in as: [33mcbrownpinilla[0m ([33mharvardml[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/ ...

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | RobertaForMaskedLM | 82.2 M | train
1 | criterion | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
82.2 M    Trainable params
0         Non-trainable params
82.2 M    Total params
328.681   Total estimated model params size (MB)
123       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})


Epoch 0: |          | 313/? [24:06<00:00,  0.22it/s, v_num=sn2q]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

## Ensure HypFormer works

In [3]:
torch.manual_seed(69)
_N, _D = 50, 100
_V = 5

x = torch.randn(_N, _D)

class Args:
    def __init__(self):
        self.k_in = 1.0
        self.k_out = 1.0
        self.decoder_type = 'hyp'
        self.device = 'cpu'
        self.add_positional_encoding = True
        self.attention_type = 'full'
        self.power_k = 2
        self.trans_heads_concat = False
        self.optimizer_type = 'adam'
        self.hyp_optimizer_type = 'radam'
        self.weight_decay = 0.0
        self.hyp_weight_decay = 0.005
        self.lr = 0.01
        self.hyp_lr = 0.01
        self.batch_size = 32

args = Args()

# model = HypFormer(
#     in_channels=768,
#     hidden_channels=768,
#     out_channels=30000,
#     trans_num_layers=12,
#     trans_num_heads=12,
#     trans_dropout=0.1,
#     trans_use_bn=True,
#     trans_use_residual=True,
#     trans_use_act=True,
#     args=args
# )

# model(ex) 

## Make a hyperbolic BERT from HypFormer

In [4]:
# NSP and MLM heads to train BERT
class NSPHead(nn.Module):
    def __init__(self, hidden):
        
        # NSP projects logits to R^2 to choose 1 of 2 classes
        super().__init__()
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.sofmax(self.linear(x[:,0]))
    
class MLMHead(nn.Module):
    def __init__(self, model):

        # MLM projects logits to R^{vocab} to choose token from voacab
        super().__init__()
        self.linear = nn.Linear(model.hidden_dim, model.vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))

In [5]:
class HUBERT(nn.Module):
    def __init__(self, vocab_size, args, hidden_dim=768, n_layers=12, attn_heads=12, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.layers = HypFormer(
            in_channels=hidden_dim,
            hidden_channels=hidden_dim,
            out_channels=hidden_dim,
            trans_num_layers=n_layers,
            trans_num_heads=attn_heads,
            trans_dropout=dropout,
            args=args
        )

    def forward(self, x, mask=None):
        if mask is not None:
            # If no mask provided, assumes pad token is 0 and masks pad tokens
            mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        x = self.embedding(x) #[bsz, seq_len, hidden_dim]
        return self.layers(x, mask=mask)

        
class HUBERTForLM(nn.Module):
    """
    Hyperbolic BERT with Masked Language Modeling head
    """
    def __init__(self, hubert: HUBERT):
        super().__init__()
        self.hubert = hubert
        self.mlm = MLMHead(hubert)

    def forward(self, x, mask=None):
        x = self.hubert(x, mask=mask)
        return self.mlm(x)

In [117]:
cfg.tokenizer.cls_token

In [47]:
# We add special tokens, so must use len(tokenizer) and not tokenizer.vocab_size
hubie = HUBERT(len(cfg.tokenizer), args, n_layers=2, attn_heads=2)
biggerhubie = HUBERTForLM(hubie)

## Make HUBERT loader

In [7]:
# For right now might be hardcoded for finefineweb, but might fix later?
def get_loaders(hf_link: str, cfg):
    dataset = load_dataset(hf_link, streaming=True, split='train').remove_columns(['language_score', 'date', 'url', 'file_path', 'dump', 'global_id', 'lang', 'domain', 'token_count'])
    dataset = dataset.with_format(type='torch')
    dataset = dataset.map(lambda ex: cfg.tokenizer(ex['text'], 
                                                   truncation=True, 
                                                   padding=True,
                                                   add_special_tokens=True), 
                            batched=True, remove_columns=['text'])
    collator = DataCollatorForLanguageModeling(tokenizer=cfg.tokenizer)
    return DataLoader(dataset, batch_size=cfg.batch_size, collate_fn=collator)

In [37]:
class Cfg:
    def __init__(self):
        self.tokenizer = 'allenai/eleuther-ai-gpt-neox-20b-pii-special'
        self.batch_size = 32
        self.model_max_len = 512

        self.make_tokenizer()

    def make_tokenizer(self):
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, return_special_tokens_mask=True)
        tokenizer.model_max_length = self.model_max_len
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.add_special_tokens({'mask_token': '[MASK]'})
        self.tokenizer = tokenizer
        return 
    
cfg = Cfg()

loader = get_loaders('m-a-p/FineFineWeb-sample', cfg)

## Pretain HUBERT

In [49]:
class LightningWrapper(L.LightningModule):
    """
    Wrapper around moodels for distributed training via PytorchLightning
    """
    def __init__(self, model, hyp_args):
        super().__init__()
        # We use 2 optimizers for euc/hyp params 
        self.automatic_optimization = False
        self.model = model
        self.args = hyp_args
        self.criterion = nn.CrossEntropyLoss()
        self.opts = Optimizer(model, hyp_args)

    def training_step(self, batch, batch_idx):
        opts = self.opts
        x, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        pred = self.model(x, mask=mask).transpose(1, 2)
        loss = self.criterion(pred, labels)
        opts.zero_grad()
        self.manual_backward(loss)
        opts.step()

        self.log_dict({'loss': loss}, prog_bar=True)
    
    def configure_optimizers(self):
        return self.opts.optimizer

In [50]:
# Need to figure out how to use custom optimizers with lightning to allow for
# euc and hyp optimization

wrapped_model = LightningWrapper(biggerhubie, args)
trainer = L.Trainer()
trainer.fit(wrapped_model, loader)

/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/ ...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.


>> Number of Euclidean parameters: 87930223
>> Number of Hyperbolic parameters: 590592


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | HUBERTForLM      | 88.5 M | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
88.5 M    Trainable params
3         Non-trainable params
88.5 M    Total params
354.083   Total estimated model params size (MB)
67        Modules in train mode
0         Modules in eval mode
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottle

Epoch 0: |          | 1/? [00:55<00:00,  0.02it/s, v_num=9, loss=427.0]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [44]:
toy_model = HUBERT(5000, args=args, hidden_dim=64, n_layers=1, attn_heads=2)
toy_model_lm = HUBERTForLM(toy_model)
toy_data = torch.randint(0, 5000, (64, 512))

print(f'transformed shape: {toy_model(toy_data).shape}')
print(f'lm shape: {toy_model_lm(toy_data).shape}')

torch.sum(toy_model_lm(toy_data)[0, 0, 0])

transformed shape: torch.Size([64, 512, 64])
lm shape: torch.Size([64, 512, 5000])


tensor(-100.8041, grad_fn=<SumBackward0>)

# Test new additions

In [None]:
import lightning as L

from models.hubert import REVISEDHUBERTLightning, HFLightning
from configs.config import BaseConfig
from data.dataloaders import build_train_dataloader, build_eval_dataloader

base_yaml = '/n/netscratch/sham_lab/Everyone/cbrownpinilla/hyperfilter/hyperbolic-transformer/Hypformer/hubert/configs/base.yaml'
overwrite = '/n/netscratch/sham_lab/Everyone/cbrownpinilla/hyperfilter/hyperbolic-transformer/Hypformer/hubert/configs/train_roberta.yaml'
base_config = BaseConfig(base_yaml, overwrite)
if base_config.which_model == 'hubert':
    model = REVISEDHUBERTLightning(base_config)
else:
    model = HFLightning(base_config)

train_loader = build_train_dataloader(base_config)
eval_loader = build_eval_dataloader(base_config)

trainer = L.Trainer(
    devices='auto',
    max_steps=base_config.max_duration,
    val_check_interval=base_config.validation_interval,
    default_root_dir=base_config.save_folder,
    callbacks=model.callbacks,
    log_every_n_steps=base_config.log_interval
)

trainer.fit(model, train_loader, eval_loader)


  from .autonotebook import tqdm as notebook_tqdm
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


>> Total trainable parameters: 121195369
>> Total non-embedding Euclidean parameters: 43169641
>> Number of Hyperbolic parameters: 0
Constructing optimizer with 106 parameter groups...
Loaded data from: pretrain
Saving global data order indices...


/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/ ...
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Global data order indices saved to '%s' ../ckpts/roBERTa-Pretrain/train_data/global_indices.npy
Loaded data from: eval


/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /n/netscratch/sham_lab/Everyone/cbrownpinilla/hyperfilter/hyperbolic-transformer/Hypformer/ckpts/roBERTa-Pretrain exists and is not empty.

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | RobertaForMaskedLM | 82.2 M | train
1 | criterion | CrossEntropyLoss   | 0      | train
---------------------------------------------------------
82.2 M    Trainable params
0         Non-trainable params
82.2 M    Total params
328.730   Total estimated model params size (MB)
123       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:25<00:25,  0.04it/s]

/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 32. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0: |          | 0/? [00:00<?, ?it/s]                                 