In [2]:
from hypformer import HypFormer
from manifolds.layer import Optimizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
from datasets import Dataset, load_dataset
import lightning as L

  from .autonotebook import tqdm as notebook_tqdm


## Ensure HypFormer works

In [3]:
# Seed torch's global RNG so the random tensor below is reproducible.
torch.manual_seed(69)

# Dimensions of the sanity-check tensor (_N rows, _D columns).
_N = 50
_D = 100
# Not used in this cell.
_V = 5

x = torch.randn(_N, _D)

class Args:
    """Plain attribute bag with the settings handed to HypFormer and the optimizer wrapper."""

    def __init__(self):
        # Declared as a table and copied onto the instance in order, so the
        # resulting attributes match a sequence of direct assignments.
        settings = {
            'k_in': 1.0,
            'k_out': 1.0,
            'decoder_type': 'hyp',
            'device': 'cpu',
            'add_positional_encoding': True,
            'attention_type': 'full',
            'power_k': 2,
            'trans_heads_concat': False,
            'optimizer_type': 'adam',
            'hyp_optimizer_type': 'radam',
            'weight_decay': 0.0,
            'hyp_weight_decay': 0.005,
            'lr': 0.01,
            'hyp_lr': 0.01,
            'batch_size': 32,
        }
        for name, value in settings.items():
            setattr(self, name, value)


args = Args()

# model = HypFormer(
#     in_channels=768,
#     hidden_channels=768,
#     out_channels=30000,
#     trans_num_layers=12,
#     trans_num_heads=12,
#     trans_dropout=0.1,
#     trans_use_bn=True,
#     trans_use_residual=True,
#     trans_use_act=True,
#     args=args
# )

# model(ex) 

## Make a hyperbolic BERT from HypFormer

In [4]:
# NSP and MLM heads to train BERT
class NSPHead(nn.Module):
    """Next-sentence-prediction head: a 2-way classifier over the first position.

    Args:
        hidden: size of the last dimension of the tensor passed to ``forward``.
    """

    def __init__(self, hidden):
        super().__init__()
        # NSP projects the hidden state to R^2 to choose 1 of 2 classes.
        self.linear = nn.Linear(hidden, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return log-probabilities over the two classes.

        Only index 0 along dim 1 of ``x`` is used.
        """
        # assumes x is [bsz, seq_len, hidden] with the classification token first — TODO confirm
        first_position = x[:, 0]
        # The attribute is ``softmax``; the previous ``self.sofmax`` spelling
        # raised AttributeError on the first forward call.
        return self.softmax(self.linear(first_position))
    
class MLMHead(nn.Module):
    """Masked-language-modeling head: hidden states -> log-probabilities over the vocabulary."""

    def __init__(self, model):
        super().__init__()
        # MLM projects to R^{vocab} to choose a token from the vocab; both
        # sizes are read off the model the head is attached to.
        in_features, out_features = model.hidden_dim, model.vocab_size
        self.linear = nn.Linear(in_features, out_features)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        logits = self.linear(x)
        return self.softmax(logits)

In [5]:
class HUBERT(nn.Module):
    """Token embedding followed by a HypFormer stack.

    Args:
        vocab_size: number of rows in the embedding table.
        args: settings object forwarded unchanged to ``HypFormer``.
        hidden_dim: embedding width; also used as HypFormer's in/hidden/out channels.
        n_layers: passed to HypFormer as ``trans_num_layers``.
        attn_heads: passed to HypFormer as ``trans_num_heads``.
        dropout: passed to HypFormer as ``trans_dropout``.
    """

    def __init__(self, vocab_size, args, hidden_dim=768, n_layers=12, attn_heads=12, dropout=0.1):
        super().__init__()
        # Exposed so heads such as MLMHead can size themselves from the encoder.
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.layers = HypFormer(
            in_channels=hidden_dim,
            hidden_channels=hidden_dim,
            out_channels=hidden_dim,
            trans_num_layers=n_layers,
            trans_num_heads=attn_heads,
            trans_dropout=dropout,
            args=args
        )

    def forward(self, x, mask=None):
        """Embed token ids and run them through the HypFormer stack.

        Args:
            x: integer token ids, [bsz, seq_len].
            mask: optional. When omitted, a mask is derived from ``x`` assuming
                the pad token id is 0. A 2-D [bsz, seq_len] mask (non-zero =
                keep) is expanded to [bsz, 1, seq_len, seq_len]; a mask of any
                other rank is forwarded untouched.

        Returns:
            Whatever ``self.layers`` returns for the embedded input.
        """
        if mask is None:
            # No mask provided: assume the pad token is 0 and mask pad tokens.
            # (The previous `is not None` test did the opposite: it threw away
            # a supplied mask and left the default case unmasked.)
            mask = x > 0
        if mask.dim() == 2:
            # Same [bsz, 1, seq_len, seq_len] boolean layout this method has
            # always built for HypFormer.
            # NOTE(review): assumes HypFormer expects this layout — confirm.
            mask = mask.bool().unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
        x = self.embedding(x)  # [bsz, seq_len, hidden_dim]
        return self.layers(x, mask=mask)

        
class HUBERTForLM(nn.Module):
    """
    HUBERT encoder with a masked-language-modeling head on top
    """
    def __init__(self, hubert: HUBERT):
        super().__init__()
        self.hubert = hubert
        # The head reads its input/output sizes from the encoder it is given.
        self.mlm = MLMHead(hubert)

    def forward(self, x, mask=None):
        # Encode first, then map every position to vocabulary log-probabilities.
        return self.mlm(self.hubert(x, mask=mask))

In [117]:
# Debug probe: evaluates the tokenizer's `cls_token` attribute.
# NOTE(review): `cfg` is created in a later cell (`cfg = Cfg()`), so this cell
# only works after that one has run.
cfg.tokenizer.cls_token

In [47]:
# Special tokens are added to the tokenizer, so the embedding is sized with
# len(tokenizer) rather than tokenizer.vocab_size.
hubie = HUBERT(vocab_size=len(cfg.tokenizer), args=args, n_layers=2, attn_heads=2)
biggerhubie = HUBERTForLM(hubert=hubie)

## Make HUBERT loader

In [7]:
# NOTE: the dropped-column list below is currently hardcoded for FineFineWeb; generalize later.
def get_loaders(hf_link: str, cfg):
    """Build a DataLoader that streams, tokenizes and MLM-collates a Hugging Face dataset.

    Args:
        hf_link: dataset identifier passed to ``load_dataset``.
        cfg: object exposing ``tokenizer`` and ``batch_size``.

    Returns:
        A ``DataLoader`` over the tokenized ``train`` split, batched with
        ``DataCollatorForLanguageModeling``.
    """
    # Metadata columns dropped before tokenization (fixed list).
    unused_columns = ['language_score', 'date', 'url', 'file_path', 'dump', 'global_id', 'lang', 'domain', 'token_count']

    def tokenize(ex):
        # Called on batches of examples because the map below uses batched=True.
        return cfg.tokenizer(
            ex['text'],
            truncation=True,
            padding=True,
            add_special_tokens=True,
        )

    stream = load_dataset(hf_link, streaming=True, split='train')
    stream = stream.remove_columns(unused_columns)
    stream = stream.with_format(type='torch')
    tokenized = stream.map(tokenize, batched=True, remove_columns=['text'])

    mlm_collator = DataCollatorForLanguageModeling(tokenizer=cfg.tokenizer)
    return DataLoader(tokenized, batch_size=cfg.batch_size, collate_fn=mlm_collator)

In [37]:
class Cfg:
    """Data-pipeline settings plus the tokenizer built from them."""

    def __init__(self):
        # The hub id lives in its own attribute; `tokenizer` only ever holds
        # the loaded tokenizer object. Previously one attribute served both
        # roles, so a second make_tokenizer() call passed a tokenizer object
        # to from_pretrained instead of the id.
        self.tokenizer_name = 'allenai/eleuther-ai-gpt-neox-20b-pii-special'
        self.batch_size = 32
        self.model_max_len = 512

        self.make_tokenizer()

    def make_tokenizer(self):
        """Load the tokenizer named by ``tokenizer_name`` and store it on ``self.tokenizer``.

        Sets the max length from ``model_max_len``, reuses the EOS token as the
        pad token, and registers ``[MASK]`` as the mask token. Safe to call
        more than once. Returns ``None``.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, return_special_tokens_mask=True)
        tokenizer.model_max_length = self.model_max_len
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.add_special_tokens({'mask_token': '[MASK]'})
        self.tokenizer = tokenizer
    
# Build the config (its constructor loads the tokenizer), then the streaming loader.
cfg = Cfg()

loader = get_loaders(hf_link='m-a-p/FineFineWeb-sample', cfg=cfg)

## Pretrain HUBERT

In [49]:
class LightningWrapper(L.LightningModule):
    """
    Wrapper around models for distributed training via PyTorch Lightning
    """
    def __init__(self, model, hyp_args):
        super().__init__()
        # Manual optimization: Euclidean and hyperbolic params get separate
        # optimizers, both managed by the Optimizer wrapper below.
        self.automatic_optimization = False
        self.model = model
        self.args = hyp_args
        self.criterion = nn.CrossEntropyLoss()
        self.opts = Optimizer(model, hyp_args)

    def training_step(self, batch, batch_idx):
        """Run one manually-optimized step on a collated batch and log the loss."""
        token_ids = batch['input_ids']
        attn_mask = batch['attention_mask']
        targets = batch['labels']

        # Swap dims 1 and 2 so the class dimension sits at dim 1 for the loss.
        logits = self.model(token_ids, mask=attn_mask).transpose(1, 2)
        loss = self.criterion(logits, targets)

        self.opts.zero_grad()
        self.manual_backward(loss)
        self.opts.step()

        self.log_dict({'loss': loss}, prog_bar=True)

    def configure_optimizers(self):
        """Hand Lightning the optimizer(s) held by the Optimizer wrapper."""
        return self.opts.optimizer

In [50]:
# TODO: Need to figure out how to use custom optimizers with Lightning to allow for
# Euclidean and hyperbolic optimization.

# Wrap the MLM model together with the settings the optimizer wrapper reads.
wrapped_model = LightningWrapper(biggerhubie, args)
# NOTE(review): Trainer() is created with no arguments; the recorded output of
# this cell reports `max_epochs` defaulting to 1000 and no GPU in use.
# Consider setting these explicitly.
trainer = L.Trainer()
trainer.fit(wrapped_model, loader)

/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/ ...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.


>> Number of Euclidean parameters: 87930223
>> Number of Hyperbolic parameters: 590592


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | HUBERTForLM      | 88.5 M | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
88.5 M    Trainable params
3         Non-trainable params
88.5 M    Total params
354.083   Total estimated model params size (MB)
67        Modules in train mode
0         Modules in eval mode
/n/home11/cbrownpinilla/.conda/envs/hyperfilter/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottle

Epoch 0: |          | 1/? [00:55<00:00,  0.02it/s, v_num=9, loss=427.0]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [44]:
# Shape check on a small, freshly initialised model fed random token ids.
toy_model = HUBERT(5000, args=args, hidden_dim=64, n_layers=1, attn_heads=2)
toy_model_lm = HUBERTForLM(toy_model)
# Random ids in [0, 5000) with shape [64, 512].
toy_data = torch.randint(0, 5000, (64, 512))

# Both calls omit `mask`, so forward() is invoked with its default mask=None.
print(f'transformed shape: {toy_model(toy_data).shape}')
print(f'lm shape: {toy_model_lm(toy_data).shape}')

# NOTE(review): indexing [0, 0, 0] already selects a single element, so
# torch.sum here sums just one value.
torch.sum(toy_model_lm(toy_data)[0, 0, 0])

transformed shape: torch.Size([64, 512, 64])
lm shape: torch.Size([64, 512, 5000])


tensor(-100.8041, grad_fn=<SumBackward0>)