In [1]:
import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig, AutoModel
from transformers.models.bloom.modeling_bloom import BloomBlock, build_alibi_tensor
from transformers import BloomTokenizerFast, BloomForCausalLM
from transformers import DataCollatorForLanguageModeling

from datasets import load_dataset
from torch.utils.data import DataLoader

In [4]:
%%time
# Load the tokenizer that belongs to the "bigscience/bloom-560m" checkpoint;
# the %%time magic reports how long the whole cell takes.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

Downloading:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

CPU times: user 1.02 s, sys: 130 ms, total: 1.15 s
Wall time: 7.11 s


In [6]:
%%time
# Load the "bigscience/bloom-560m" weights through the generic AutoModel entry point.
# Timed because the first call downloads the checkpoint (about 1.12 GB per the progress bar).
model = AutoModel.from_pretrained("bigscience/bloom-560m")

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

CPU times: user 33.3 s, sys: 8.4 s, total: 41.7 s
Wall time: 2min 5s


In [29]:
# Open the English deduplicated OSCAR train split in streaming mode.
dataset = load_dataset(
    'oscar',
    "unshuffled_deduplicated_en",
    split='train',
    streaming=True,
)
# Keep a separately named shuffled view; `dataset` itself stays in stream order.
shuffled_dataset = dataset.shuffle(buffer_size=10_000, seed=42)

In [30]:
# Pull one example from the stream and print each of its fields.
first_example = next(iter(dataset))
for key, value in first_example.items():
    print(key, value)

id 0
text Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi. Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help.
Established in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family. Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family. The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs.
Mtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters. This family environment is one that many of the children have never previously exp

In [31]:
# Materialise the first two streamed examples as a plain list.
batch = [example for example in dataset.take(2)]

In [32]:
# Inspect the first of the two examples collected above.
batch[0]

{'id': 0,
 'text': 'Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi. Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help.\nEstablished in honor of John & Lindy’s son, Christopher Blanchard, this particular program is very dear to the Blanchard family. Dana Blanchard, or Mama Dana as she is more commonly referred to at Mtendere, lived on site during the initial development, and she returns each summer to spend the season with her Malawian family. The heart of the program is to be His hands and feet by caring for the children at Mtendere, and meeting their spiritual, physical, academic, and emotional needs.\nMtendere Village is home to 134 children, living in 16 homes with a housemother and several brothers and sisters. This family environment is one that many of the children have never pr

In [49]:
# Extract the raw text of every example, then run tokenizer.tokenize on each text.
texts = list(map(lambda example: example['text'], batch))
tokenized_texts = list(map(tokenizer.tokenize, texts))

In [58]:
# Encode the texts once and count how many id sequences come back.
encoded = tokenizer(texts)
len(encoded['input_ids'])

2

In [60]:
# List the fields the tokenizer returns when called on a list of texts.
tokenizer(texts).keys()

dict_keys(['input_ids', 'attention_mask'])

In [61]:
# Display the attention mask produced for the texts (no padding is requested in this call).
# NOTE(review): this echoes the complete nested list, which makes the cell output very long;
# consider showing only a slice or the per-text lengths.
tokenizer(texts)['attention_mask']

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [69]:
# Encode the texts as PyTorch tensors with padding enabled, then keep handy
# references to the two tensors used by the following cells.
inputs = tokenizer(texts, return_tensors='pt', padding=True)
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

In [65]:
# Report the tensor shape stored under every key of the encoded batch.
for key in inputs.keys():
    value = inputs[key]
    print(key, value.shape)

input_ids torch.Size([2, 930])
attention_mask torch.Size([2, 930])


In [75]:
# Size of dimension 1 of the padded id tensor (its shape is printed as [2, 930] above).
input_ids.shape[1]

930

In [76]:
# Call BLOOM's build_alibi_tensor helper with the batch attention mask, the model's
# head count and torch.float16 as the dtype argument.
# NOTE(review): `alibi` is not read by any other cell visible in this notebook.
alibi = build_alibi_tensor(attention_mask, model.num_heads, torch.float16)

In [79]:
# Show the configuration object attached to the loaded model.
model.config

BloomConfig {
  "_name_or_path": "bigscience/bloom-560m",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomModel"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "transformers_version": "4.23.1",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

In [82]:
# Run the encoded batch through the model and list the fields of its output.
# The cell previously called `ppmodel`, a name that is never defined in this notebook
# (NameError on a fresh kernel); the recorded keys below are those produced by `model`.
model(**inputs).keys()

odict_keys(['last_hidden_state', 'past_key_values'])

In [2]:
%%time
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m")

CPU times: user 4.79 s, sys: 1.26 s, total: 6.05 s
Wall time: 6.63 s


In [14]:
def collate_fn(data):
    """Tokenize a list of examples into one padded batch with language-modeling labels.

    Args:
        data: Sequence of examples; each one is a mapping with a 'text' entry.

    Returns:
        The tokenizer's batch encoding as PyTorch tensors ('input_ids',
        'attention_mask'), extended with 'labels': a copy of 'input_ids' in
        which every padding id is replaced by -100.
    """
    texts = [x['text'] for x in data]
    inputs = tokenizer(texts, padding=True, return_tensors='pt')
    input_ids = inputs['input_ids']
    # Take the pad id from the tokenizer rather than hard-coding 3, so the function
    # keeps working if the tokenizer/checkpoint changes. masked_fill returns a new
    # tensor, so 'input_ids' itself is left untouched.
    inputs['labels'] = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100)
    return inputs


# Stream the Russian deduplicated OSCAR train split and shuffle it with a small buffer.
dataset = load_dataset('oscar', "unshuffled_deduplicated_ru", split='train', streaming=True)
dataset = dataset.shuffle(seed=42, buffer_size=100)
# A later cell calls next(iter(dataloader)), so the loader must actually be defined here
# (it used to exist only as a commented-out line, without the collate function).
# The torch-formatted view is passed directly so that `dataset` itself is not rebound.
dataloader = DataLoader(dataset.with_format("torch"), batch_size=2, collate_fn=collate_fn)

In [15]:
# Print only the first example of the stream together with its index.
for i, data in zip(range(1), dataset):
    print(i, data)


KeyboardInterrupt



In [11]:
# Fetch the first batch from the loader and display it.
loader_iter = iter(dataloader)
batch = next(loader_iter)
batch


KeyboardInterrupt



In [None]:
# Forward pass: the entries of `batch` are unpacked as keyword arguments to the model.
# The next cell reads `.loss` and `.logits` from the result.
outputs = model(**batch)

In [None]:
# Unpack the model outputs; perplexity is the exponential of the loss.
loss, logits = outputs.loss, outputs.logits
perplexity = loss.exp()

In [31]:
# Report the scalar loss and perplexity.
# NOTE(review): the recorded output is a NameError — `loss` did not exist when this cell
# was last run, so the cells that compute it must be executed first.
print(f'Loss = {loss.item()}')
print(f'Perplexity = {perplexity.item()}')

NameError: name 'loss' is not defined

In [14]:
# Show the shape of the logits tensor (recorded below as [2, 930, 250880]).
print(logits.shape)

torch.Size([2, 930, 250880])


In [28]:
from torch.utils.data import DataLoader

In [29]:
# Display the padded id tensor; in the recorded output the second row starts with id 3.
inputs['input_ids']

tensor([[130375,    563,    708,  ...,  33944,  74238,     17],
        [     3,      3,      3,  ...,     82,   3356,     17]])

In [32]:
# Decode just the first id of the second sequence to see which token it is.
for idx in input_ids[1].tolist()[:1]:
    print(idx, tokenizer.decode(idx))

3 <pad>


In [None]:
# NOTE(review): the tokenizer is called here without any text and the cell was never
# executed — this looks like an unfinished stub; confirm intent or remove.
tokenizer()

In [26]:
# Shuffle settings for the streamed dataset.
seed = 42
buffer_size = 10_000

# Stream the English deduplicated OSCAR train split, shuffle it, and request torch formatting.
dataset = (
    load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True)
    .shuffle(seed, buffer_size=buffer_size)
    .with_format('torch')
)

# Placeholder epoch loop: no per-epoch work is performed yet.
num_epochs = 3
for epoch in range(num_epochs):
    pass

In [27]:
# Peek at the first example yielded by the shuffled stream.
next(iter(dataset))

{'id': 892,
 'text': 'In this role, she oversees the day-to-day operations of the agency’s motoring services divisions (Vehicle Titles & Registration, Motor Vehicles, Motor Carrier, Enforcement, Consumer Relations and the Automobile Burglary & Theft Prevention Authority) to ensure they are constantly improving and identifying opportunities to become more efficient and effective in service delivery.\nMellott came to the TxDMV from Alaska’s Division of Motor Vehicles where she most recently served as deputy executive director and acting executive director where she led a major initiative to modernize and improve the customer service experience. Previous positions at the Alaska DMV include oversight of all large field offices and leading the driver licensing program.\nMellott serves on the American Association of Motor Vehicle Administrators Unconventional Vehicle Working Group and has worked collaboratively with representatives from across the country to develop best practices for states

In [23]:
# Display the module tree of the loaded model.
model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0): BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (1): BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementw