In [1]:
# Source https://www.kaggle.com/code/julianschelb/finetune-bloom-token-classification
# https://github.com/dptrsa-300/start_with_bloom/blob/main/bloomex_nb.ipynb
# https://github.com/dredwardhyde/gpt-neo-fine-tuning-example

from transformers import (BloomTokenizerFast,
                          BloomForCausalLM,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset
import torch
import os



In [2]:
model_name = "bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(f"bigscience/{model_name}", add_prefix_space=True)
model = BloomForCausalLM.from_pretrained(f"bigscience/{model_name}")


In [3]:
from torch.utils.data import Dataset, random_split
import pandas as pd
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy

descriptions = pd.read_json('oci-dataset-train.json')
descriptions = descriptions[descriptions['text'].str.len() < 1000]['text']
max_length = max([len(tokenizer.encode(description)) for description in descriptions])


class OCIDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer(txt, truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


dataset = OCIDataset(descriptions, tokenizer, max_length=max_length)


In [4]:
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100,
                                  save_strategy=IntervalStrategy.NO,
                                  per_device_train_batch_size=2, per_device_eval_batch_size=2,
                                  warmup_steps=100, weight_decay=0.01, logging_dir='./logs',
                                 save_total_limit=1, load_best_model_at_end=True)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})

trainer.train()

***** Running training *****
  Num examples = 1533
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 767
  Number of trainable parameters = 559214592


Step,Training Loss
100,4.428
200,0.934
300,0.8316
400,0.6269
500,0.6124
600,0.5407
700,0.4966




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=767, training_loss=1.1475751745032519, metrics={'train_runtime': 329.546, 'train_samples_per_second': 4.652, 'train_steps_per_second': 2.327, 'total_flos': 734099339280384.0, 'train_loss': 1.1475751745032519, 'epoch': 1.0})

In [7]:
generated = tokenizer("oci vision service", return_tensors="pt").input_ids.cuda()

sample_outputs = model.generate(generated, do_sample=True, top_k=50, max_length=100, top_p=0.95, temperature=1.9, num_return_sequences=1)


for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



0: oci vision service, 
 Container Engine Metrics, 
 
Stream announcements contain information about your customer's active storage and volume. 
Use metric statements to provide quick insight and timely resolution options when moving large files or messages
            stored at scale.

With no limits on the metrics, you create applications or service resources
                    for the customer under monitoring. 

Creating and consuming storage will not consume any resource
 Note the load balancer capacity is the container to read messages.

Data Catalog using metrics and notifications also provides notifications at the


In [8]:
trainer.save_model('oci-test-model')

Saving model checkpoint to oci-test-model
Configuration saved in oci-test-model/config.json
Configuration saved in oci-test-model/generation_config.json
Model weights saved in oci-test-model/pytorch_model.bin


loading configuration file ./oci-test-model/config.json
Model config BloomConfig {
  "_name_or_path": "bigscience/bloom-560m",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

loading weights file ./oci-test-model/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
 

In [16]:
model_trained = BloomForCausalLM.from_pretrained("./oci-test-model").to('cuda')
generated = tokenizer("oci vision service", return_tensors="pt").input_ids.cuda()

sample_outputs = model_trained.generate(generated, do_sample=True, top_k=50, max_length=100, top_p=0.95, temperature=1.9, num_return_sequences=1)


for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))



Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



0: oci vision service, 
 Noticing Audit Failures, 
Audit errors enable Log Guard messages when alerting log entities of unary and or trinomial, including unary. In a no-deployment environment (NVMÂ ), Log Event Message occurs when a Logsector connector was close in a no-deployment.
 When all resources were open before they switched to:

Log Analytics & Monitoring, Logs, Log Management.
Logging.
Logging. 
Data Source Policies.

The metric is no relevant (less
