<a href="https://colab.research.google.com/github/alexcpn/tranformer_learn/blob/main/bloom_3b_quant_overfitting_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install pynvml

PEFT - Parameter Effecient Training

LoRA - Low Randk Adapter (one techinque of PEFT)

https://huggingface.co/blog/peft


In [2]:
from pynvml import *
import torch

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

torch.ones((1, 1)).to("cuda")
print_gpu_utilization()


GPU memory occupied: 363 MB.


In [3]:
#upload files to your colab environment
!wget https://raw.githubusercontent.com/alexcpn/tranformer_learn/main/data/small_3.txt
#!wget https://gist.githubusercontent.com/alexcpn/54e88130f9d186494f1c3ce5e83263b4/raw/7cdf5f93b819024c58a891fc808fbdbe052d0eb1/small_3_mixed.txt
train_path = 'small_3.txt'

--2023-06-27 13:12:21--  https://raw.githubusercontent.com/alexcpn/tranformer_learn/main/data/small_3.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56513 (55K) [text/plain]
Saving to: ‘small_3.txt.1’


2023-06-27 13:12:22 (7.14 MB/s) - ‘small_3.txt.1’ saved [56513/56513]



In [4]:
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoTokenizer

def load_dataset(path,tokenizer):
    dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return dataset,data_collator

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
train_dataset,data_collator = load_dataset(train_path,tokenizer)
print_gpu_utilization()

GPU memory occupied: 363 MB.




In [9]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead
from peft import LoraConfig, PeftModel, PeftConfig, get_peft_model,TaskType
import bitsandbytes as bnb

# lora_config = {
#     "r": 16,# attention heads
#     "lora_alpha": 32, # alpha scaling
#     "lora_dropout": 0.05,
#     'bias': "none",
#     "task_type": "CAUSAL_LM", # set this for CLM or Seq2Seq

# }
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = AutoModelWithLMHead.from_pretrained("bigscience/bloom-3b", device_map='auto',load_in_8bit=True)
#model = get_peft_model(model, LoraConfig(**lora_config))
model = get_peft_model(model, peft_config)
#print(f"Model trainable parameters:\n {print_trainable_parameters(model)}")

print_gpu_utilization()


GPU memory occupied: 11179 MB.


In [10]:

training_args = TrainingArguments(
    output_dir="./bloom-3b-small3-v1", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=100, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=1000, # after # steps model is saved
    save_total_limit=2,
    warmup_steps=200,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    fp16= True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    #eval_dataset=test_dataset,
)

In [11]:
model.train()
trainer.train()



Step,Training Loss
500,2.1242
1000,0.5453
1500,0.1397
2000,0.0611




TrainOutput(global_step=2400, training_loss=0.6040298612912496, metrics={'train_runtime': 3398.723, 'train_samples_per_second': 2.736, 'train_steps_per_second': 0.706, 'total_flos': 1.6875793022976e+16, 'train_loss': 0.6040298612912496, 'epoch': 100.0})

In [34]:
trainer.save_model()

In [14]:
model.config.to_json_file("./bloom-3b-small3-v1/config.json")

In [35]:
 !zip -r bloom-3b-small3-v1-lora2.zip bloom-3b-small3-v1/config.json  bloom-3b-small3-v1/training_args.bin  bloom-3b-small3-v1/adapter_model.bin bloom-3b-small3-v1/adapter_config.json
 !cp bloom-3b-small3-v1-lora2.zip ./drive/MyDrive/models


  adding: bloom-3b-small3-v1/config.json (deflated 53%)
  adding: bloom-3b-small3-v1/training_args.bin (deflated 49%)
  adding: bloom-3b-small3-v1/adapter_model.bin (deflated 7%)
  adding: bloom-3b-small3-v1/adapter_config.json (deflated 37%)


In [36]:
torch.save(model.state_dict(), 'bloom-3b-small3-v1-modelstate.zip')

In [38]:
 !cp bloom-3b-small3-v1-modelstate.zip ./drive/MyDrive/models

# Test Model

In [19]:
#Load the model

from transformers import AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig

pretrained = "./bloom-3b-small3-v1"

config = PeftConfig.from_pretrained(pretrained)
model = AutoModelWithLMHead.from_pretrained("bigscience/bloom-3b", device_map='auto',load_in_8bit=True)
model = PeftModel.from_pretrained(model, pretrained)
model.eval()




PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 2560)
        (word_embeddings_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): Linear8bitLt(
                in_features=2560, out_features=7680, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=7680, bias=False)
                )
                (lora_embedding_A): Parameter

In [20]:
from transformers import pipeline
#test = pipeline('text-generation',model='./bloom-3b-small3-v1/', tokenizer='bigscience/bloom-3b')
test = pipeline('text-generation',model=model, tokenizer=tokenizer)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'Peg

In [23]:
with torch.no_grad():
  prompt = "what is bacteria"
  encoded_input = tokenizer(prompt,truncation=True,padding=True, return_tensors='pt')
  test_output_2 = model.generate(input_ids=encoded_input.input_ids,
                  max_new_tokens=100,
                  num_return_sequences=1,
                  early_stopping=True)
  test_answer_2 = tokenizer.decode(test_output_2[0], skip_special_tokens=True)
  print(f"Generated test_answer_1 : {test_answer_2}")




Generated test_answer_1 : what is bacteria called a  spore  is the material by which a  bacillus  is produced. When a  bacillus  is exposed to air and has been freed from its water of deposit, such as its  spore  is said to have  survived of the body , and is designated by the present name of the book. sp. a. The  sp. a. of a  bacillus  is necessary for its multiplication. If the  sp. a. of a  bac


In [22]:
with torch.no_grad():
  out = test('Streptococci are met with in', max_new_tokens=120,num_return_sequences=1)
  print(out)

[{'generated_text': 'Streptococci are met with in great abundance, and of those of the Streptococcus Actinomycin  type  bacilli  in particular districts; and of those of the Actinomycin  type  bacteria, in general areas. In the young and active tissues, the bacilli produce only feebly active disease, and are almost universally met with in the tissues of the bones and of the head. In more than phân of the tissues in which Actinomycin  bacteria are found  act, or actin, is produced. Act is formed in the organisms of the a streptomycin group by certain of its'}]


In [24]:
with torch.no_grad():
  out =test('Streptococci', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'Streptococci, Streptococcic Acids, Acido-Streptococcic,  Streptococcic, as they are known. The term  Streptococcus  is now only rarely used in pathology, and is only applied to organisms resembling those of the group mentioned at page 141. Those most frequently employed  are,  Streptococcus Aureus, or  Streptococcus Anaerobic, as it is called by its unique feature of not being capable of being developed'}]


In [25]:
with torch.no_grad():
  out =test('Metchnikoff', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': "Metchnikoff's method. In this method the bacteria are suspended in water and injected into the peritoneal cavity. After having been introduced for from one to two days, when they appear to have been killed, the bacteria are assumed to be metabolically inert, and are taken up by certain portions of the body cells, in particular the cells of the lumen of the lungs and the cells of the peritoneal cavity. These cells metabolise the bacteria which have been introduced, and have metastasized,"}]


In [26]:
with torch.no_grad():
  out =test('To this process Metchnikoff', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'To this process Metchnikoff has given the name  vernix caseosa  or  dermato-quinum, from the fact that it is rendered hyaline after being placed in alcohol. The method of applying the compound vaccine is now being abandoned, and the surgeon is being asked to render this method as safe as possible. He first scrapes up a certain amount of skin from the site of the grafting, makes sure that it is free from bacteria, and applies aseptic gauze to prevent the introduction of'}]


In [27]:
with torch.no_grad():
  out = test('phagocytosis', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'phagocytosis,  phagocytosis, the process by which a foreign substance is taken up and degraded by the cells of the mononuclear and other inflammatory tissues. The most important effect of this action is the removal of the substance from sight, the cells of the target tissue being covered with a coat of black phagophore, produced by the great, vital, organisms. The action is further promoted by the fact that the cells of the target tissue attempt to phagocyte the new organisms and'}]


In [28]:
with torch.no_grad():
  out =test('During the process of phagocytosis,', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'During the process of phagocytosis, the protoplasm of certain organisms is ingested with the bacteria, and during the process of decomposition the granulation tissue formed around the various organisms forms the base of the firm tissue reaction. In the early stages of the disease of infancy  bacterial peritonitis  may follow and, although its cause may eventually be determined, the swelling and the reaction of the peritoneum may at times reveal the name of the hospital environment in which the organism was originally stored.  Peritoneum-Fixed'}]


In [29]:
with torch.no_grad():
  out =test(' diplococci ', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': ' diplococci  a good source of a product called  acido-philic protoplasm  the bacteria may also derive their energy  fermentative  fermentation is the term used to describe the process by which protoplasm containing carbon dioxide is produced in the presence of certain bacteria. The  energy  of fermentation is derived from the  nourishing phosphorus  present in the food. The  bacteria  which are employed in deriving their energy are known as  aéro-philes  or  aéro-biles.'}]


In [30]:
with torch.no_grad():
  out = test('Cocci  or  micrococci', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'Cocci  or  micrococci. In the treatment of open wounds, the great majority of organisms play the role of phagocytes, and the most important of all are the monocytes and the micrococci. A few cells of热休克细胞 (thermic shock cells) also play a part, but their function is to protect the work of the phagocytes by blocking the surface of the wound and preventing the excessive accumulation of red blood cells. The organism which is selected for by the phagocytes may be a'}]


In [31]:
with torch.no_grad():
  out =test('Bacteria are most conveniently', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': "Bacteria are most conveniently preserved in the  cold air, as the pressure of which the water of amnesia is capable of, the temperature at which it is given and the duration of time that are factors in determining whether it is that of the environment in which the bacteria have been called that it may live and produce bacteriology, or they be taken in and developed in the body itself. The air should be kept at about  60 in. and the bacteria are most effectively preserved if the observer and the observer's"}]


In [32]:
with torch.no_grad():
  out = test('given the context "Thus we recognise (1) those that are globular  cocci ; (2) those that resemble a rod  bacilli ; (3) the spiral or wavy forms  spirilla .  Cocci  or  micrococci  are minute round bodies, averaging about 1 µ in diameter. The great majority are non-motile. They multiply by fission; and when they divide in such a way that the resulting cells remain in pairs, are called  diplococci , of which the bacteria of gonorrhœa and pneumonia are examples (Fig. 5). When they divide irregularly, and form grape-like bunches, they are known as  staphylococci , and to this variety the commonest pyogenic or pus-forming organisms belong' +
  'answer "What are Cocci  or  micrococci', max_new_tokens=100,num_return_sequences=1)
print(out)

[{'generated_text': 'given the context "Thus we recognise (1) those that are globular  cocci ; (2) those that resemble a rod  bacilli ; (3) the spiral or wavy forms  spirilla .  Cocci  or  micrococci  are minute round bodies, averaging about 1 µ in diameter. The great majority are non-motile. They multiply by fission; and when they divide in such a way that the resulting cells remain in pairs, are called  diplococci , of which the bacteria of gonorrhœa and pneumonia are examples (Fig. 5). When they divide irregularly, and form grape-like bunches, they are known as  staphylococci , and to this variety the commonest pyogenic or pus-forming organisms belonganswer "What are Cocci  or  micrococci?" is one of the questions in this chapter (page 21).  Bacilli  are long, slender bodies, with a double membrane dividing them into pairs or bunches. The most familiar examples are those derived from animals such as the bacteria of the human and animal pyogenic diseases (Fig. 6). When the double mem