<a href="https://colab.research.google.com/github/alexcpn/tranformer_learn/blob/main/bloom_3b_overfitting_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install transformers
!pip install accelerate
!pip install deepspeed

In [2]:
import os

# Environment variables describing a one-process "distributed" run:
# a single worker (rank 0 of world size 1) reachable on localhost.
os.environ.update(
    {
        "MASTER_ADDR": "localhost",
        # modify if RuntimeError: Address already in use
        "MASTER_PORT": "9994",
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
    }
)

In [3]:
# Fetch the training text into the current working directory of the runtime.
!wget https://raw.githubusercontent.com/alexcpn/tranformer_learn/main/data/small_3.txt
# Alternative corpus, currently disabled:
#!wget https://gist.githubusercontent.com/alexcpn/54e88130f9d186494f1c3ce5e83263b4/raw/7cdf5f93b819024c58a891fc808fbdbe052d0eb1/small_3_mixed.txt
# Name of the downloaded file; it is passed to load_dataset() further down.
train_path = 'small_3.txt'

--2023-06-27 10:32:07--  https://raw.githubusercontent.com/alexcpn/tranformer_learn/main/data/small_3.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56513 (55K) [text/plain]
Saving to: ‘small_3.txt’


2023-06-27 10:32:08 (46.8 MB/s) - ‘small_3.txt’ saved [56513/56513]



In [4]:
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoTokenizer

def load_dataset(path, tokenizer, block_size=128):
    """Build the training dataset and the batch collator for causal-LM training.

    Args:
        path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to both the dataset and the collator.
        block_size: Value forwarded to ``TextDataset`` as ``block_size``.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        A ``(dataset, data_collator)`` tuple.
    """
    # NOTE(review): recent transformers releases are believed to mark
    # TextDataset as deprecated -- confirm against the installed version.
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=path,
        block_size=block_size,
    )

    # mlm=False: the collator is configured without masked-language-modelling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return dataset, data_collator

In [5]:
from transformers import Trainer, TrainingArguments,AutoModelForCausalLM,AutoModelWithLMHead,AutoConfig
from accelerate import infer_auto_device_map
import torch
from transformers import pipeline

# Loading options forwarded to AutoModelForCausalLM.from_pretrained below.
# NOTE(review): the model is loaded with device_map="auto" and later trained
# through a DeepSpeed ZeRO-3 config; confirm that the two placement mechanisms
# are meant to be combined.
kwargs = dict(
        device_map="auto",
        #max_memory={0:"30GiB","cpu": "60GiB"},
        max_memory={0:"15GiB","cpu": "20GiB"},
        offload_folder=".",
        #torch_dtype=torch.float16, #NotImplementedError: Cannot copy out of meta tensor; no data! - means 32 bit does not work
)
#kwargs["load_in_8bit"] = True

model_name = 'bigscience/bloom-3b'
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
# `use_fast` is the documented switch for the fast tokenizer implementation;
# the previous `fast=True` keyword is not a from_pretrained option.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Tokenize the downloaded text file and build the matching batch collator.
train_dataset,data_collator = load_dataset(train_path,tokenizer)

[2023-06-27 10:32:13,590] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



In [6]:
%%bash
# Write the DeepSpeed ZeRO stage-3 configuration consumed by TrainingArguments.
# Values set to "auto" are left for the Hugging Face Trainer integration to fill in.
# The 16-bit weight-gathering option uses its current name,
# stage3_gather_16bit_weights_on_model_save (formerly ..._fp16_weights_...).
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [7]:
# Partial freeze: every transformer block except the last two is made
# non-trainable. Parameters that live outside model.transformer.h are not
# touched by this cell.
num_transformer_layers = len(model.transformer.h)

for layer_idx in range(num_transformer_layers - 2):
    frozen_block = model.transformer.h[layer_idx]
    for weight in frozen_block.parameters():
        weight.requires_grad = False

In [8]:
# Put the model into training mode before building the Trainer.
model.train()

# Hyper-parameters and runtime options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./bloom-3b-small3-v3",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    save_total_limit=2,
    num_train_epochs=5,  # number of training epochs
    per_device_train_batch_size=4,  # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    eval_steps=400,  # number of update steps between two evaluations (unused while evaluation is off)
    #save_steps=1000, # after # steps model is saved
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    # No eval_dataset is handed to the Trainer in this notebook, so periodic
    # evaluation is switched off; set this back to "steps" together with an
    # eval_dataset to re-enable it.
    evaluation_strategy="no",
    #fp16= False, #to do ValueError: Attempting to unscale FP16 gradients.
    learning_rate=1e-5,
    weight_decay=0.001,
    fp16=True,
    # The trailing remark used to sit on this line as bare text (a SyntaxError).
    deepspeed="ds_config_zero3.json",  # Out of memory even with 83 GB RAM
)


[2023-06-27 10:32:58,035] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-06-27 10:32:58,037] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [9]:

# Assemble the Trainer from the model, its arguments, the tokenized training
# set and the collator. No evaluation set is supplied.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
    #eval_dataset=test_dataset,
)


In [10]:
 # Run the fine-tuning loop inside a CUDA autocast context.
 # NOTE(review): TrainingArguments already sets fp16=True, so this outer
 # autocast context may be redundant -- confirm before removing it.
 with torch.cuda.amp.autocast():
  trainer.train()



Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu118/fused_adam...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module fused_adam...


Time to load fused_adam op: 34.666828870773315 seconds
Parameter Offload: Total persistent parameters: 1008640 in 244 params


In [None]:
# Persist the fine-tuned model; called without a path, so presumably it is
# written to the output_dir configured in TrainingArguments -- TODO confirm.
trainer.save_model()

In [None]:
#!zip -r bloom-3b-small3-v3.zip bloom-3b-small3-v3/config.json  bloom-3b-small3-v3/training_args.bin  bloom-3b-small3-v3/pytorch_model.bin bloom-3b-small3-v3/generation_config.json


In [None]:
#!cp bloom-3b-small3-v3.zip ./drive/MyDrive/models

# Test Model

In [None]:
#!cp ./drive/MyDrive/models/bloom-560-small3-v1.zip . #if you are taking the fine tuned model from drive

In [None]:
#!unzip bloom-560-small3-v1.zip

In [None]:
from transformers import pipeline

# Switch the model to evaluation mode before generating text.
model.eval()

# Text-generation pipeline over the in-memory model and tokenizer.
# To load the saved checkpoint from disk instead, use:
#test = pipeline('text-generation',model='./bloom-3b-small3-v3/', tokenizer='bigscience/bloom-3b')
test = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Sentence-opening prompt; allow a long continuation of up to 512 new tokens.
test(
    'An alkaline medium favours',
    max_new_tokens=512,
)

In [None]:
# Sentence-opening prompt; one sequence of at most 120 new tokens.
test(
    'Streptococci are met with in',
    max_new_tokens=120,
    num_return_sequences=1,
)

In [None]:
# Single-term prompt; one sequence of at most 100 new tokens.
test(
    'Streptococci',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
# Single-name prompt; one sequence of at most 100 new tokens.
test(
    'Metchnikoff',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
# Mid-sentence prompt; one sequence of at most 100 new tokens.
test(
    'To this process Metchnikoff',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
# Single-term prompt; one sequence of at most 100 new tokens.
test(
    'Phagocytosis',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
# Clause-opening prompt; one sequence of at most 100 new tokens.
test(
    'During the process of phagocytosis,',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
# Single-term prompt (the surrounding spaces are part of the prompt);
# one sequence of at most 100 new tokens.
test(
    ' diplococci ',
    max_new_tokens=100,
    num_return_sequences=1,
)

In [None]:
 # Two-term prompt (double spaces are part of the prompt);
 # one sequence of at most 100 new tokens.
 test(
     'Cocci  or  micrococci',
     max_new_tokens=100,
     num_return_sequences=1,
 )

In [None]:
 # Sentence-opening prompt; one sequence of at most 100 new tokens.
 test(
     'Bacteria are most conveniently',
     max_new_tokens=100,
     num_return_sequences=1,
 )

In [None]:
 # Context-plus-question prompt. The two literals are joined into one string,
 # so the first now ends by closing the quoted context and adding a space;
 # previously the pieces fused into "...organisms belonganswer ...".
 test(
     'given the context "Thus we recognise (1) those that are globular  cocci ; (2) those that resemble a rod  bacilli ; (3) the spiral or wavy forms  spirilla .  Cocci  or  micrococci  are minute round bodies, averaging about 1 µ in diameter. The great majority are non-motile. They multiply by fission; and when they divide in such a way that the resulting cells remain in pairs, are called  diplococci , of which the bacteria of gonorrhœa and pneumonia are examples (Fig. 5). When they divide irregularly, and form grape-like bunches, they are known as  staphylococci , and to this variety the commonest pyogenic or pus-forming organisms belong" '
     'answer "What are Cocci  or  micrococci',
     max_new_tokens=100,
     num_return_sequences=1,
 )