In [1]:
import torch
import os
import argparse
import transformers
from transformers import Trainer, TrainingArguments, HfArgumentParser, set_seed
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/anaconda3/envs/baichuan7B/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /root/anaconda3/envs/baichuan7B/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


In [2]:
model_path="../../ptm/baichuan"
#model_path="/data/Baichuan-13B/"

In [3]:
tokenizer=AutoTokenizer.from_pretrained(model_path,trust_remote_code=True)

In [4]:
model=AutoModelForCausalLM.from_pretrained(model_path,trust_remote_code=True)

In [7]:
model

BaiChuanForCausalLM(
  (model): Model(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x DecoderLayer(
        (self_attn): Attention(
          (W_pack): Linear(in_features=4096, out_features=12288, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=64000, bias=False)
)

In [12]:
for names,parameter in model.named_modules():
    print(names.split("."))
    print(type(parameter))

['']
<class 'transformers_modules.modeling_baichuan.BaiChuanForCausalLM'>
['model']
<class 'transformers_modules.modeling_baichuan.Model'>
['model', 'embed_tokens']
<class 'torch.nn.modules.sparse.Embedding'>
['model', 'layers']
<class 'torch.nn.modules.container.ModuleList'>
['model', 'layers', '0']
<class 'transformers_modules.modeling_baichuan.DecoderLayer'>
['model', 'layers', '0', 'self_attn']
<class 'transformers_modules.modeling_baichuan.Attention'>
['model', 'layers', '0', 'self_attn', 'W_pack']
<class 'torch.nn.modules.linear.Linear'>
['model', 'layers', '0', 'self_attn', 'o_proj']
<class 'torch.nn.modules.linear.Linear'>
['model', 'layers', '0', 'self_attn', 'rotary_emb']
<class 'transformers_modules.modeling_baichuan.RotaryEmbedding'>
['model', 'layers', '0', 'mlp']
<class 'transformers_modules.modeling_baichuan.MLP'>
['model', 'layers', '0', 'mlp', 'gate_proj']
<class 'torch.nn.modules.linear.Linear'>
['model', 'layers', '0', 'mlp', 'down_proj']
<class 'torch.nn.modules.lin

In [4]:
model=AutoModelForCausalLM.from_pretrained(model_path,trust_remote_code=True,quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            ))

In [5]:
model


BaiChuanForCausalLM(
  (model): Model(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x DecoderLayer(
        (self_attn): Attention(
          (W_pack): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=64000, bias=False)
)

In [6]:
def find_all_linear_modules(model):
    cls=bnb.nn.Linear4bit
    lora_target_modules=set()
    for names,module in model.named_modules():
        if isinstance(module,cls):
            names=names.split(".")
            lora_target_modules.add(names[0] if len(names)==1 else names[-1])
            
    if 'lm_head' in lora_target_modules: # needed for 16-bit
        lora_target_modules.remove('lm_head')
    return list(lora_target_modules)

In [7]:
l=find_all_linear_modules(model)

In [8]:
config=LoraConfig(
        task_type="CAUSAL_LM",
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=l
    )
model=get_peft_model(model,config)

In [9]:
model.print_trainable_parameters()

trainable params: 27,893,760 || all params: 6,988,231,680 || trainable%: 0.3991533377439484


In [10]:
i=0
j=0
for names,parameter in model.named_modules():
    if parameter.requires_grad_==True:
        print("Yes")
        

In [12]:
a=model.print_trainable_parameters()
print(a)

trainable params: 27,893,760 || all params: 6,988,231,680 || trainable%: 0.3991533377439484
None


In [13]:
model_path="/data/sftmoss16B"

In [7]:
os.environ["CUDA_VISIBLE_DEVICES"]="1"
model=AutoModelForCausalLM.from_pretrained(model_path,trust_remote_code=True,device_map={"":1}, quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            ),)

In [8]:
model

BaiChuanForCausalLM(
  (model): Model(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x DecoderLayer(
        (self_attn): Attention(
          (W_pack): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=64000, bias=False)
)

In [24]:
i=0
j=0
for names,parameter in model.named_modules():
    print(names.split("."))
    i+=1
    if isinstance(parameter,bnb.nn.Linear4bit):
        print("Yes")
        j+=1
        

['']
['transformer']
['transformer', 'wte']
['transformer', 'drop']
['transformer', 'h']
['transformer', 'h', '0']
['transformer', 'h', '0', 'ln_1']
['transformer', 'h', '0', 'attn']
['transformer', 'h', '0', 'attn', 'attn_dropout']
['transformer', 'h', '0', 'attn', 'resid_dropout']
['transformer', 'h', '0', 'attn', 'qkv_proj']
Yes
['transformer', 'h', '0', 'attn', 'out_proj']
Yes
['transformer', 'h', '0', 'mlp']
['transformer', 'h', '0', 'mlp', 'fc_in']
Yes
['transformer', 'h', '0', 'mlp', 'fc_out']
Yes
['transformer', 'h', '0', 'mlp', 'act']
['transformer', 'h', '0', 'mlp', 'dropout']
['transformer', 'h', '1']
['transformer', 'h', '1', 'ln_1']
['transformer', 'h', '1', 'attn']
['transformer', 'h', '1', 'attn', 'attn_dropout']
['transformer', 'h', '1', 'attn', 'resid_dropout']
['transformer', 'h', '1', 'attn', 'qkv_proj']
Yes
['transformer', 'h', '1', 'attn', 'out_proj']
Yes
['transformer', 'h', '1', 'mlp']
['transformer', 'h', '1', 'mlp', 'fc_in']
Yes
['transformer', 'h', '1', 'mlp',

In [5]:
model_path="../../ptm/chatglm2/"

In [6]:
tokenizer=AutoTokenizer.from_pretrained(model_path,trust_remote_code=True)

In [7]:
model=AutoModel.from_pretrained(model_path,trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00,  1.14it/s]


In [8]:
for names,parameter in model.named_modules():
    print(names.split("."))
    print(type(parameter))

['']
<class 'transformers_modules.modeling_chatglm.ChatGLMForConditionalGeneration'>
['transformer']
<class 'transformers_modules.modeling_chatglm.ChatGLMModel'>
['transformer', 'embedding']
<class 'transformers_modules.modeling_chatglm.Embedding'>
['transformer', 'embedding', 'word_embeddings']
<class 'torch.nn.modules.sparse.Embedding'>
['transformer', 'rotary_pos_emb']
<class 'transformers_modules.modeling_chatglm.RotaryEmbedding'>
['transformer', 'encoder']
<class 'transformers_modules.modeling_chatglm.GLMTransformer'>
['transformer', 'encoder', 'layers']
<class 'torch.nn.modules.container.ModuleList'>
['transformer', 'encoder', 'layers', '0']
<class 'transformers_modules.modeling_chatglm.GLMBlock'>
['transformer', 'encoder', 'layers', '0', 'input_layernorm']
<class 'transformers_modules.modeling_chatglm.RMSNorm'>
['transformer', 'encoder', 'layers', '0', 'self_attention']
<class 'transformers_modules.modeling_chatglm.SelfAttention'>
['transformer', 'encoder', 'layers', '0', 'self_

In [25]:
from datasets import load_dataset

In [26]:
data_path="../data/Safetyprompts/typical_safety_scenarios.json"

In [34]:
data=load_dataset("json",data_files=data_path,field='Crimes_And_Illegal_Activities',split="train")

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-510eeaa93a98ff80/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


In [35]:
type(data)

datasets.arrow_dataset.Dataset

In [37]:
data

Dataset({
    features: ['type', 'response', 'prompt'],
    num_rows: 10000
})

In [38]:
data1=load_dataset("json",data_files=data_path,field='Mental_Health',split="train")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-f4835834a62e263f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 5833.52it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1111.37it/s]
                                                                   

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-f4835834a62e263f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.




In [39]:
data1

Dataset({
    features: ['type', 'response', 'prompt'],
    num_rows: 10000
})

In [None]:
dataall=data.add_item()

In [9]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [41]:
def gen_data():
    for obj in [data,data1]:
        for j in obj:
            yield j
        

In [42]:
ds=Dataset.from_generator(gen_data)

Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-3e1c76b2ef6964fd/0.0.0...


                                                                   

Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-3e1c76b2ef6964fd/0.0.0. Subsequent calls will reuse this data.




In [43]:
ds.train_test_split

Dataset({
    features: ['type', 'response', 'prompt'],
    num_rows: 20000
})

In [1]:
import jsonlines

In [20]:
#data_path="../data/Moss-sft/moss-003-sft-no-tools.jsonl"
data_path="../data/Firefly/firefly-train-1.1M.jsonl"

In [19]:
a=set()
with jsonlines.open(data_path,"r") as f:
    for line in f:
        print(line.keys())
        break
    

dict_keys(['conversation_id', 'meta_instruction', 'num_turns', 'chat', 'category'])


In [18]:
a

{dict}

In [32]:
def gen(dp):
    with jsonlines.open(dp,"r") as f:
        for line in f:
            yield line
    

In [33]:
ds=Dataset.from_generator(gen,gen_kwargs={"dp":data_path})

Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-94aae615ee10a7e6/0.0.0...


                                                                     

Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-94aae615ee10a7e6/0.0.0. Subsequent calls will reuse this data.




In [23]:
len(ds)

1649399

In [39]:
ds.column_names

['kind', 'input', 'target']