In [1]:
from transformers import AutoTokenizer, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
from tqdm import tqdm
import os
import torch.nn as nn
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
)
from peft import PeftModel
from datasets import load_from_disk
batch_size = 24
model_name_or_path = "/home/models/opt-6.7b/"
task = "/home/xxx/ATBA/dataset/clean_dataset/cr/"
peft_type = PeftType.PROMPT_TUNING
model_path = '/home/xxx/ATBA/models/teacher/cr/opt-6.7'
device = 'cuda:7'
num_epochs = 20

# peft_config = PromptTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
lr = 3e-4
datasets = load_from_disk(task)


In [2]:
from collections import defaultdict
import json
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)


def wrap_dataset(dataset, batch_size, shuffle=True):
    dataloader = defaultdict(list)
    for key in dataset.keys():
        if "validation" in key:         
            shuffle = False
        dataloader[key] = get_dataloader(dataset[key], batch_size=batch_size, shuffle=shuffle)
    return dataloader
    
def get_dataloader(dataset, batch_size, shuffle=True):
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, num_workers=4, drop_last=True)

def collate_fn(data):
    texts = []
    labels = []
    for item in data:
        texts.append(item['text'])
        labels.append(item['label'])
     
    labels = torch.LongTensor(labels)
    batch = {
        "text": texts,
        "label": labels,
    }
    return batch

def load_dataset(path, name):
    file_path = os.path.join(path, name+'.json')
    with open(file_path, 'rb') as file:
        data = json.load(file)
    return data 

clean_train_data = load_dataset(task, "train")
clean_validation_data = load_dataset(task, "validation")
clean_test_data = load_dataset(task, "test")
    

dataset = {        
    "train": clean_train_data,
    "validation": clean_validation_data,
    "test": clean_test_data
}
dataloader = wrap_dataset(dataset, batch_size=24) 




In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2,3,4,6,5,7'
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, num_labels=2, return_dict=True, output_hidden_states=True, device_map='balanced_low_0')
model = PeftModel.from_pretrained(model, model_path)
model.print_trainable_parameters()
# model.config.pad_token_id = model.config.eos_token_i
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at /home/models/opt-6.7b/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 8,192 || all params: 6,662,684,672 || trainable%: 0.00012295343999134409


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): OPTForSequenceClassification(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 4096, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
          (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (layers): ModuleList(
            (0-31): 32 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear(in_features=4096, out_features=4096, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=4096, out_features=4096, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B

In [4]:
optimizer = AdamW(params=model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(dataloader['train']) * num_epochs),
    num_training_steps=(len(dataloader['train']) * num_epochs),
)

In [5]:
inputs = tokenizer('I love this movies', return_tensors='pt', padding=True, max_length=128, truncation=True).to(device)
outpust = model(**inputs).logits
outpust


tensor([[-5.2608,  7.1732]], device='cuda:7', grad_fn=<IndexBackward0>)

In [6]:
from sklearn.metrics import accuracy_score
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def tensoizer(tensozier, text, label):
    input_t = tensozier(text, padding='max_length', truncation=True, max_length=128, return_tensors="pt").to(device)
    label = label.to(device)
    return input_t, label
ce = nn.CrossEntropyLoss()
best_acc = 0
for epoch in range(num_epochs):
    model.train()
    for step, item in enumerate(tqdm(dataloader['train'])):
        texts, labels = item['text'], item['label']
        c_t_inputs, c_t_labels = tensoizer(tokenizer, texts, labels)
        outputs = model(**c_t_inputs)
        loss = ce(outputs.logits, c_t_labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    acc = 0
    for step, item in enumerate(tqdm(dataloader['test'])):
        texts, labels = item['text'], item['label']
        c_t_inputs, c_t_labels = tensoizer(tokenizer, texts, labels)
        with torch.no_grad():
            outputs = model(**c_t_inputs)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, c_t_labels
        acc += accuracy_score(predictions.detach().cpu().numpy(), references.detach().cpu().numpy())    
    if acc > best_acc:
        model.save_pretrained(model_path)
    print(f"epoch {epoch}:", acc/len(dataloader['test']))

100%|██████████| 127/127 [05:29<00:00,  2.59s/it]
100%|██████████| 15/15 [00:38<00:00,  2.56s/it]


epoch 0: 0.9416666666666667


 11%|█         | 14/127 [00:37<05:02,  2.68s/it]


KeyboardInterrupt: 

In [7]:
model.save_pretrained(model_path)

