In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model


def load_model_tokenizer(model_name="meta-llama/Llama-3.2-3B-Instruct", pad_token_id=128004):
    """Load a checkpoint as a sequence classifier and wrap it with LoRA.

    Args:
        model_name: Hugging Face model id (or local path) of the checkpoint
            to load. Defaults to the model this notebook was written for.
        pad_token_id: Token id used for padding. It is applied to the
            tokenizer and then mirrored into the model config so that both
            agree on which positions are padding.

    Returns:
        A ``(lora_model, tokenizer)`` tuple: the PEFT-wrapped classifier and
        the tokenizer configured with the padding token.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): the default 128004 is presumably one of the reserved
    # special tokens of the Llama 3 vocabulary used as a dedicated pad
    # token - confirm against the tokenizer before switching `model_name`.
    tokenizer.pad_token_id = pad_token_id

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map='auto'
    )
    model.config.pad_token_id = tokenizer.pad_token_id

    lora_config = LoraConfig(
        # Declaring the task makes PEFT treat the newly initialised
        # classification head (`score`) as a module that is trained and
        # stored together with the adapter. Without it only the LoRA
        # matrices are written by `save_pretrained`, so an adapter
        # checkpoint reloaded later would get a randomly initialised head.
        task_type="SEQ_CLS",
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
    )
    lora_model = get_peft_model(model, lora_config)

    return lora_model, tokenizer


# Build the LoRA-wrapped classifier and its tokenizer once; later cells
# use these two module-level names.
model, tokenizer = load_model_tokenizer()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Read the labelled examples from the local JSON file, then hold out 20%
# for evaluation. The split is seeded so that a fresh kernel reproduces the
# same train/test partition (and therefore comparable accuracy numbers).
datasets = load_dataset("json", data_files="./data/classify_train.json", split="train")
datasets = datasets.train_test_split(test_size=0.2, seed=42)


def tokenize_function(sample):
    """Render one example into the chat-style prompt and tokenize it.

    Args:
        sample: A dataset row; its ``'content'`` field is placed in the
            user turn of the prompt.

    Returns:
        The result of calling the module-level ``tokenizer`` on the prompt.
    """
    # NOTE(review): the template has no `<|begin_of_text|>` marker and
    # `add_special_tokens=False` is passed, so the tokenizer is not asked to
    # add one either - confirm this matches the format used at inference.
    template = "".join((
        '<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>',
        '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>',
        '<|start_header_id|>assistant<|end_header_id|>\n\n',
    ))
    fields = {
        "system": "判断以下给出的消息，是否出自真人聊天？",
        "input": sample['content'],
    }
    prompt = template.format(**fields)
    return tokenizer(prompt, add_special_tokens=False)


# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every example of both splits with the prompt template.
datasets = datasets.map(function=tokenize_function)

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [3]:
# Hyper-parameters for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    output_dir='./outputs/temp',
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching: 4 examples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Reporting: log every 10 steps and evaluate on a step schedule.
    logging_steps=10,
    eval_strategy="steps",
    # Name of the label field passed to the Trainer.
    label_names=["labels"],
)


def compute_metrics(pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax over the last axis) and ``label_ids`` (the
            reference labels).

    Returns:
        A dict ``{"acc": <float>}`` holding the fraction of examples whose
        argmax prediction equals the label.
    """
    predicted_classes = pred.predictions.argmax(-1)
    hits = np.equal(predicted_classes, pred.label_ids).astype(np.float32)
    accuracy = hits.mean().item()
    return {"acc": accuracy}


# Wire the model, hyper-parameters, data splits, padding collator and the
# accuracy metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    compute_metrics=compute_metrics,
)

In [4]:
# Run fine-tuning; left as the last expression so the returned training
# summary is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Acc
10,0.9302,1.41183,0.285714
20,0.7991,1.27567,0.214286
30,0.7375,1.120536,0.214286
40,0.5621,0.948661,0.5
50,0.5102,0.797433,0.571429
60,0.3758,0.656808,0.642857
70,0.3534,0.585938,0.714286
80,0.2851,0.547433,0.714286
90,0.2965,0.527902,0.714286


TrainOutput(global_step=90, training_loss=0.5388671875, metrics={'train_runtime': 28.8148, 'train_samples_per_second': 54.139, 'train_steps_per_second': 3.123, 'total_flos': 1176782953242624.0, 'train_loss': 0.5388671875, 'epoch': 22.615384615384617})

In [5]:
# Switch read by the export cell: set to False to skip writing the
# checkpoint and merged model to disk.
save_or_not = True

In [6]:
if save_or_not:
    # Persist the trainer's model first, while the LoRA wrapper is intact.
    trainer.save_model("./outputs/checkpoint")
    # Fold the LoRA weights into the base model to get a standalone model.
    # NOTE(review): merging presumably modifies the wrapped model, so
    # `model` / `trainer` should be rebuilt before any further training -
    # TODO confirm.
    merged_model = model.merge_and_unload()
    # Store the merged weights and the tokenizer side by side so the
    # directory can be loaded on its own.
    merged_model.save_pretrained("./outputs/merge_model")
    tokenizer.save_pretrained("./outputs/merge_model")