In [1]:
import os
# import yaml

# Restrict this kernel to GPU index 2. This is set in the first cell so the
# variable is already in place when later cells import torch.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})
# with open("config.yaml", "r") as file:
#     hp = yaml.safe_load(file)

In [None]:
from datasets import Dataset
import pandas as pd

# Read the labelled training examples and drop the 'explination' column.
# NOTE(review): the column name is spelled exactly as in the original code;
# presumably it matches the field name inside training.json — confirm.
training_df = pd.read_json("dataset/training.json").drop(columns=['explination'])

# Hold out 20% for evaluation. The fixed seed makes the split identical after
# every kernel restart, so metrics from different runs are comparable.
dataset = Dataset.from_pandas(training_df).train_test_split(test_size=0.2, seed=42)
dataset

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
# Base checkpoint; the tokenizer cell reads this name as well.
model_name = 'taide/Llama3-TAIDE-LX-8B-Chat-Alpha1'

# 4-bit NF4 weights with double quantisation; compute dtype is fp16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantised backbone with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    quantization_config=quantization_config,
    device_map='auto',
    cache_dir='/HDD/model_cache',
)

# Rank-4 LoRA adapters attached to the q/k/v/o projection modules.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    bias='none',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
)

# Prepare the quantised model for training, then wrap it with the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [4]:
from transformers import AutoTokenizer
# Use the same cache directory as the model load and the inference-time
# tokenizer, so all files for this checkpoint live in one place.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
    cache_dir='/HDD/model_cache',
)

# EOS is reused as the padding token (both the token and its id).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def data_preprocesing(row):
    """Tokenize ``row['text']`` with the notebook's tokenizer.

    Inputs longer than 512 tokens are truncated. The function is passed to
    ``dataset.map(..., batched=True)``, so ``row['text']`` is presumably a
    list of strings rather than a single string — confirm against the caller.

    Returns whatever the tokenizer call returns for that input.
    """
    encoded = tokenizer(row['text'], max_length=512, truncation=True)
    return encoded

# Tokenize the train and test splits in batches; the raw 'text' column is
# removed from the result.
tokenized_data = dataset.map(
    data_preprocesing,
    remove_columns=['text'],
    batched=True,
)
# Switch the output format to torch.
tokenized_data.set_format(type="torch")
tokenized_data

In [6]:
import evaluate
import numpy as np
# F1 metric object, loaded once and reused by compute_metrics.
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute micro-averaged F1 for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each example is the index of its largest logit along the last axis.

    Returns the result of ``f1_metric.compute`` unchanged.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    # NOTE(review): average="micro" is used here, while the test section calls
    # sklearn's f1_score with its default averaging — the two numbers are not
    # directly comparable.
    return f1_metric.compute(
        references=references,
        predictions=predicted_classes,
        average="micro",
    )

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

training_args = TrainingArguments(
    # Where checkpoints go, and no external experiment tracker.
    output_dir='checkpoint',
    report_to="none",
    # Optimisation schedule: 1000 steps in total, the first 100 are warm-up.
    optim="adamw_torch",
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=100,
    max_steps=1000,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Log every step; evaluate and checkpoint on the same 200-step cadence.
    logging_steps=1,
    eval_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    # Pads each batch using the notebook's tokenizer.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding=True),
    compute_metrics=compute_metrics,
)

In [None]:
# Tell the model config which token id is padding (taken from the tokenizer)
# and turn off use_cache before training starts.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

# Put the model in training mode and run the fine-tuning loop.
model.train()
trainer.train()

In [9]:
# Save the trained PEFT model; the inference section loads it back from
# './fine-tuned model'.
model.save_pretrained(save_directory='fine-tuned model')

### Inference + Test

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import AutoPeftModelForSequenceClassification, PeftModel

base_model_name = 'taide/Llama3-TAIDE-LX-8B-Chat-Alpha1'
adapter_model_path = './fine-tuned model'

# Same tokenizer settings as in training: prefix space, EOS reused as padding.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    add_prefix_space=True,
    cache_dir='/HDD/model_cache',
)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Unquantised base model with a fresh two-label head.
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    cache_dir='/HDD/model_cache',
    num_labels=2,
)
# Mirror the training cell, which sets model.config.pad_token_id before
# training. Without this the reloaded config has no padding id, so the
# inference model is configured differently from the one that was trained.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Attach the fine-tuned adapter saved by the training section.
peft_model = PeftModel.from_pretrained(base_model, adapter_model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at taide/Llama3-TAIDE-LX-8B-Chat-Alpha1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Label mapping (num_labels=2): LABEL_1 = scam, LABEL_0 = not scam
from transformers import pipeline
# Inference only: switch the adapter-wrapped model to eval mode.
peft_model.eval()

# Text-classification pipeline over the fine-tuned model, placed on device 0.
classifier = pipeline(
    task='sentiment-analysis',
    model=peft_model,
    tokenizer=tokenizer,
    device=0,
)

The model 'PeftModelForSequenceClassification' is not supported for sentiment-analysis. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification',

In [1]:
import json
from pathlib import Path
TEST_DATA_PATH = 'dataset/test/label_test.json'


def get_test_dataset(path: str) -> tuple:
    """Load the labelled test file and split it into per-category datasets.

    The JSON file at ``path`` is a mapping from category key to a list of
    records. Each record is reduced to its ``text`` and ``label`` fields;
    any other fields are dropped.

    Args:
        path: Location of the UTF-8 encoded JSON test file.

    Returns:
        A 4-tuple ``(work, gambling, dating, investment)``, each a list of
        ``{"text": ..., "label": ...}`` dicts in file order.

    Raises:
        KeyError: If a category key, or a record's ``text``/``label`` field,
            is missing.
    """
    data: dict = json.loads(Path(path).read_text(encoding='utf-8'))
    # Keys exactly as the data file spells them (including 'emotional_datiing'),
    # listed in the order the caller unpacks the result.
    category_keys = ('work', 'gamble', 'emotional_datiing', 'investment')
    return tuple(
        [{"text": datapoint['text'], "label": datapoint['label']}
         for datapoint in data[key]]
        for key in category_keys
    )
# Unpack the four per-category test sets in the order get_test_dataset returns them.
work_dataset, gambling_dataset, dating_dataset, investment_dataset = get_test_dataset(
    TEST_DATA_PATH
)

In [19]:
from tqdm import tqdm
import json
from sklearn.metrics import f1_score
# Labels and predictions pooled across every category scored so far; they are
# reset whenever this cell is re-run and feed the overall F1 at the end.
total_true_labels = []
total_predictions = []


def get_score(category: str, dataset, max_length: int = 512):
    """Classify every item of one category, save the results and print its F1.

    Each item of ``dataset`` is updated in place with a ``prediction`` key
    (1 when the classifier answers ``LABEL_1``, otherwise 0) and a ``score``
    key (the classifier's score for that answer). The annotated dataset is
    written as JSON to ``test_result/test_result_{category}``, and the labels
    and predictions are appended to the module-level accumulators.

    Args:
        category: Name used in the output file name and the printed report.
        dataset: List of dicts with at least ``text`` and ``label`` keys.
        max_length: Token limit for truncation at inference. Defaults to 512,
            the limit used when tokenizing the training data, so long inputs
            are cut the same way in both phases.
    """
    import os  # local import so this cell does not rely on earlier cells

    for item in tqdm(dataset):
        prediction = classifier(
            item['text'], truncation=True, max_length=max_length
        )[0]
        item['prediction'] = 1 if prediction['label'] == 'LABEL_1' else 0
        item['score'] = prediction['score']

    # Create the output directory on first use instead of assuming it exists.
    os.makedirs('test_result', exist_ok=True)
    with open(f'test_result/test_result_{category}', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    true_labels = [item['label'] for item in dataset]
    predictions = [item['prediction'] for item in dataset]
    total_true_labels.extend(true_labels)
    total_predictions.extend(predictions)

    f1 = f1_score(y_true=true_labels, y_pred=predictions)
    print(f'category: {category}, F1 score: {f1}')

# Score each category in turn, then report F1 over the pooled predictions.
for category_name, category_dataset in (
    ('work', work_dataset),
    ('gambling', gambling_dataset),
    ('dating', dating_dataset),
    ('investment', investment_dataset),
):
    get_score(category_name, category_dataset)
print(f'total F1 {f1_score(y_true=total_true_labels, y_pred=total_predictions)}')

  1%|▏         | 10/750 [00:03<03:37,  3.41it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 750/750 [02:24<00:00,  5.18it/s]


category: work, F1 score: 0.7024952015355086


100%|██████████| 777/777 [02:04<00:00,  6.25it/s]


category: gambling, F1 score: 0.7350689127105666


100%|██████████| 452/452 [00:55<00:00,  8.20it/s]


category: dating, F1 score: 0.8287937743190662


 58%|█████▊    | 557/963 [02:18<01:41,  4.01it/s]


KeyboardInterrupt: 