In [1]:
import os
# import yaml

os.environ["CUDA_VISIBLE_DEVICES"] = '2'
# with open("config.yaml", "r") as file:
#     hp = yaml.safe_load(file)

In [2]:
from datasets import Dataset
import pandas as pd

dataset = pd.read_json("dataset/training.json").drop(['explination'], axis = 1)
dataset = Dataset.from_pandas(dataset).train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10808
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2702
    })
})

In [3]:
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, 
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True, 
    bnb_4bit_compute_dtype = torch.float16 
)
model_name = 'taide/Llama3-TAIDE-LX-8B-Chat-Alpha1'
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2,
    device_map='auto',
    cache_dir = '/HDD/model_cache'
)
lora_config = LoraConfig(
    r = 4, 
    lora_alpha = 8,
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.1, 
    bias = 'none',
    task_type = 'SEQ_CLS'
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at taide/Llama3-TAIDE-LX-8B-Chat-Alpha1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def data_preprocesing(row):
    return tokenizer(row['text'], truncation=True, max_length=512)

tokenized_data = dataset.map(data_preprocesing, batched=True, 
remove_columns=['text'])
tokenized_data.set_format("torch")
tokenized_data

Map:   0%|          | 0/10808 [00:00<?, ? examples/s]

Map:   0%|          | 0/2702 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 10808
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2702
    })
})

In [6]:
import evaluate
import numpy as np
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predictions, references=labels, average="micro")

In [7]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir = 'checkpoint',
    learning_rate = 1e-4,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    max_steps = 1000,
    logging_steps = 1,
    weight_decay = 0.01,
    eval_strategy = 'steps',
    save_strategy = 'steps',
    eval_steps = 200,
    save_steps = 200,
    warmup_steps=100,
    load_best_model_at_end = True,
    report_to="none",
    optim="adamw_torch"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding=True),
    eval_dataset=tokenized_data['test'],
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [8]:
model.config.use_cache=False
model.config.pad_token_id = tokenizer.pad_token_id
model.train()
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 1.6735, 'grad_norm': 22.211551666259766, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 1.2963, 'grad_norm': 19.946990966796875, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 1.8183, 'grad_norm': 20.949831008911133, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 1.2384, 'grad_norm': 18.321680068969727, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}
{'loss': 1.9452, 'grad_norm': 23.996442794799805, 'learning_rate': 5e-06, 'epoch': 0.01}
{'loss': 1.3556, 'grad_norm': 20.176342010498047, 'learning_rate': 6e-06, 'epoch': 0.02}
{'loss': 1.7919, 'grad_norm': 27.254220962524414, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}
{'loss': 1.5388, 'grad_norm': 22.652318954467773, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}
{'loss': 1.0209, 'grad_norm': 17.912622451782227, 'learning_rate': 9e-06, 'epoch': 0.03}
{'loss': 1.5022, 'grad_norm': 22.779104232788086, 'learning_rate': 1e-05, 'epoch': 0.03}
{'loss': 2.6335, 'grad_norm':

  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.43283793330192566, 'eval_f1': 0.7838638045891931, 'eval_runtime': 846.1197, 'eval_samples_per_second': 3.193, 'eval_steps_per_second': 0.1, 'epoch': 0.59}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3593, 'grad_norm': 14.374088287353516, 'learning_rate': 8.877777777777778e-05, 'epoch': 0.59}
{'loss': 0.474, 'grad_norm': 8.691773414611816, 'learning_rate': 8.866666666666668e-05, 'epoch': 0.6}
{'loss': 0.5121, 'grad_norm': 9.720526695251465, 'learning_rate': 8.855555555555556e-05, 'epoch': 0.6}
{'loss': 0.3244, 'grad_norm': 11.505767822265625, 'learning_rate': 8.844444444444445e-05, 'epoch': 0.6}
{'loss': 0.416, 'grad_norm': 15.675796508789062, 'learning_rate': 8.833333333333333e-05, 'epoch': 0.61}
{'loss': 0.3911, 'grad_norm': 33.70933532714844, 'learning_rate': 8.822222222222223e-05, 'epoch': 0.61}
{'loss': 0.4375, 'grad_norm': 16.429153442382812, 'learning_rate': 8.811111111111111e-05, 'epoch': 0.61}
{'loss': 0.307, 'grad_norm': 11.342657089233398, 'learning_rate': 8.800000000000001e-05, 'epoch': 0.62}
{'loss': 0.3204, 'grad_norm': 13.880233764648438, 'learning_rate': 8.78888888888889e-05, 'epoch': 0.62}
{'loss': 0.3631, 'grad_norm': 7.644885063171387, 'learning_rate':

  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.344729483127594, 'eval_f1': 0.843819393042191, 'eval_runtime': 846.4905, 'eval_samples_per_second': 3.192, 'eval_steps_per_second': 0.1, 'epoch': 1.18}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.3692, 'grad_norm': 10.36331558227539, 'learning_rate': 6.655555555555555e-05, 'epoch': 1.19}
{'loss': 0.2332, 'grad_norm': 4.724290370941162, 'learning_rate': 6.644444444444444e-05, 'epoch': 1.19}
{'loss': 0.5197, 'grad_norm': 7.217120170593262, 'learning_rate': 6.633333333333334e-05, 'epoch': 1.19}
{'loss': 0.1892, 'grad_norm': 15.670944213867188, 'learning_rate': 6.622222222222224e-05, 'epoch': 1.2}
{'loss': 0.1368, 'grad_norm': 2.4594459533691406, 'learning_rate': 6.611111111111111e-05, 'epoch': 1.2}
{'loss': 0.2583, 'grad_norm': 15.292116165161133, 'learning_rate': 6.6e-05, 'epoch': 1.2}
{'loss': 0.14, 'grad_norm': 2.8132176399230957, 'learning_rate': 6.588888888888889e-05, 'epoch': 1.2}
{'loss': 0.426, 'grad_norm': 19.32234764099121, 'learning_rate': 6.577777777777779e-05, 'epoch': 1.21}
{'loss': 0.3453, 'grad_norm': 15.108917236328125, 'learning_rate': 6.566666666666666e-05, 'epoch': 1.21}
{'loss': 0.3208, 'grad_norm': 25.405073165893555, 'learning_rate': 6.55555555555

  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.342351496219635, 'eval_f1': 0.847520355292376, 'eval_runtime': 846.1859, 'eval_samples_per_second': 3.193, 'eval_steps_per_second': 0.1, 'epoch': 1.78}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.321, 'grad_norm': 7.37649393081665, 'learning_rate': 4.433333333333334e-05, 'epoch': 1.78}
{'loss': 0.4496, 'grad_norm': 20.31473159790039, 'learning_rate': 4.422222222222222e-05, 'epoch': 1.78}
{'loss': 0.1342, 'grad_norm': 7.104060649871826, 'learning_rate': 4.4111111111111114e-05, 'epoch': 1.78}
{'loss': 0.2194, 'grad_norm': 5.511039733886719, 'learning_rate': 4.4000000000000006e-05, 'epoch': 1.79}
{'loss': 0.2151, 'grad_norm': 5.674297332763672, 'learning_rate': 4.388888888888889e-05, 'epoch': 1.79}
{'loss': 0.3339, 'grad_norm': 12.573616981506348, 'learning_rate': 4.377777777777778e-05, 'epoch': 1.79}
{'loss': 0.2934, 'grad_norm': 5.158769607543945, 'learning_rate': 4.3666666666666666e-05, 'epoch': 1.8}
{'loss': 0.1412, 'grad_norm': 3.6084353923797607, 'learning_rate': 4.355555555555556e-05, 'epoch': 1.8}
{'loss': 0.3468, 'grad_norm': 6.407821178436279, 'learning_rate': 4.344444444444445e-05, 'epoch': 1.8}
{'loss': 0.2575, 'grad_norm': 16.069202423095703, 'learning_rate

  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.3583008348941803, 'eval_f1': 0.8534418948926721, 'eval_runtime': 846.5001, 'eval_samples_per_second': 3.192, 'eval_steps_per_second': 0.1, 'epoch': 2.37}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.5591, 'grad_norm': 15.300568580627441, 'learning_rate': 2.211111111111111e-05, 'epoch': 2.37}
{'loss': 0.0734, 'grad_norm': 4.083217620849609, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.37}
{'loss': 0.3302, 'grad_norm': 16.737276077270508, 'learning_rate': 2.188888888888889e-05, 'epoch': 2.38}
{'loss': 0.2069, 'grad_norm': 6.334962368011475, 'learning_rate': 2.177777777777778e-05, 'epoch': 2.38}
{'loss': 0.1315, 'grad_norm': 7.969024658203125, 'learning_rate': 2.1666666666666667e-05, 'epoch': 2.38}
{'loss': 0.3225, 'grad_norm': 9.325034141540527, 'learning_rate': 2.1555555555555555e-05, 'epoch': 2.38}
{'loss': 0.1641, 'grad_norm': 8.319960594177246, 'learning_rate': 2.1444444444444443e-05, 'epoch': 2.39}
{'loss': 0.0992, 'grad_norm': 4.730285167694092, 'learning_rate': 2.1333333333333335e-05, 'epoch': 2.39}
{'loss': 0.3109, 'grad_norm': 10.840936660766602, 'learning_rate': 2.1222222222222223e-05, 'epoch': 2.39}
{'loss': 0.3456, 'grad_norm': 15.430804252624512, 'lear

  0%|          | 0/85 [00:00<?, ?it/s]

{'eval_loss': 0.3354032337665558, 'eval_f1': 0.8578830495928942, 'eval_runtime': 846.784, 'eval_samples_per_second': 3.191, 'eval_steps_per_second': 0.1, 'epoch': 2.96}
{'train_runtime': 35128.4898, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.028, 'train_loss': 0.40187835543602707, 'epoch': 2.96}


TrainOutput(global_step=1000, training_loss=0.40187835543602707, metrics={'train_runtime': 35128.4898, 'train_samples_per_second': 0.911, 'train_steps_per_second': 0.028, 'total_flos': 5.144574020372398e+17, 'train_loss': 0.40187835543602707, 'epoch': 2.9585798816568047})

In [9]:
model.save_pretrained('fine-tuned model')

### Inference + Test

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import AutoPeftModelForSequenceClassification, PeftModel

base_model_name = 'taide/Llama3-TAIDE-LX-8B-Chat-Alpha1'
adapter_model_path = './fine-tuned model'
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name, 
    add_prefix_space=True, 
    cache_dir = '/HDD/model_cache'
)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    cache_dir = '/HDD/model_cache',
    num_labels=2
)

peft_model = PeftModel.from_pretrained(base_model, adapter_model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at taide/Llama3-TAIDE-LX-8B-Chat-Alpha1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#LABEL_1: 詐騙 #LABEL_2: 非詐騙
from transformers import pipeline
peft_model.eval()
classifier = pipeline(
    model=peft_model,
    tokenizer=tokenizer,
    task='sentiment-analysis',
    device=0,
)
classifier(work_dataset[0]['text'])

In [42]:
import json
from pathlib import Path
def get_evaluation_dataset(path:str)->dict:
    data: list[dict] = json.loads(Path(path).read_text(encoding='utf-8'))['thread']
    dataset = [{"text": datapoint['text'], "label": datapoint['label']} for datapoint in data]
    return dataset
work_dataset = get_evaluation_dataset('labeled_datasets/data_1.json')
gambling_dataset = get_evaluation_dataset('labeled_datasets/data_2.json')
dating_dataset = get_evaluation_dataset('labeled_datasets/data_3.json')
investment_dataset = get_evaluation_dataset('labeled_datasets/data_4.json')

In [44]:
from tqdm import tqdm
import json
from sklearn.metrics import f1_score
total_true_labels = []
total_predictions = []
def get_score(category: str, dataset):
    for i in tqdm(range(len(dataset))):
        prediction = classifier(dataset[i]['text'])[0]
        predicted_label = 1 if prediction['label'] == 'LABEL_1' else 0
        score = prediction['score']
        dataset[i]['prediction'] = predicted_label
        dataset[i]['score'] = score
    with open(f'test_result/test_result_{category}', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    true_labels = [item['label'] for item in dataset]
    predictions = [item['prediction'] for item in dataset]
    total_true_labels.extend(true_labels)
    total_predictions.extend(predictions)
    f1 = f1_score(y_true=true_labels, y_pred=predictions)
    print(f'category: {category}, F1 score: {f1}')

get_score('work', work_dataset)
get_score('gambling', gambling_dataset)
get_score('dating', dating_dataset)
get_score('investment', investment_dataset)
print(f'total F1 {f1_score(y_true=total_true_labels, y_pred=total_predictions)}')

100%|██████████| 33/33 [00:05<00:00,  6.36it/s]


category: work, F1 score: 0.8421052631578947


100%|██████████| 47/47 [00:07<00:00,  6.71it/s]


category: gambling, F1 score: 0.6666666666666666


100%|██████████| 56/56 [00:05<00:00,  9.95it/s]


category: dating, F1 score: 0.6557377049180327


100%|██████████| 48/48 [00:08<00:00,  5.70it/s]

category: investment, F1 score: 0.8260869565217391
total F1 0.7307692307692307



