In [None]:
# from unsloth import FastLanguageModel
# from ollama import chat
from datasets import Dataset

In [None]:
import re
import datetime
import itertools
import os
import pandas as pd
import numpy as np

from ast import literal_eval

# import src.inference as inference
# from src.utils import run

In [None]:
# create data for bert

# s_aduc_cd_fd = r'(?<=\[SENTENCE\]: )(.*?)(?=\n\[FULL TEXT\]:)'
# for ed s = r'(?<=\[ARGUMENT\]: )(.*?)(?=\n\[SENTENCE\]:)' if [ARGUMENT] is needed
# for et s = r'(?<=\[CLAIM\]: )(.*?)(?=\n\[SENTENCE\]:)' if [CLAIM] is needed
# s_ed_et_aqa_sd = r'(?<=\[SENTENCE\]: )(.*)'
# fd s = # r'(?<=\[SENTENCE\]: )(.*?)(?=\n\[FULL TEXT\]:)',

def get_regex(task:str):
    """Return the extraction pattern(s) registered for ``task``.

    Most tasks map to a single regex string that captures the text following
    the ``[SENTENCE]: `` marker. ``'relation'`` maps to a dict with one pattern
    for the ``[SOURCE]`` text (``'src'``) and one for the ``[TARGET]`` text
    (``'trg'``). An unregistered task name yields ``None``.
    """
    # Sentence followed by a "[FULL TEXT]:" section on the next line.
    sentence_before_full_text = r'(?<=\[SENTENCE\]: )(.*?)(?=\n\[FULL TEXT\]:)'
    # Sentence is the last field: capture up to the end of the line.
    sentence_to_end = r'(?<=\[SENTENCE\]: )(.*)'

    patterns = {}
    patterns['aduc'] = sentence_before_full_text
    patterns['claim_detection'] = sentence_before_full_text
    patterns['evidence_detection'] = sentence_to_end
    patterns['evidence_type'] = sentence_to_end
    # Same shape as the aduc pattern, but (?s) lets '.' cross newlines.
    patterns['fallacies'] = r'(?s)(?<=\[SENTENCE\]: )(.*?)(?=\n\[FULL TEXT\]:)'
    patterns['quality'] = sentence_to_end
    patterns['relation'] = {
        'src': r'(?<=\[SOURCE\]: )(.*?)(?=\n\[TARGET\]:)',
        'trg': r'(?<=\[TARGET\]: )(.*)',
    }
    patterns['stance_detection'] = sentence_to_end
    return patterns.get(task)

def parse_sentence(x, s):
    """Extract the text matched by regex ``s`` from a row's second message.

    ``x`` must expose a ``'conversations'`` sequence whose element at index 1
    is a mapping with a ``'content'`` entry; the pattern is searched in that
    content and the whole match is returned.

    Raises:
        ValueError: when the pattern does not match (the content and the
            pattern are printed first to help locate the offending row).
    """
    conv = x['conversations']
    content = conv[1].get('content')
    match = re.search(s, content)
    if match is None:
        print(f'Error: {match}, {conv[1].get("content")}, {s}')
        raise ValueError
    return match.group()

def create_data_bert(task_name:str, path_src:str, path_trg:str):
    """Convert the sampled CSV files of one task into the flat format used for BERT.

    Every file found under ``path_src`` (recursively) whose name does not
    contain ``'labels'`` is read as CSV, with the ``'conversations'`` and
    ``'answer'`` columns parsed through ``literal_eval``. The
    ``'conversations'`` column is then replaced by the text extracted with the
    task's regex. For ``'relation'`` the source text goes to
    ``'conversations'`` and the target text to a new ``'trg'`` column.
    The result is written to ``path_trg`` under the same file name.

    Args:
        task_name: key understood by ``get_regex``.
        path_src: directory holding the sampled CSV files of the task.
        path_trg: output directory; created if it does not exist yet.

    Raises:
        ValueError: propagated from ``parse_sentence`` when a row does not
            match the task's pattern.
    """
    converter = {'conversations': literal_eval, 'answer': literal_eval}
    # The pattern only depends on the task, so look it up once, not per file.
    task_regex = get_regex(task_name)
    for root, _dirs, files in os.walk(path_src):
        for file in files:
            if 'labels' in file:
                continue
            df = pd.read_csv(os.path.join(root, file), converters=converter)
            if task_name != 'relation':
                df['conversations'] = df.apply(
                    lambda row: parse_sentence(row, task_regex),
                    axis=1,
                )
            else:
                # Extract both columns before overwriting 'conversations',
                # since the target extraction still needs the raw messages.
                src_data = df.apply(
                    lambda row: parse_sentence(row, task_regex.get('src')),
                    axis=1,
                )
                trg_data = df.apply(
                    lambda row: parse_sentence(row, task_regex.get('trg')),
                    axis=1,
                )
                df['conversations'] = src_data
                df['trg'] = trg_data
            # DataFrame.to_csv does not create missing directories.
            os.makedirs(path_trg, exist_ok=True)
            df.to_csv(f'{path_trg}/{file}', index=False)

def parse_data(path:str):
    """Run ``create_data_bert`` for every task directory found under ``path``.

    Each sub-directory name is used as the task name and its converted files
    are written to ``spl_bert/<name>``. Directories whose name contains
    ``'mt_ft'`` are ignored. Directories with no pattern registered in
    ``get_regex`` (for example a folder of merged splits stored next to the
    task folders) are skipped with a notice instead of failing inside
    ``re.search`` with a ``None`` pattern.
    """
    for root, dirs, _files in os.walk(path):
        for dir_name in dirs:
            if 'mt_ft' in dir_name:
                continue
            if get_regex(dir_name) is None:
                print(f'Skipping {os.path.join(root, dir_name)}: no regex registered for this name')
                continue
            create_data_bert(dir_name, os.path.join(root, dir_name), f'spl_bert/{dir_name}')
parse_data('./sampled_data')

In [None]:
from transformers import AutoTokenizer 
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from datasets import Dataset

def get_bert_data(task_name:str):
    """Load the train and test CSVs of a task and normalise their column names.

    Reads ``./spl_bert/<task_name>/spl2_train.csv`` and ``spl2_test.csv``.

    Returns:
        ``(train_data, test_data)`` DataFrames restricted to the model inputs:
        columns ``text`` and ``label`` for single-sentence tasks, or
        ``text_a``, ``text_b`` and ``label`` for ``'relation'``.
    """
    train_data = pd.read_csv(f'./spl_bert/{task_name}/spl2_train.csv')
    test_data = pd.read_csv(f'./spl_bert/{task_name}/spl2_test.csv')
    if task_name != 'relation':
        column_map = {'conversations': 'text', 'single_ans': 'label'}
    else:
        column_map = {'conversations': 'text_a', 'trg': 'text_b', 'single_ans': 'label'}
    source_columns = list(column_map)
    # `columns=` is required: a bare mapping passed to rename() targets the
    # index labels and would leave the column names untouched.
    train_data = train_data[source_columns].rename(columns=column_map)
    test_data = test_data[source_columns].rename(columns=column_map)
    return train_data, test_data

def tokenize_fn(batch, tokenizer, max_length=1024):
    """Tokenize a batch, padding and truncating every example to ``max_length``.

    Args:
        batch: mapping of column name to list of values. Either a ``'text'``
            column (single-sentence tasks) or a ``'text_a'`` / ``'text_b'``
            pair (the column names ``get_bert_data`` uses for 'relation').
        tokenizer: callable invoked as ``tokenizer(text[, text_pair], **kw)``.
        max_length: fixed sequence length; defaults to the previous
            hard-coded value of 1024.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if 'text' in batch:
        inputs = (batch['text'],)
    else:
        # Sentence pair: passed as the first two positional arguments.
        inputs = (batch['text_a'], batch['text_b'])
    return tokenizer(
        *inputs,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

def get_datasets(tokenizer, tokenize_fn, task_name:str):
    """Build the tokenized train and test datasets of a task.

    The frames returned by ``get_bert_data`` are converted to ``Dataset``
    objects, tokenized in batches with ``tokenize_fn(batch, tokenizer)``,
    their ``'label'`` column is class-encoded, the raw text columns are
    dropped and the output format is set to ``'torch'``.

    Returns:
        ``(train_dataset, test_dataset)``.
    """
    train_data, test_data = get_bert_data(task_name)
    encoded = []
    for frame in (train_data, test_data):
        data = Dataset.from_pandas(frame)
        data = data.map(lambda x: tokenize_fn(x, tokenizer), batched=True)
        # NOTE(review): each split is encoded on its own; presumably both
        # splits contain the same label set so the ids line up - confirm.
        data = data.class_encode_column(column='label')
        # Drop whichever raw text columns exist: 'text' for single-sentence
        # tasks, 'text_a'/'text_b' for sentence-pair data.
        text_columns = [
            name for name in ('text', 'text_a', 'text_b')
            if name in data.column_names
        ]
        data = data.remove_columns(text_columns)
        data.set_format('torch')
        encoded.append(data)
    train_dataset, test_dataset = encoded
    return train_dataset, test_dataset

def compute_metrics(pred):
    """Compute weighted precision/recall/F1 and accuracy for a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (scores whose argmax over the last axis is the predicted class).

    Returns:
        dict with keys ``'precision'``, ``'recall'``, ``'f1'`` and ``'acc'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned element (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = dict(zip(('precision', 'recall', 'f1'), weighted[:3]))
    metrics['acc'] = accuracy_score(y_true, y_pred)
    return metrics

# task_list=[
#         'aduc',
#         'claim_detection',
#         'evidence_detection',
#         'evidence_type',
#         'fallacies',
#         'quality',
#         # 'relation',
#         'stance_detection'
#     ]

# for t in task_list:
#     print(t)
#     train, test = get_bert_data(t)
#     if train['text'].isnull().any():
#         print(t, 'train')
#     if test['text'].isnull().any():
#         print(t, 'test')

# train, test = get_bert_data('claim_detection')

In [None]:
# bert training
# Fine-tunes a sequence-classification model on one task, evaluates it on the
# test split and stores the resulting metrics as CSV.
# TODO : redo sample for claim detection and relation before training and test bert
task_name = 'quality'
output_dir = f'./outputs/{task_name}/deberta_{task_name}'
model_name = 'microsoft/deberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    # NOTE(review): hard-coded; presumably the class count of the 'quality'
    # task - confirm and adjust whenever task_name changes.
    num_labels=3,
)

# Tokenized, label-encoded train and test splits for the selected task.
train_dataset, test_dataset = get_datasets(tokenizer, tokenize_fn, task_name)

# raise ValueError
training_args = TrainingArguments(
    output_dir=output_dir,
    # No evaluation during training; the test split is evaluated once below.
    eval_strategy="no",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    learning_rate=3e-5,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    logging_steps=1,
    report_to="tensorboard",
    save_strategy="epoch",
    # load_best_model_at_end=True,
    # eval_on_start=True,
    remove_unused_columns=True,
)

# No eval_dataset is passed here; compute_metrics is applied by the explicit
# trainer.evaluate(test_dataset) call below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Test the model
test_results = trainer.evaluate(test_dataset)
print(test_results)
test_results
# One row per metric, written with a single 'score' column.
# NOTE(review): the ./test_res/<task_name>/ directory is not created here -
# presumably it already exists; confirm.
test_df = pd.Series(data=test_results)
test_df.to_csv(f'./test_res/{task_name}/deberta_res_{task_name}.csv', header=['score'])

In [None]:
def get_all_f1(all_result:dict):
    """Collect the first ``'score_all_data'`` value of every task.

    Args:
        all_result: mapping of task name to a scores dict holding a
            ``'score_all_data'`` sequence. For ``'fallacies'`` the value is a
            pair of such dicts, reported as ``'fallacies_single'`` (index 0)
            and ``'fallacies_multi'`` (index 1).

    Returns:
        dict mapping each (possibly expanded) task name to element 0 of its
        ``'score_all_data'`` entry.
    """
    f1_by_task = {}
    for task, scores in all_result.items():
        if task == 'fallacies':
            single_scores, multi_scores = scores[0], scores[1]
            f1_by_task['fallacies_single'] = single_scores.get('score_all_data')[0]
            f1_by_task['fallacies_multi'] = multi_scores.get('score_all_data')[0]
            continue
        f1_by_task[task] = scores.get('score_all_data')[0]
    return f1_by_task

In [None]:
# Run Training, Test and saving 
# model, tokenizer = run('mt_ft', do_training=True)

In [None]:
# all_result = inference.inference_on_all_data(
#     model_for_task='aduc',
#     model_to_use='fine-tuned'
# )
# all_result

In [None]:
# model.save_pretrained_gguf(
#     './gguf_model/merged_gguf',
#     tokenizer,
#     quantization_method = "q8_0", # choose quant method: q8_0
# )

### Things kept just in case

In [None]:
# Temp cell 
import numpy as np
# Hard-coded example of a per-task result dictionary.
# Every score tuple is ordered (F1, Precision, Recall).
# Each task maps label strings to their score tuple, plus a 'score_all_data'
# entry; 'fallacies' holds a pair of such dicts instead of a single one.
tmp = {
    'aduc': {
        "['Premise']": (np.float64(0.9090909090909091), np.float64(0.8333333333333334),np.float64(1.0)),
        "['Claim']": (np.float64(0.888888888888889), np.float64(1.0), np.float64(0.8)),
        'score_all_data': (np.float64(0.9), np.float64(0.9), np.float64(0.9))
    },
    'claim_detection': {
        "['Non-claim']": (np.float64(0.8571428571428571),np.float64(0.75), np.float64(1.0)),
        "['Claim']": (np.float64(0.6666666666666666), np.float64(1.0), np.float64(0.5)),
        'score_all_data': (np.float64(0.8000000000000002), np.float64(0.8), np.float64(0.8))
    },
     'evidence_detection': {
        "['Evidence']": (np.float64(0.6666666666666665), np.float64(0.75), np.float64(0.6)),
        "['Non-evidence']": (np.float64(0.7272727272727272), np.float64(0.6666666666666666), np.float64(0.8)),
        'score_all_data': (np.float64(0.7), np.float64(0.7), np.float64(0.7))
    },
    'evidence_type': {
        "['STUDY']": (0, 0, 0),
        "['ANECDOTAL']": (np.float64(0.4),np.float64(0.5), np.float64(0.3333333333333333)),
        "['NONE']": (0, 0, 0), "['EXPLANATION']": (0, 0, 0),
        'score_all_data': (np.float64(0.125), np.float64(0.16666666666666666), np.float64(0.1))},
    # Pair of dicts: get_all_f1 reports index 0 as 'fallacies_single' and
    # index 1 as 'fallacies_multi'.
    'fallacies': (
        {
            "['AWP']": (np.float64(1.0), np.float64(1.0), np.float64(1.0)),
            "['AA']": (0, 0, 0),
            "['AR']": (0, 0, 0),
            'score_all_data': (np.float64(0.5), np.float64(1.0), np.float64(0.3333333333333333))},
        {
            "['AN', 'FA', 'EQ', 'AT']": (0, 0, 0.0),
            "['FA', 'STM']": (np.float64(0.6666666666666666), np.float64(1.0), 0.5),
            "['FC', 'HG']": (0, 0, 0.0),
            "['COS', 'GA', 'HG']": (0, 0, 0.0),
            "['FC', 'GA', 'AH']": (0, 0, 0.0),
            'score_all_data': (np.float64(0.19047619047619047),np.float64(0.2857142857142857),np.float64(0.14285714285714285))
        }
    ),
    'quality': {
        "['Average']": (0, 0, 0),
        "['Low']": (np.float64(0.6), np.float64(0.5), np.float64(0.75)),
        'score_all_data': (np.float64(0.37499999999999994), np.float64(0.5), np.float64(0.3))
    },
    'relation': {
        "['no relation']": (np.float64(0.4000000000000001), np.float64(0.4), np.float64(0.4)),
        "['support']": (np.float64(0.3333333333333333), np.float64(0.5), np.float64(0.25)),
        "['attack']": (np.float64(0.5), np.float64(0.3333333333333333), np.float64(1.0)),
        'score_all_data': (np.float64(0.4000000000000001), np.float64(0.4), np.float64(0.4))
    },
    'stance_detection': {
        "['Against']": (np.float64(0.5454545454545454), np.float64(0.6), np.float64(0.5)),
        "['For']": (np.float64(0.6666666666666666), np.float64(1.0), np.float64(0.5)),
        'score_all_data': (np.float64(0.588235294117647), np.float64(0.7142857142857143), np.float64(0.5))
    }
}

In [None]:
# Things for sampling examples in the report:
# class balance of the claim-detection samples, per source dataset and split.

def _split_by_label(split_df, dataset_name):
    """Return (all rows, 'Claim' rows, 'Non-claim' rows) of one source dataset."""
    subset = split_df[split_df['datasets'] == dataset_name]
    claims = subset[subset['single_ans'] == 'Claim']
    non_claims = subset[subset['single_ans'] == 'Non-claim']
    # Sanity check: every row of the subset carries one of the two labels.
    assert len(subset) == (len(claims) + len(non_claims))
    return subset, claims, non_claims


def _print_split_sizes(title, train_parts, val_parts, test_parts):
    """Print total / Claim / Non-claim row counts of the three splits."""
    print(title)
    templates = (
        'Total Train: {} | Total Val: {} | Total Test: {}',
        'Train Claim: {} | Val Claim: {} | Test Claim: {}',
        'Train Non-claim: {} | Val Non-claim: {} | Test Non-claim: {}',
    )
    # zip() regroups the (all, claim, non-claim) triples into one
    # (train, val, test) row per template.
    for template, frames in zip(templates, zip(train_parts, val_parts, test_parts)):
        print(template.format(*(len(frame) for frame in frames)))


# load sample for cd
train_spl_cd = pd.read_csv('./sampled_data/claim_detection/spl2_train.csv')
val_spl_cd = pd.read_csv('./sampled_data/claim_detection/spl2_val.csv')
test_spl_cd = pd.read_csv('./sampled_data/claim_detection/spl2_test.csv')

# slice iam_claim
train_iam_claim, train_iam_claim_cl, train_iam_claim_nn_cl = _split_by_label(train_spl_cd, 'iam_claim')
val_iam_claim, val_iam_claim_cl, val_iam_claim_nn_cl = _split_by_label(val_spl_cd, 'iam_claim')
test_iam_claim, test_iam_claim_cl, test_iam_claim_nn_cl = _split_by_label(test_spl_cd, 'iam_claim')

# slice ibm_claim
train_ibm_claim, train_ibm_claim_cl, train_ibm_claim_nn_cl = _split_by_label(train_spl_cd, 'ibm_claim')
val_ibm_claim, val_ibm_claim_cl, val_ibm_claim_nn_cl = _split_by_label(val_spl_cd, 'ibm_claim')
test_ibm_claim, test_ibm_claim_cl, test_ibm_claim_nn_cl = _split_by_label(test_spl_cd, 'ibm_claim')

# slice ibm_args
train_ibm_args, train_ibm_args_cl, train_ibm_args_nn_cl = _split_by_label(train_spl_cd, 'ibm_args')
val_ibm_args, val_ibm_args_cl, val_ibm_args_nn_cl = _split_by_label(val_spl_cd, 'ibm_args')
test_ibm_args, test_ibm_args_cl, test_ibm_args_nn_cl = _split_by_label(test_spl_cd, 'ibm_args')

# Display
_print_split_sizes(
    'Iam Claim',
    (train_iam_claim, train_iam_claim_cl, train_iam_claim_nn_cl),
    (val_iam_claim, val_iam_claim_cl, val_iam_claim_nn_cl),
    (test_iam_claim, test_iam_claim_cl, test_iam_claim_nn_cl),
)
_print_split_sizes(
    'Ibm Claim',
    (train_ibm_claim, train_ibm_claim_cl, train_ibm_claim_nn_cl),
    (val_ibm_claim, val_ibm_claim_cl, val_ibm_claim_nn_cl),
    (test_ibm_claim, test_ibm_claim_cl, test_ibm_claim_nn_cl),
)
_print_split_sizes(
    'Ibm args',
    (train_ibm_args, train_ibm_args_cl, train_ibm_args_nn_cl),
    (val_ibm_args, val_ibm_args_cl, val_ibm_args_nn_cl),
    (test_ibm_args, test_ibm_args_cl, test_ibm_args_nn_cl),
)

In [None]:
# Merge all samples into 3 distinct sets: Train, Validation and Test
import os
# Root directory that is walked recursively for the sampled CSV files.
path_to_sampled_data = './sampled_data'
def load_all_sampled_csv(path_sample_dir, exclude_dirs=('all_spl_data',)):
    """Concatenate every sampled CSV under ``path_sample_dir`` by split.

    The directory is walked recursively. Files whose name contains
    ``'labels'`` are ignored; the others are read as CSV and added to each
    split (``'train'``, ``'val'``, ``'test'``) whose name occurs in the file
    name.

    Args:
        path_sample_dir: root directory of the sampled data.
        exclude_dirs: directory names that are not descended into. The
            default skips ``all_spl_data``, where this notebook stores the
            merged splits inside the same tree; otherwise a re-run would
            read its own previous output and duplicate every row.

    Returns:
        ``(all_train, all_val, all_test)`` DataFrames.

    Raises:
        ValueError: from ``pd.concat`` when no file was found for a split.
    """
    frames = {'train': [], 'val': [], 'test': []}
    for root, dirs, files in os.walk(path_sample_dir):
        # Pruning `dirs` in place stops os.walk from entering those folders.
        dirs[:] = [name for name in dirs if name not in exclude_dirs]
        for file in files:
            if 'labels' in file:
                continue
            for split, bucket in frames.items():
                if split in file:
                    bucket.append(pd.read_csv(os.path.join(root, file)))
    all_train = pd.concat(frames['train'])
    all_val = pd.concat(frames['val'])
    all_test = pd.concat(frames['test'])
    return all_train, all_val, all_test

# Merged train / validation / test frames built from the CSV files found
# under path_to_sampled_data.
train, val, test = load_all_sampled_csv(path_to_sampled_data)

In [None]:
# Write the 3 merged sets to CSV.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first.
os.makedirs('./sampled_data/all_spl_data', exist_ok=True)
train.to_csv('./sampled_data/all_spl_data/spl2_train.csv', index=False)
val.to_csv('./sampled_data/all_spl_data/spl2_val.csv', index=False)
test.to_csv('./sampled_data/all_spl_data/spl2_test.csv', index=False)

In [None]:
# test cell for ollama all data inference
# time = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
# s_file = get_savefile(
#     task_name='aduc',
#     spl_name='spl2',
#     m_name='Meta-Llama-3.1-8B-Instruct',
#     n_sample=4000,
#     epoch=2,
#     train_resp=f'_ollama_{model_task_name}',
#     outputs_dir=f'./outputs/test_aduc',
#     time=time
# )
# print(f'##### Load Data')
# labels, tr_d, val_d, test_d, change_lbl = get_data_for_task(
#     task_name='aduc',
#     s_file=s_file,
# )
# res = []
# s = '<[|]ANSWER[|]>'
# names_dataset = test_d['datasets']
# true_labels = test_d['answer']
# print(f'##### Start Inference')
# for i in test_d['conversations'][:2]:
#     print(i)
#     # tmp = {'role': 'user', 'content': i[0].get('content') +'\n' + i[1].get('content')}
#     # print(tmp)
#     response = chat(model='unsloth_model', messages=i)
#     print(response['message']['content'])
#     print(f'###############################')

In [None]:
# Load Checkpoint : Parameters
# task = 'claim_detection'
# task_data = 'claim_detection'
# task_title = 'Claim Detection'
# ckpt_name = checkpoint.get(task)
# gguf_file = f'./gguf_model/{task}'
# max_seq_lenght = 2048
# dtype = None
# load_in_4bit = True
# gpu_mem_use = 0.6
# system_prompt = d_sys_prt.get(task_data)
# chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

# {SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>

# {INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# {OUTPUT}<|eot_id|>"""
# time = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
# s_file = get_savefile(
#     task_name=task_data,
#     spl_name='spl2',
#     m_name='Meta-Llama-3.1-8B-Instruct',
#     n_sample=4000,
#     epoch=2,
#     train_resp=f'_train_resp',
#     outputs_dir=f'./outputs/test_{task}',
#     time=time
# )
# match task_data:
#     case 'aduc':
#         labels, tr_d, val_d, test_d = aduc.get_data(s_file)
#         change_lbl = aduc.change_lbl
#     case 'claim_detection':
#         labels, tr_d, val_d, test_d = cd.get_data(s_file)
#         change_lbl = cd.change_lbl
#     case 'evidence_detection':
#         labels, tr_d, val_d, test_d = ed.get_data(s_file)
#         change_lbl = ed.change_lbl
#     case 'evidence_type':
#         labels, tr_d, val_d, test_d = et.get_data(s_file)
#         change_lbl = et.change_lbl
#     case 'fallacies':
#         labels, tr_d, val_d, test_d = fd.get_data(s_file)
#         change_lbl = fd.change_lbl
#     case 'relation':
#         labels, tr_d, val_d, test_d = arc.get_data(s_file)
#         change_lbl = arc.change_lbl
#     case 'stance_detection':
#         labels, tr_d, val_d, test_d = sd.get_data(s_file)
#         change_lbl = sd.change_lbl
#     case 'quality':
#         labels, tr_d, val_d, test_d = aq.get_data(s_file)
#         change_lbl = aq.change_lbl
# # labels, tr_d, val_d, test_d = get_data(s_file)
# print(system_prompt)
# print(ckpt_name)
# print(gguf_file)
# print(change_lbl)


In [None]:
# Load Checkpoint : Load Model
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=ckpt_name,
#     max_seq_length=max_seq_lenght,
#     dtype=dtype,
#     load_in_4bit=load_in_4bit,
#     fast_inference=True,
#     gpu_memory_utilization=gpu_mem_use
# )

In [None]:
# Load Checkpoint : Get Datasets
# train_set, val_set, test_set = get_datasets(
#     tokenizer=tokenizer,
#     train=tr_d,
#     val=val_d,
#     test=test_d, 
#     chat_template=chat_template,
#     sys_prt=system_prompt
# )

In [None]:
# Test on fewer data
# from datasets import Dataset
# tmp_set = test_set[:10]
# tmp_set = pd.DataFrame().from_records(tmp_set)
# tmp_set = Dataset.from_pandas(tmp_set)
# tmp_set

In [None]:
# Load Checkpoint : Run On test_set
# from src.training import test
# res_test_chpt = test(
#     model=model,
#     tokenizer=tokenizer,
#     data_test=test_set,
#     labels=labels,
# )
# metric, metric_m = get_metrics(change_lbl, res_test_chpt)
# metric, _ = get_metrics(change_lbl, res_test_chpt, is_multi_lbl=False)
# plot_metric(
#     metric=metric,
    # title=f'{task_title}: Scores ckpt{nbckpt}',
    # file_plot=f'./img/{task}/scores_ckpt{nbckpt}_metric_single.png',
    # file_metric=f'./test_res/{task}/scores_ckpt{nbckpt}_metric_single.csv'
# )
# plot_metric(
#     metric=metric_m
# )


In [None]:
# model.save_pretrained_merged(
#     './saved_model/claim_detection',
#     tokenizer,
#     save_method="merged_16bit",
# )

In [None]:
# model.save_pretrained_merged(
#     './saved_model/claim_detection/saved_lora',
#     tokenizer,
#     save_method="lora",
# )

In [None]:
# model.save_pretrained_gguf(
#     gguf_file,
#     tokenizer,
#     quantization_method = "q8_0", # choose quant method: q8_0
# )

In [None]:
# from ollama import chat

# res = []
# s = '<[|]ANSWER[|]>'
# quant = 'q8_0'
# names_dataset = test_d['datasets']
# true_labels = test_d['answer']
# # print(len(test_d['conversations']))
# for i in test_d['conversations']:
#     response = chat(model='unsloth_model', messages=[i[1]])
#     print(response['message']['content'])
#     tmp = re.split(s, response['message']['content'])
#     res.append(tmp[1])
# d_res = {'names': names_dataset, 'pred': res, 'lbl': true_labels}
# df_res = pd.DataFrame(data=d_res)
# df_res.to_csv(f'./test_res/{task}/test_result_gguf_{quant}.csv', index=False)

In [None]:
# metric_single, metric_multi = get_metrics(change_lbl, df_res)
# metric_single, _ = get_metrics(change_lbl, df_res, is_multi_lbl=False)
# plot_metric(
#     metric=metric_single,
    # title=f'{task_title}: Score gguf_{quant} single label',
    # file_plot=f'./img/{task}/scores_gguf_{quant}_metric_single.png',
    # file_metric=f'./test_res/{task}/scores_gguf_{quant}_metric_single.csv'
# )
# For Fallacies Task Only
# plot_metric(
#     metric=metric_multi,
#     title=f'{task_title}: Score gguf_{quant} multi label',
#     file_plot=f'./img/{task}/scores_gguf_{quant}_metric_multi.png',
#     file_metric=f'./test_res/{task}/scores_gguf_{quant}_metric_multi.csv'
# )

In [None]:
# Gen via VLLM
# def gen(txt, model, sampling_params):
#     output = model.fast_generate(
#         txt,
#         sampling_params = sampling_params,
#     )[0].outputs[0].text
    
#     return output

# def format_output(answer: str, fallacies: set) -> list:
#     s = '<[|]ANSWER[|]>'
#     tmp = re.split(s, answer)
#     pred= [i for i in tmp if i in fallacies]
#     return pred

# def zero_shot_gen(data: list[str], model, fallacies: set, sampling_params) -> list:
#     res = []
#     for i in data:
#         out = gen(i, model, sampling_params)
#         pred = format_output(out, fallacies)
#         res.append(pred)
#     return res

In [None]:
# class custom_validation_callback(TrainerCallback):
#     def __init__(self, data, sampling_params, fallacies, n_step=10):
#         super().__init__()
#         self.val_dataset = data
#         self.sampling_params = sampling_params
#         self.fallacies = fallacies
#         self.n_step=n_step
#     def on_step_end(self, args, state, control, **kwargs):
#         if state.global_step % self.n_step == 0 and state.global_step > 0 :
#             model.save_lora('sft_save_lora')
#             FastLanguageModel.for_inference(model)
#             pred = zero_shot_gen(
#                 data=self.val_dataset['text'],
#                 model=model,
#                 fallacies=self.fallacies,
#                 sampling_params=self.sampling_params
#             )
#             tmp_pred = [i if i != [] else ['Failed'] for i in pred]
#             d = pd.DataFrame().from_records(tmp_pred)
#             d['truth_label'] = self.val_dataset['answer']
#             d['step'] = np.full((len(d['truth_label']),), state.global_step)
#             try:
#                 d.to_csv(
#                     './validation_res.csv',
#                     index=False,
#                     mode='a',
#                     header=['pred', 'truth_label', 'step']
#                 )
#             except FileNotFoundError:
#                 d.to_csv('./validation_res.csv', index=False, header=['pred', 'truth_label'])
#         return super().on_step_end(args, state, control, **kwargs)

# class custom_test_callback(TrainerCallback):
#     def __init__(self, data, sampling_params, fallacies):
#         super().__init__()
#         self.test_dataset = data
#         self.sampling_params = sampling_params
#         self.fallacies = fallacies
#     def on_train_end(self, args, state, control, **kwargs):
#         model.save_lora('sft_save_lora')
#         FastLanguageModel.for_inference(model)
#         pred = zero_shot_gen(
#             data=self.test_dataset['text'],
#             model=model,
#             fallacies=self.fallacies,
#             sampling_params=self.sampling_params
#         )
#         tmp_pred = [i if i != [] else ['Failed'] for i in pred]
#         d = pd.DataFrame().from_records(tmp_pred)
#         d['truth_label'] = self.test_dataset['answer']
#         try:
#             d.to_csv('./test_res.csv', index=False, mode='a', header=['pred','truth_label'])
#         except FileNotFoundError:
#             d.to_csv('./test_res.csv', index=False, header=['pred', 'truth_label'])
#         return super().on_train_end(args, state, control, **kwargs)