In [7]:
import os
# Choose a GPU that no one is using.
# Run 'nvidia-smi' in terminal to see GPU status
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

In [8]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
model_name = 'taide/Llama3-TAIDE-LX-8B-Chat-Alpha1'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir='/HDD/model_cache/'
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
    cache_dir='/HDD/model_cache/'
)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
import json
from pathlib import Path
from datasets import Dataset

example_input = json.loads(Path('./labeled_datasets/example.json').read_text())['input']
example_output = json.loads(Path('./labeled_datasets/example.json').read_text())['output']

In [10]:
def get_prompt(text):
    instruction = f'''
    你是一個貼文審查專家，你的任務是判斷給定的貼文是否為詐騙，
    你只能根據輸入內容生成以下格式的單個 JSON：
    {{
        "text": "<貼文內容>",
        "label": "<標記結果 (0 或 1)>",
        "explination": "<標記理由>"
    }}
    除了上述 JSON 結果，請勿生成任何其他內容。
    輸入輸出參考以下範例:
    輸入: {example_input[0]}
    輸出: {example_output[0]}

    輸入: {example_input[1]}
    輸出: {example_output[1]}

    輸入: {example_input[2]}
    輸出: {example_output[2]}

    輸入: {example_input[3]}
    輸出: {example_output[3]}

    輸入: {example_input[4]}
    輸出: {example_output[4]}

    輸入: {example_input[5]}
    輸出: {example_output[5]}

    輸入: {example_input[6]}
    輸出: {example_output[6]}

    輸入: {example_input[7]}
    輸出: {example_output[7]}

    '''
    sys_prompt = f'{instruction}'
    user_prompt = f'輸入: {{"text": "{text}"}}'
    message = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
    ]
    prompt = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        max_new_tokens=128,
        min_new_tokens=64
    )
    return prompt

In [20]:
data: list[dict] = json.loads(Path(TEST_DATA_PATH).read_text(encoding='utf-8'))
data['emotional_datiing']

[{'text': '好像是欸昌董的車是流量密碼thumbsup',
  'category': 'emotional_datiing',
  'label': 1},
 {'text': '玻璃珠戒指創業日記流量密碼', 'category': 'emotional_datiing', 'label': 1},
 {'text': '您好社群密碼1997', 'category': 'emotional_datiing', 'label': 1},
 {'text': '不負責任翻譯womanraisinghand最近與2pm的成員們有碰面嗎hatchingchick啊我最近有點忙womanraisinghand澤演今天有來日本hatchingchick對我知道現在澤演在東京忙碌的行程中問說在哪裡能碰面我因為行程也忙人在橫濱grinningcat啊好遠全場大笑hatchingchick因為澤演grinningcat很忙所以碰不到面其他成員昨晚應該有碰面一起吃飯指IG我因為在這裡橫濱FM對成員們真的很抱歉因為我很忙全場womanraisinghand再次大笑只能說內大家加油Fignting最近成員們是真的很忙碌pandaJunk跟koala尼坤之前跟我一起舉行ENWJbanana燦盛嘛與我是最常見面的太常見面了全場笑翻是說banana都有你家大門密碼常常跑去餵食hatchingchick當然見面次數很多rollingonthefloorlaughing俊昊penguin真的很忙碌希望很快能一起與大家見面謝謝大家真的很溫暖影片來源httpsyoutubeorxMuzX3E7AsiKosjtKeuqwgn9df',
  'category': 'emotional_datiing',
  'label': 0},
 {'text': '信一0723覆機Call台機主密碼信一Call台十二少Call你話',
  'category': 'emotional_datiing',
  'label': 1},
 {'text': '哪裡找這種女友又大又圓的水分又充足密碼1223',
  'category': 'emotional_datiing',
  'label': 1},
 {'text': 'Chanel珍珠繁星項鍊全新chanelchanel項鍊香奈儿香奈兒流量密

In [21]:
import json
from pathlib import Path
TEST_DATA_PATH = ('dataset/test/label_test.json')

def get_test_dataset(path:str)->dict:
    data: list[dict] = json.loads(Path(path).read_text(encoding='utf-8'))
    work_dataset = [{
        "text": datapoint['text'], 
        "label": datapoint['label'],
        "prompt": get_prompt(datapoint['text'])
        } for datapoint in data['work']]
    gambling_datasetm = [{
        "text": datapoint['text'],
        "label": datapoint['label'],
        "prompt": get_prompt(datapoint['text'])
        } for datapoint in data['gamble']]
    dating_dataset = [{
        "text": datapoint['text'], 
        "label": datapoint['label'],
        "prompt": get_prompt(datapoint['text'])
        } for datapoint in data['emotional_datiing']]
    investment_dataset = [{
        "text": datapoint['text'], 
        "label": datapoint['label'],
        "prompt": get_prompt(datapoint['text'])
        } for datapoint in data['investment']]
    return work_dataset, gambling_datasetm, dating_dataset, investment_dataset

(
    work_dataset, 
    gambling_dataset,
    dating_dataset,
    investment_dataset
) = get_test_dataset(TEST_DATA_PATH)

# train_data = json.loads(Path('./dataset/combined_data_unique.json').read_text())
# train_dataset = Dataset.from_list(train_data)
# train_dataset

In [7]:
generator = pipeline(
    model=model, 
    tokenizer=tokenizer,
    task="text-generation",
    device=0,
    batch_size=32,
    max_new_tokens=512, 
    stop_strings="}",
    return_full_text=False,
)

In [16]:
from tqdm import tqdm
import re
from sklearn.metrics import f1_score


total_true_labels = []
total_predictions = []
def get_score(category: str, dataset):
    fail_num = 0 #record of broken output
    for i in tqdm(range(len(dataset))):
        outputs = generator(dataset[i]['prompt'], tokenizer = tokenizer)[0]['generated_text']
        dataset[i]['output'] = outputs
        matches = re.findall('(1|0)', outputs)
        try:
            if matches[0] == '1' or matches[0] == '0':
                dataset[i]['prediction'] = int(matches[0])
            else:
                dataset[i]['prediction'] = None
                fail_num = fail_num + 1
        except:
            dataset[i]['prediction'] = None
            fail_num = fail_num + 1
    true_labels = [item['label'] for item in dataset if item['prediction']]
    predictions = [item['prediction'] for item in dataset if item['prediction']]
    total_true_labels.extend(true_labels)
    total_predictions.extend(predictions)
    f1 = f1_score(y_true=true_labels, y_pred=predictions)
    with open(f'test_result/few_shot_test_result_{category}', 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    print(f'category: {category}, F1 score: {f1}, Failed case num: {fail_num}')
get_score('work', work_dataset)
get_score('gambling', gambling_dataset)
get_score('dating', dating_dataset)
get_score('investment', investment_dataset)
print(f'total F1 {f1_score(y_true=total_true_labels, y_pred=total_predictions)}')


100%|██████████| 33/33 [02:46<00:00,  5.06s/it]


category: work, F1 score: 0.9565217391304348, Failed case num: 2


100%|██████████| 47/47 [04:02<00:00,  5.16s/it]


category: gambling, F1 score: 0.8363636363636363, Failed case num: 2


100%|██████████| 56/56 [03:54<00:00,  4.19s/it]


category: dating, F1 score: 0.9361702127659575, Failed case num: 2


100%|██████████| 48/48 [04:13<00:00,  5.29s/it]

category: investment, F1 score: 0.8636363636363636, Failed case num: 1
total F1 0.8875739644970414





In [26]:
work_dataset

[{'text': '我們這邊有缺呦womanraisinghandlightskintone在家工作輕鬆賺飽飽',
  'label': 1,
  'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n你是一個貼文審查專家，你的任務是判斷給定的貼文是否為詐騙，\n    你只能根據輸入內容生成以下格式的單個 JSON：\n    {\n        "text": "<貼文內容>",\n        "label": "<標記結果 (0 或 1)>",\n        "explination": "<標記理由>"\n    }\n    除了上述 JSON 結果，請勿生成任何其他內容。\n    輸入輸出參考以下範例:\n    輸入: {\'text\': \'我們這邊有缺呦womanraisinghandlightskintone在家工作輕鬆賺飽飽\'}\n    輸出: {\'label\': 1, \'explination\': \'這則貼文利用吸引人的語句，承諾在家輕鬆賺錢，未提供具體職業資訊，常見於詐騙訊息以吸引不知情的受害者。\'}\n\n    輸入: {\'text\': \'Japaneseacceptablebutton不限平台私帶事由百家樂私帶群帶連倒五天噴了兩萬本金5000獲利32000內文放留言\'}\n    輸出: {\'label\': 1, \'explination\': \'貼文內容提及私帶百家樂群組並宣稱高額獲利，這是常見的詐騙模式，誘導用戶加入非法賭博或詐騙群組，沒有可信度。\'}\n\n    輸入: {\'text\': \'人很奇怪有時候很想找人聊天有時候又想安安靜靜的沉浸在自己的世界\'}\n    輸出: {\'label\': 0, \'explination\': \'這則貼文表達的是個人情感與日常心情分享，並無任何誘導性或詐騙特徵，屬於普通的社交貼文。\'}\n\n    輸入: {\'text\': \'能徵友文脆友單身來認識一下桃園新竹都能接受異地我83年次16053但希望有人追蹤我相互瞭解salutingface希望餘生有幸能多多指教要聊天哀居密我回覆喲facewithpeekingeye\'}\

In [7]:
# def prepare_prompt(datapoint):
#     text = datapoint['text']
#     query = f'輸入: {{"text": "{text}"}}'
#     message = [
#         {"role": "system", "content": sys},
#         {"role": "user", "content": query},
#     ]
#     prompt = tokenizer.apply_chat_template(
#         message,
#         tokenize=False,
#         max_new_tokens=128,
#         min_new_tokens=64
#     )
#     if prompt is None:
#         raise ValueError(f"Prompt generation failed for text: {text}")
#     datapoint['prompt'] = prompt
#     return datapoint

In [None]:
# from torch.utils.data import DataLoader
# train_dataset = train_dataset.map(prepare_prompt, batched=False, num_proc=32)
# dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)
# train_dataset

num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.


Map (num_proc=10):   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'category', 'prompt'],
    num_rows: 10
})

In [None]:
# from tqdm import tqdm
# result_dataset = {
#     'text': [],
#     'output': [],
# }

# for batch in tqdm(dataloader):
#     prompts = batch['prompt'] 
#     texts = batch['text']   
    
#     outputs = generator(prompts, tokenizer = tokenizer)
    
#     result_dataset['text'].extend(texts)
#     result_dataset['output'].extend(outputs)

100%|██████████| 1/1 [00:16<00:00, 16.55s/it]


In [None]:
# len(result_dataset['output'])

10

In [None]:
# import pandas as pd
# pd.DataFrame(result_dataset).to_csv('dataset1.csv')

In [None]:
# import pandas as pd
# labeled_data = pd.read_csv('dataset.csv').drop(['Unnamed: 0'], axis=1)
# labeled_data

Unnamed: 0,text,output
0,流量密碼,"[{'generated_text': ""輸出：{'label': 0, 'explinat..."
1,19歲知道男友手機密碼很有安全感29歲戶頭有200萬很有安全感,"[{'generated_text': '輸出：\n{\n""label"": 1,\n""exp..."
2,好像是欸昌董的車是流量密碼thumbsup,[{'generated_text': '我這邊就是把輸入的JSON格式的字典，對輸入的「t...
3,例如什麼讓我側耳傾聽redheart流量密碼在哪裡,"[{'generated_text': ""輸出：{'label': 1, 'explinat..."
4,查崗手機拿來洛燁漫不經心地將手機放在他手心上密碼我生日算了我挺相信你的說罷便將手機還給了他你...,"[{'generated_text': ""輸出：\n{'label': 0, 'explin..."
...,...,...
15842,好好工作賺錢吧,[{'generated_text': '這個輸入沒有形成一個有效的問題，因此無法根據該輸入...
15843,國一大安高工or長庚護專回頭看覺得國一志願設這兩個超不適合自己國二大安高工師大附中許願讀的高...,"[{'generated_text': '輸出：\n{\n""label"": 0,\n""exp..."
15844,中粉都不用賺錢不用上班嗎,"[{'generated_text': ""輸出: {'label': 1, 'explina..."
15845,原來可是我也疑惑嫌犯偷到手機怎麼沒登入哈哈,"[{'generated_text': ""輸出：{'label': 0, 'explinat..."


In [None]:
# import re
# def get_label(output):
#     label_pattern = r"(label.*(1|0).*,)"
#     match = re.search(label_pattern, output)
#     return int(match.group(2))
# def get_explination(output):
#     label_pattern = r"(explination.*:([^}\"]*))"
#     match = re.search(label_pattern, output)
#     return match.group(2).strip().replace("'",'')

In [None]:
# import json
# import re
# success_extract = []
# failed_extract = []
# for text, output in zip(labeled_data['text'], labeled_data['output']):
#     try:
#         label = get_label(output)
#         explination = get_explination(output)
#         success_extract.append({
#             "text": text,
#             "label": label,
#             "explination": explination
#         })
#     except:
#         failed_extract.append({
#             "text": text,
#             "output": output
#         })

In [92]:
# from pathlib import Path
# output_path = Path('./dataset/training.json')
# with output_path.open('w', encoding='utf-8') as f:
#     json.dump(success_extract, f, ensure_ascii=False, indent=4)