In [1]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold, KFold
tqdm.pandas()

# Switch the working directory to the stage_qft project folder before importing
# `utils` (presumably so the local module resolves from the CWD — confirm).
os.chdir("/group-volume/binfeng/wsdm/stage_qft")
# NOTE(review): the star import hides which names `utils` provides. `format_text`
# and `format_label`, used later in this notebook, are not defined in any visible
# cell, so they presumably come from here — consider importing them explicitly.
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier handed to AutoTokenizer.from_pretrained.
MODEL_PATH = "Qwen/Qwen2.5-14B-Instruct"

# Length budgets: MAX_LENGTH is the tokenizer pad/truncate length and the
# `max_len` given to format_text; MAX_PROMPT_LENGTH is its `max_prompt_len`.
MAX_LENGTH, MAX_PROMPT_LENGTH = 4_000, 400

## Tokenizer

In [3]:
# Where the configured tokenizer files are written.
TOKENIZER_DIR = "/group-volume/binfeng/wsdm/tokenizer/qwen14b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Pad on the right, reusing the EOS token as the padding token.
# NOTE(review): with pad == EOS, padding cannot be told apart from a genuine EOS
# by token id alone — confirm the downstream model/collator does not rely on that.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(TOKENIZER_DIR)

('/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer.json')

## Prepare Data

In [4]:
# Load the calibrated fine-tuning table and discard every row that contains a
# missing value in any column.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/ftre6k_calibrated.parquet").dropna()


def _row_to_text(row):
    """Build the model input string for one row via `format_text` (reverse=True)."""
    return format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
        reverse=True,
    )


def _row_to_label(row):
    """Derive the training label for one row via `format_label` (reverse=True)."""
    return format_label(row.winner, reverse=True)


# Both columns are computed row by row (axis=1) from the filtered frame.
ft["text"] = ft.apply(_row_to_text, axis=1)
ft["label"] = ft.apply(_row_to_label, axis=1)


In [5]:
# Columns carried into the Hugging Face datasets: the formatted text, its label,
# and the two calibrated logit columns.
DATASET_COLUMNS = ["text", "label", "logits_qwencd_cali", "logits_qwen32_cali"]

# Split the rows into 40 folds stratified on the `language` column and keep only
# the first fold as validation; the remaining 39 folds form the training set.
# `next()` replaces the previous for-loop that broke after its first iteration.
skf = StratifiedKFold(n_splits=40, shuffle=True, random_state=66)
train_index, val_index = next(skf.split(ft, ft["language"]))

ft_train, ft_val = ft.iloc[train_index], ft.iloc[val_index]
print(len(ft_train), len(ft_val))

ft_train_dataset = Dataset.from_pandas(ft_train[DATASET_COLUMNS])
ft_val_dataset = Dataset.from_pandas(ft_val[DATASET_COLUMNS])

raw_dataset = DatasetDict({
    "ft_train": ft_train_dataset,
    "ft_val": ft_val_dataset,
})




6407 165


In [6]:
def tokenizer_func(example):
    """Tokenize the ``text`` field of an example (or batch of examples).

    Sequences are truncated and padded to exactly ``MAX_LENGTH`` tokens, and
    the encodings are requested as NumPy arrays.

    Args:
        example: Mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encode_options = {
        "padding": 'max_length',
        "max_length": MAX_LENGTH,
        "truncation": True,
        "return_tensors": 'np',
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize both splits in batches, rename `label` to `labels`, and drop the
# columns that are no longer needed (the raw text and the pandas index column
# added by Dataset.from_pandas).
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__', 'text'])
)
tokenized_dataset


Map:   0%|          | 0/6407 [00:00<?, ? examples/s]

Map: 100%|██████████| 6407/6407 [01:03<00:00, 100.28 examples/s]
Map: 100%|██████████| 165/165 [00:01<00:00, 103.11 examples/s]


DatasetDict({
    ft_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 6407
    })
    ft_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 165
    })
})

In [8]:
# Sanity check: decode one validation example back to text (special tokens
# kept, so padding is visible) and show its label.
i = 3
sample = tokenized_dataset["ft_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

<|User Prompt|>
document.addEventListener('DOMContentLoaded', () => {
    const authSection = document.getElementById('authSection');
    const timetableSection = document.getElementById('timetableSection');
    const authForm = document.getElementById('authForm');
    const timetableForm = document.getElementById('timetableForm');
    const logoutButton = document.getElementById('logoutButton');
    const toggleAuthMode = document.getElementById('toggleAuthMode');
    const rememberMeCheckbox = document.getElementById('rememberMe');
    const morningTimetableBody = document.getElementById('morningTimetableBody');
    const afternoonTimetableBody = document.getElementById('afternoonTimetableBody');

    let isLoginMode = true;
    let tasks = JSON.parse(localStorage.getItem('tasks')) || [];

    // Tải các nhiệm vụ hiện có từ localStorage
    tasks.forEach(task => addTaskToTable(task));

    // Hiển thị hoặc ẩn các phần tử dựa trên trạng thái đăng nhập
    if (localStorage.getItem('log

In [12]:
# NOTE(review): this cell repeats the earlier validation-sample inspection and
# was executed out of order (In [12] sits before In [9]); consider removing it
# so the notebook survives Restart & Run All with sequential counts.
i = 3
val_example = tokenized_dataset["ft_val"][i]
decoded_text = tokenizer.decode(val_example["input_ids"], skip_special_tokens=False)
print(decoded_text)
print("**label:", val_example["labels"])

<|User Prompt|>
document.addEventListener('DOMContentLoaded', () => {
    const authSection = document.getElementById('authSection');
    const timetableSection = document.getElementById('timetableSection');
    const authForm = document.getElementById('authForm');
    const timetableForm = document.getElementById('timetableForm');
    const logoutButton = document.getElementById('logoutButton');
    const toggleAuthMode = document.getElementById('toggleAuthMode');
    const rememberMeCheckbox = document.getElementById('rememberMe');
    const morningTimetableBody = document.getElementById('morningTimetableBody');
    const afternoonTimetableBody = document.getElementById('afternoonTimetableBody');

    let isLoginMode = true;
    let tasks = JSON.parse(localStorage.getItem('tasks')) || [];

    // Tải các nhiệm vụ hiện có từ localStorage
    tasks.forEach(task => addTaskToTable(task));

    // Hiển thị hoặc ẩn các phần tử dựa trên trạng thái đăng nhập
    if (localStorage.getItem('log

In [9]:
# Persist the tokenized DatasetDict (train and validation splits) to disk.
TOKENIZED_DATASET_DIR = "/group-volume/binfeng/wsdm/stage_qft/dataset/tokenized_qwen14b_re"
tokenized_dataset.save_to_disk(TOKENIZED_DATASET_DIR)

Saving the dataset (0/1 shards):   0%|          | 0/6407 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 6407/6407 [00:01<00:00, 4567.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 165/165 [00:00<00:00, 1687.78 examples/s]
