In [1]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold, KFold
tqdm.pandas()

# Change the working directory to the stage_distill project directory before
# importing the local `utils` module (presumably so that `utils` resolves from
# the current directory — confirm against the kernel's sys.path).
os.chdir("/group-volume/binfeng/wsdm/stage_distill")
# NOTE(review): the star import hides where names come from. `format_text` and
# `format_label`, used in the data-preparation cells below, are not defined or
# imported anywhere else in this notebook, so they presumably come from `utils`.
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier passed to `AutoTokenizer.from_pretrained` below.
MODEL_PATH = "Qwen/Qwen2.5-Coder-14B-Instruct"
# Length cap used in two places: as `max_len` for `format_text`, and as the
# tokenizer's `max_length` (every example is padded/truncated to this many tokens).
MAX_LENGTH = 2000
# Passed to `format_text` as `max_prompt_len`.
# NOTE(review): presumably the cap on the prompt part of the text — confirm in utils.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Load the tokenizer for MODEL_PATH, configure right-side padding that reuses
# the EOS token as the pad token, and write the configured tokenizer to disk.
tokenizer_save_dir = "/group-volume/binfeng/wsdm/tokenizer/qwencd14b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# Kept as the last expression so the cell displays the list of written files.
tokenizer.save_pretrained(tokenizer_save_dir)

('/group-volume/binfeng/wsdm/tokenizer/qwencd14b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwencd14b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwencd14b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwencd14b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwencd14b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwencd14b/tokenizer.json')

## Prepare Data

In [5]:
# Load the fine-tuning set and drop every row that has a missing value in any column.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_final/data/ft48k_calibrated.parquet")
ft.dropna(inplace=True)


def _ft_row_to_text(row):
    """Build the model input text for one row from its prompt and two responses."""
    return format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    )


ft["text"] = ft.apply(_ft_row_to_text, axis=1)
ft["label"] = ft.apply(lambda row: format_label(row.winner), axis=1)

# Only the first of the 100 stratified folds is used: one fold (about 1% of the
# rows) becomes the validation set, stratified on the "language" column.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=66)
train_index, val_index = next(skf.split(ft, ft["language"]))
ft_train = ft.iloc[train_index]
ft_val = ft.iloc[val_index]
print(len(ft_train), len(ft_val))


47952 485




In [6]:
# Load the larger "ppt" set and drop every row that has a missing value in any column.
ppt = pd.read_parquet("/group-volume/binfeng/wsdm/stage_final/data/ppt196k_calibrated.parquet")
ppt.dropna(inplace=True)


def _ppt_row_to_text(row):
    """Build the model input text for one row from its prompt and two responses."""
    return format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    )


ppt["text"] = ppt.apply(_ppt_row_to_text, axis=1)
ppt["label"] = ppt.apply(lambda row: format_label(row.winner), axis=1)

# Only the first of the 100 shuffled folds is used: one fold (about 1% of the
# rows) becomes the validation set. Unlike the fine-tuning split, this one is
# not stratified.
kf = KFold(n_splits=100, shuffle=True, random_state=10)
train_index, val_index = next(kf.split(ppt))
ppt_train = ppt.iloc[train_index]
ppt_val = ppt.iloc[val_index]
print(len(ppt_train), len(ppt_val))


194826 1968


## Tokenize and Build Datasets

In [7]:
def tokenizer_func(example):
    """Tokenize the ``text`` field of an example (or batch of examples).

    Each sequence is padded or truncated to exactly ``MAX_LENGTH`` tokens and
    the encoding is returned as NumPy arrays. It is applied through
    ``DatasetDict.map(..., batched=True)``, so ``example["text"]`` is a batch
    of strings there.
    """
    encode_options = dict(
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors='np',
    )
    return tokenizer(example["text"], **encode_options)


# Columns carried from the pandas frames into the Hugging Face datasets.
# NOTE(review): the two `logits_*_cali` columns are presumably the teacher
# logits used for distillation — confirm against the training script.
dataset_columns = ["text", "label", "logits_qwencd_cali", "logits_qwen32_cali"]

# Convert the four splits in a fixed order: ppt train/val, then ft train/val.
ppt_train_dataset, ppt_val_dataset, ft_train_dataset, ft_val_dataset = (
    Dataset.from_pandas(split_frame[dataset_columns])
    for split_frame in (ppt_train, ppt_val, ft_train, ft_val)
)

raw_dataset = DatasetDict({
    'ppt_train': ppt_train_dataset,
    'ppt_val': ppt_val_dataset,
    'ft_train': ft_train_dataset,
    'ft_val': ft_val_dataset,
})

# Tokenize in batches, rename the target column to "labels", and drop the raw
# text together with the pandas index column that `from_pandas` carried over.
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__', 'text'])
)
tokenized_dataset


Map: 100%|██████████| 194826/194826 [02:08<00:00, 1511.91 examples/s]
Map: 100%|██████████| 1968/1968 [00:01<00:00, 1550.72 examples/s]
Map: 100%|██████████| 47952/47952 [00:34<00:00, 1391.04 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1205.27 examples/s]


DatasetDict({
    ppt_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 194826
    })
    ppt_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 1968
    })
    ft_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    ft_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [8]:
# Sanity check: decode one tokenized validation example back to text
# (special tokens included) and show its label.
i = 0
sample = tokenized_dataset["ft_val"][i]
decoded_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
print(decoded_text)
print("**label:", sample["labels"])

<|User Prompt|>
/* eslint-disable @typescript-eslint/no-unused-vars */
/* eslint-disable @typescript-eslint/no-explicit-any */
import { useEffect, useState } from "react";
import { FaCheck } from "react-icons/fa6";
import { useParams } from "react-router-dom";
import Modal from "~/components/Modal/Modal";
import useGet from "~/hooks/useGet";
import { usePatch } from "~/hooks/usePost";
import Categories from "~/models/Categories";
import CategoryAttribute from "~/models/CategoryAttribute";
import Manufacturer from "~/models/Manufacturer";
import Products from "~/models/Products";
import ValueAttribute from "~/models/ValueAttribute";

interface Attribute {
    attributeId: number; // The ID of the attribute
    value: string;       // The corresponding value for this attribute
}

interface FormData {
    name: string;
    price: number;
    discount: number;
    stock: number;
    hot: number;
    visibility: boolean;
    image: File | string | null;
    attributes: Attribute[];
}

expor

In [9]:
# Persist all four tokenized splits (ppt_train, ppt_val, ft_train, ft_val) to disk.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_qwencd14b_final")

Saving the dataset (1/4 shards):  29%|██▊       | 55707/194826 [00:00<00:01, 130302.01 examples/s]

Saving the dataset (4/4 shards): 100%|██████████| 194826/194826 [00:01<00:00, 100477.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1968/1968 [00:00<00:00, 114220.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 47952/47952 [00:00<00:00, 112241.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 40593.01 examples/s]
