In [1]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold, KFold
tqdm.pandas()

# Change the working directory to the directory containing the script.
# NOTE(review): this is presumably what makes the local `utils` module importable
# on the next line — confirm; it also changes how any relative path resolves.
os.chdir("/group-volume/binfeng/wsdm/stage_qft")
# Star import: `format_text` and `format_label`, called in the data-preparation
# cells, are not defined anywhere in this notebook, so they presumably come from
# `utils` — TODO confirm and replace with an explicit import list.
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier handed to AutoTokenizer.from_pretrained when the tokenizer is loaded.
MODEL_PATH = "microsoft/phi-4"
# Passed as `max_len` to format_text and as `max_length` (padding/truncation target)
# to the tokenizer call inside tokenizer_func.
MAX_LENGTH = 4000
# Passed as `max_prompt_len` to format_text.
# NOTE(review): presumably a cap on the prompt part of each sample — confirm in utils.
MAX_PROMPT_LENGTH = 800

## Tokenizer

In [3]:
# Load the tokenizer that belongs to MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Pad on the right-hand side and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

# Store the configured tokenizer; the call is kept as the last expression so the
# cell still displays the tuple of written file paths.
TOKENIZER_DIR = "/group-volume/binfeng/wsdm/tokenizer/phi4"
tokenizer.save_pretrained(TOKENIZER_DIR)

('/group-volume/binfeng/wsdm/tokenizer/phi4/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/tokenizer.json')

## Prepare Data

In [4]:
# Load the fine-tuning set and discard every row that contains a missing value.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/ft48k_calibrated.parquet").dropna()

# Build the model input text row by row from the prompt and the two responses.
# NOTE(review): reverse=True presumably swaps the A/B order (the teacher logits are
# swapped in a later cell as well) — confirm against format_text / format_label.
ft["text"] = ft.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
        reverse=True,
    ),
    axis=1,
)

# Derive the target label from the `winner` column with the same reverse flag.
ft["label"] = ft.apply(
    lambda row: format_label(row.winner, reverse=True),
    axis=1,
)


Token indices sequence length is longer than the specified maximum sequence length for this model (18891 > 16384). Running this sequence through the model will result in indexing errors


In [5]:
def switch_logits(x):
    """Return the first two entries of `x` in swapped order, as a new list.

    Args:
        x: an indexable object with at least two entries (positions 0 and 1).

    Returns:
        A two-element list ``[x[1], x[0]]``; any further entries are ignored.
    """
    second = x[1]
    first = x[0]
    return [second, first]
# Swap the two entries of every teacher-logit pair in the fine-tuning set.
# NOTE: the columns are overwritten in place, so running this cell a second time
# swaps the pairs back — run it exactly once after loading `ft`.
for logits_col in ("logits_qwencd_cali", "logits_qwen32_cali", "logits_qwen14_cali"):
    ft[logits_col] = ft[logits_col].apply(switch_logits)

In [7]:
# Load the soft-label set and discard every row that contains a missing value.
soft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/soft87k.parquet").dropna()

# Build the model input text row by row, with the same settings as for `ft`.
soft["text"] = soft.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
        reverse=True,
    ),
    axis=1,
)

# Derive the target label from the `winner` column with the same reverse flag.
soft["label"] = soft.apply(
    lambda row: format_label(row.winner, reverse=True),
    axis=1,
)


In [8]:
# prevent leak: drop every soft row whose prompt also occurs in the fine-tuning set
prompt_in_ft = soft["prompt"].isin(ft["prompt"])
soft = soft.loc[~prompt_in_ft]

In [9]:
# Expose the soft set's teacher logits under the same `*_cali` column names that
# `ft` uses, with the two entries of each pair swapped. The source columns are left
# untouched, so re-running this cell gives the same result.
soft["logits_qwencd_cali"] = soft["logits_qwencd"].apply(switch_logits)
soft["logits_qwen32_cali"] = soft["logits_qwen32"].apply(switch_logits)

# Only the first of the 40 shuffled folds is used: it acts as a single hold-out
# split (1/40 of the rows for validation, the rest for training).
kf = KFold(n_splits=40, shuffle=True, random_state=33)
train_index, val_index = next(iter(kf.split(soft)))
soft_train = soft.iloc[train_index]
soft_val = soft.iloc[val_index]
print(len(soft_train), len(soft_val))

85478 2192


In [10]:
def tokenizer_func(example):
    """Tokenize the "text" field of `example` to a fixed length of MAX_LENGTH.

    Args:
        example: a mapping with a "text" entry; when used with
            ``map(..., batched=True)`` that entry holds a batch of strings.

    Returns:
        Whatever the module-level `tokenizer` returns for that text when asked to
        pad to MAX_LENGTH, truncate longer inputs and produce NumPy tensors.
    """
    encode_options = {
        "padding": 'max_length',
        "max_length": MAX_LENGTH,
        "truncation": True,
        "return_tensors": 'np',
    }
    text = example["text"]
    return tokenizer(text, **encode_options)

# Columns carried into every split: the formatted text, the target label and the
# two teacher-logit columns that exist in both `soft` and `ft`.
DATASET_COLUMNS = ["text", "label", "logits_qwencd_cali", "logits_qwen32_cali"]


def _to_dataset(frame):
    """Convert the DATASET_COLUMNS of `frame` into a `datasets.Dataset`.

    The pandas index is dropped explicitly (`preserve_index=False`) so that no
    implicit `__index_level_0__` column appears in the result; the previous code
    relied on that column being created and then removed it by name, which fails
    whenever the frame's index is not materialised as a column.
    """
    return Dataset.from_pandas(frame[DATASET_COLUMNS], preserve_index=False)


# Soft-label hold-out split.
soft_train_dataset = _to_dataset(soft_train)
soft_val_dataset = _to_dataset(soft_val)
raw_dataset = DatasetDict({
    'soft_train': soft_train_dataset,
    'soft_val': soft_val_dataset
})

# Five fine-tuning folds, stratified on the `language` column. Every fold is stored
# as its own train/val pair, so each `ft` row is tokenized once per fold below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=66)
for i, (train_index, val_index) in enumerate(skf.split(ft, ft["language"])):
    ft_train, ft_val = ft.iloc[train_index], ft.iloc[val_index]
    print(len(ft_train), len(ft_val))
    ft_train_dataset = _to_dataset(ft_train)
    ft_val_dataset = _to_dataset(ft_val)
    raw_dataset[f"ft_train_fold{i}"] = ft_train_dataset
    raw_dataset[f"ft_val_fold{i}"] = ft_val_dataset

# Tokenize all splits, rename the target column to "labels" and drop the raw text,
# which is no longer needed once `input_ids` / `attention_mask` exist.
tokenized_dataset = raw_dataset.map(tokenizer_func, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
tokenized_dataset




38749 9688
38749 9688
38750 9687
38750 9687
38750 9687


Map: 100%|██████████| 85478/85478 [01:14<00:00, 1144.81 examples/s]
Map: 100%|██████████| 2192/2192 [00:01<00:00, 1140.80 examples/s]
Map: 100%|██████████| 38749/38749 [00:40<00:00, 968.00 examples/s] 
Map: 100%|██████████| 9688/9688 [00:10<00:00, 961.82 examples/s] 
Map: 100%|██████████| 38749/38749 [00:40<00:00, 959.02 examples/s] 
Map: 100%|██████████| 9688/9688 [00:10<00:00, 940.85 examples/s] 
Map: 100%|██████████| 38750/38750 [00:39<00:00, 992.12 examples/s] 
Map: 100%|██████████| 9687/9687 [00:09<00:00, 973.14 examples/s] 
Map: 100%|██████████| 38750/38750 [00:40<00:00, 957.31 examples/s] 
Map: 100%|██████████| 9687/9687 [00:10<00:00, 940.60 examples/s] 
Map: 100%|██████████| 38750/38750 [00:42<00:00, 913.84 examples/s] 
Map: 100%|██████████| 9687/9687 [00:10<00:00, 924.95 examples/s] 


DatasetDict({
    soft_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 85478
    })
    soft_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 2192
    })
    ft_train_fold0: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 38749
    })
    ft_val_fold0: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 9688
    })
    ft_train_fold1: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 38749
    })
    ft_val_fold1: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 9688
    })
    ft_

In [12]:
# Sanity check: decode one tokenized validation sample (special tokens included)
# and show its label next to it.
i = 3
sample = tokenized_dataset["soft_val"][i]
decoded_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
print(decoded_text)
print("**label:", sample["labels"])

<|User Prompt|>
i  have this following css and html i want the user-tooltip to be shown on top of everything because when it's shown is only shown inside the scrollable parent  participants

CSS:
.room-users {
  display: flex;
  flex-direction: column;
  height: 100%;
  .participants-area {
    display: flex;
    flex-direction: column;
    height: 100%;
    max-height: 50%;
    .participants {
      overflow: auto;
      display: flex;
      flex-direction: column;
      .participant {
        display: flex;
        align-items: center;
        justify-content: space-between;
        .left-section {
          gap: 5px;
          display: flex;
          align-items: center;
          .avatar {
            width: 46px;
            height: 46px;
            background-size: cover;
            background-position: center;
          }
        }
        .right-section {
          display: flex;
          align-items: center;
          gap: 10px;
          .user-menu {
            position:

In [13]:
# Write every tokenized split (soft train/val and all ft folds) to disk.
TOKENIZED_DATASET_DIR = "/group-volume/binfeng/wsdm/stage_qft/dataset/tokenized_phi4"
tokenized_dataset.save_to_disk(TOKENIZED_DATASET_DIR)

Saving the dataset (4/4 shards): 100%|██████████| 85478/85478 [00:01<00:00, 62208.09 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 2192/2192 [00:00<00:00, 73822.39 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38749/38749 [00:00<00:00, 65884.75 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9688/9688 [00:00<00:00, 80940.54 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38749/38749 [00:00<00:00, 67010.46 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9688/9688 [00:00<00:00, 92402.40 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38750/38750 [00:00<00:00, 69202.53 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9687/9687 [00:00<00:00, 82269.23 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38750/38750 [00:00<00:00, 70427.69 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9687/9687 [00:00<00:00, 80708.24 examples/s]
Saving the dataset (2/2 shards): 10