In [1]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold, KFold
# Register tqdm's pandas integration (adds `progress_apply` to DataFrame/Series).
tqdm.pandas()

# Change the working directory to the directory containing the project-local
# `utils` module so that the import below resolves.
# NOTE: this also changes how any relative path used later in the notebook resolves.
os.chdir("/group-volume/binfeng/wsdm/stage_distill")
# Import the helpers by name rather than with `import *`: it documents where
# `format_text` / `format_label` come from and prevents `utils` from silently
# shadowing names imported above.
from utils import format_label, format_text


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier passed to AutoTokenizer.from_pretrained to load the tokenizer.
MODEL_PATH = "Qwen/Qwen2.5-14B-Instruct"
# Length budget used twice: forwarded to format_text as `max_len`, and used as the
# tokenizer's `max_length` (every sequence is padded/truncated to this many tokens).
MAX_LENGTH = 2000
# Forwarded to format_text as `max_prompt_len` — presumably the cap on the prompt
# portion of the formatted text; confirm the unit against utils.format_text.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Directory the configured tokenizer is written to.
TOKENIZER_DIR = "/group-volume/binfeng/wsdm/tokenizer/qwen14b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Pad on the right and reuse the end-of-sequence token as the padding token.
# NOTE(review): this overrides whatever pad token the checkpoint ships with —
# confirm that downstream code does not need to tell padding apart from a real EOS.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# Last expression of the cell: the tuple of written file paths is the cell output.
tokenizer.save_pretrained(TOKENIZER_DIR)

('/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer.json')

## Prepare Data

In [4]:
# Load the ft set and discard every row that has a missing value in any column.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_final/data/ft48k_calibrated.parquet").dropna()

# Build the model input for each row from the prompt and the two responses.
ft["text"] = [
    format_text(tokenizer, prompt, response_a, response_b,
                max_len=MAX_LENGTH, max_prompt_len=MAX_PROMPT_LENGTH)
    for prompt, response_a, response_b in zip(ft["prompt"], ft["response_a"], ft["response_b"])
]
# Convert the recorded winner into the training label.
ft["label"] = ft["winner"].map(format_label)

# Only the first of the 100 language-stratified folds is used, which holds out
# roughly 1% of the rows for validation.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=666)
train_index, val_index = next(iter(skf.split(ft, ft["language"])))
ft_train, ft_val = ft.iloc[train_index], ft.iloc[val_index]
print(len(ft_train), len(ft_val))


47952 485




In [5]:
# Load the ppt set and discard every row that has a missing value in any column.
ppt = pd.read_parquet("/group-volume/binfeng/wsdm/stage_final/data/ppt196k_calibrated.parquet").dropna()

# Build the model input for each row from the prompt and the two responses.
ppt["text"] = [
    format_text(tokenizer, prompt, response_a, response_b,
                max_len=MAX_LENGTH, max_prompt_len=MAX_PROMPT_LENGTH)
    for prompt, response_a, response_b in zip(ppt["prompt"], ppt["response_a"], ppt["response_b"])
]
# Convert the recorded winner into the training label.
ppt["label"] = ppt["winner"].map(format_label)

# Only the first of the 100 shuffled folds is used, which holds out roughly 1%
# of the rows for validation (plain KFold here: no stratification column).
kf = KFold(n_splits=100, shuffle=True, random_state=10)
train_index, val_index = next(iter(kf.split(ppt)))
ppt_train, ppt_val = ppt.iloc[train_index], ppt.iloc[val_index]
print(len(ppt_train), len(ppt_val))


194826 1968


## Build and tokenize datasets

In [6]:
def tokenizer_func(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Written for ``Dataset.map(..., batched=True)``, where ``example["text"]`` is a
    list of strings. Every sequence is truncated and padded to exactly
    ``MAX_LENGTH`` tokens, and the encoded fields come back as NumPy arrays.
    """
    encode_options = {
        "padding": 'max_length',
        "max_length": MAX_LENGTH,
        "truncation": True,
        "return_tensors": 'np',
    }
    return tokenizer(example["text"], **encode_options)


# Columns carried from the pandas frames into the Hugging Face datasets: the
# formatted input text, the label, and the two calibrated logits columns.
DATASET_COLUMNS = ["text", "label", "logits_qwencd_cali", "logits_qwen32_cali"]


def to_hf_dataset(frame):
    """Convert one pandas split into a ``datasets.Dataset`` holding ``DATASET_COLUMNS``.

    ``preserve_index=False`` keeps the pandas index out of the dataset. Without it,
    a non-default index is carried over as an ``__index_level_0__`` column that must
    later be removed by name, and that removal raises if the frame happens to have
    a default RangeIndex (no such column is created in that case).
    """
    return Dataset.from_pandas(frame[DATASET_COLUMNS], preserve_index=False)


ppt_train_dataset = to_hf_dataset(ppt_train)
ppt_val_dataset = to_hf_dataset(ppt_val)
ft_train_dataset = to_hf_dataset(ft_train)
ft_val_dataset = to_hf_dataset(ft_val)
raw_dataset = DatasetDict({
    'ppt_train': ppt_train_dataset,
    'ppt_val': ppt_val_dataset,
    'ft_train': ft_train_dataset,
    'ft_val': ft_val_dataset,
})

# Tokenize every split in batches, rename the label column to "labels", and drop
# the raw text now that it has been encoded.
tokenized_dataset = raw_dataset.map(tokenizer_func, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
tokenized_dataset


Map: 100%|██████████| 194826/194826 [02:02<00:00, 1587.99 examples/s]
Map: 100%|██████████| 1968/1968 [00:01<00:00, 1692.45 examples/s]
Map: 100%|██████████| 47952/47952 [00:32<00:00, 1459.06 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1446.77 examples/s]


DatasetDict({
    ppt_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 194826
    })
    ppt_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 1968
    })
    ft_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    ft_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [9]:
# Sanity check: decode one validation example back to text (special tokens kept
# so padding/EOS stay visible) and show its label.
i = 0
sample = tokenized_dataset["ft_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

<|User Prompt|>
I want to add a graph of the lowest price by time of a item by tier.
I want to add a graph of the quantities distribution over time.
Add more necessary graphs you think could enhance this dashboard.

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Eclesiar Market Visualization</title>
    <!-- Include Chart.js from CDN -->
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #charts { display: flex; flex-direction: column; gap: 40px; }
        .chart-container { width: 100%; }
        canvas { max-width: 100%; }
        .hidden { display: none; }
    </style>
</head>
<body>
    <h1>Eclesiar Market Visualization</h1>
    <div id="controls">
        <!-- Add file upload input here -->
        <label for="fileInput">Upload Market Log File:</label>
        <input type="file" id="fileInput" accept=".j

In [8]:
# Persist all four tokenized splits to disk.
TOKENIZED_DATASET_DIR = "/group-volume/binfeng/wsdm/data/tokenized_qwen14b_final"
tokenized_dataset.save_to_disk(TOKENIZED_DATASET_DIR)

Saving the dataset (1/4 shards):  25%|██▌       | 48707/194826 [00:00<00:00, 233150.56 examples/s]

Saving the dataset (4/4 shards): 100%|██████████| 194826/194826 [00:01<00:00, 102210.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1968/1968 [00:00<00:00, 99343.96 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 47952/47952 [00:00<00:00, 137040.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 48506.99 examples/s]
