In [2]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold, KFold
tqdm.pandas()

# Switch the process working directory to the project folder before importing `utils`.
# NOTE(review): `utils` is presumably a local module inside this folder, which is why the
# chdir comes first — confirm.
os.chdir("/group-volume/binfeng/wsdm/stage_qft")
# Wildcard import: `format_text` and `format_label`, used by the data-preparation cells,
# are not defined anywhere in this notebook and presumably come from here — verify in utils.
from utils import *


In [2]:
# Length limits. MAX_LENGTH is handed to `format_text` as `max_len` and to the tokenizer
# as `max_length`; MAX_PROMPT_LENGTH is handed to `format_text` as `max_prompt_len`.
MAX_PROMPT_LENGTH = 400
MAX_LENGTH = 4_000

# Checkpoint identifier passed to `AutoTokenizer.from_pretrained`.
MODEL_PATH = "Qwen/Qwen2.5-32B-Instruct"

## Tokenizer

In [3]:
# Load the tokenizer belonging to the MODEL_PATH checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Padding is appended on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
# Persist the configured tokenizer; because this is the cell's last expression, the
# tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained("/group-volume/binfeng/wsdm/tokenizer/qwen32b")

('/group-volume/binfeng/wsdm/tokenizer/qwen32b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/tokenizer.json')

## Prepare Data

In [5]:
# Load the fine-tuning pairs and discard every row that contains a missing value.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/ft48k_calibrated.parquet").dropna()

# Render each (prompt, response_a, response_b) triple into a single input string,
# bounded by MAX_LENGTH overall and MAX_PROMPT_LENGTH for the prompt part.
ft["text"] = [
    format_text(tokenizer, prompt, response_a, response_b,
                max_len=MAX_LENGTH, max_prompt_len=MAX_PROMPT_LENGTH)
    for prompt, response_a, response_b in zip(ft["prompt"], ft["response_a"], ft["response_b"])
]

# Derive the training label from the `winner` column via `format_label`.
ft["label"] = [format_label(winner) for winner in ft["winner"]]


In [4]:
# Load the soft-labelled pairs and discard every row that contains a missing value.
soft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/soft87k.parquet").dropna()

# Build the model input string for each row from its prompt and the two responses,
# using the same length limits as the fine-tuning set.
soft["text"] = [
    format_text(tokenizer, prompt, response_a, response_b,
                max_len=MAX_LENGTH, max_prompt_len=MAX_PROMPT_LENGTH)
    for prompt, response_a, response_b in zip(soft["prompt"], soft["response_a"], soft["response_b"])
]

# Derive the training label from the `winner` column via `format_label`.
soft["label"] = [format_label(winner) for winner in soft["winner"]]


In [6]:
# Prevent leakage: remove every soft-labelled row whose prompt also occurs in `ft`.
# Matching is by exact prompt equality.
prompt_in_ft = soft["prompt"].isin(ft["prompt"])
soft = soft[~prompt_in_ft]

In [7]:
# The `*_cali` columns of `soft` are plain copies of its raw logits columns, so `soft`
# exposes the same column names that are later selected from `ft`.
soft["logits_qwencd_cali"] = soft["logits_qwencd"]
soft["logits_qwen32_cali"] = soft["logits_qwen32"]

# Only the first of the 40 shuffled splits is used: one fold becomes the validation
# set and the remaining 39 folds form the training set.
kf = KFold(n_splits=40, shuffle=True, random_state=66)
train_index, val_index = next(kf.split(soft))
soft_train = soft.iloc[train_index]
soft_val = soft.iloc[val_index]
print(len(soft_train), len(soft_val))

85478 2192


## Build and Tokenize Datasets

In [8]:
def tokenizer_func(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens, and the
    encodings are returned as NumPy arrays. The function is passed to
    ``DatasetDict.map(..., batched=True)`` later in this notebook.

    Args:
        example: mapping with a ``"text"`` entry to encode.

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    encode_options = {
        "padding": 'max_length',
        "max_length": MAX_LENGTH,
        "truncation": True,
        "return_tensors": 'np',
    }
    return tokenizer(example["text"], **encode_options)

# Columns carried from the pandas frames into every dataset split.
dataset_columns = ["text", "label", "logits_qwencd_cali", "logits_qwen32_cali"]

# Soft-label train/validation splits.
soft_train_dataset = Dataset.from_pandas(soft_train[dataset_columns])
soft_val_dataset = Dataset.from_pandas(soft_val[dataset_columns])
raw_dataset = DatasetDict({
    'soft_train': soft_train_dataset,
    'soft_val': soft_val_dataset,
})

# Five folds over the fine-tuning set, stratified by the `language` column; each fold
# contributes its own train and validation split to the dataset dict.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=66)
fold_splits = skf.split(ft, ft["language"])
for i, (train_index, val_index) in enumerate(fold_splits):
    ft_train = ft.iloc[train_index]
    ft_val = ft.iloc[val_index]
    print(len(ft_train), len(ft_val))
    ft_train_dataset = Dataset.from_pandas(ft_train[dataset_columns])
    ft_val_dataset = Dataset.from_pandas(ft_val[dataset_columns])
    raw_dataset[f"ft_train_fold{i}"] = ft_train_dataset
    raw_dataset[f"ft_val_fold{i}"] = ft_val_dataset

# Tokenize all splits, rename `label` to `labels`, then drop the raw text and the
# pandas index column that `Dataset.from_pandas` carried over.
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__', 'text'])
)
tokenized_dataset




38749 9688
38749 9688
38750 9687
38750 9687
38750 9687


Map: 100%|██████████| 85478/85478 [01:20<00:00, 1056.27 examples/s]
Map: 100%|██████████| 2192/2192 [00:02<00:00, 1046.07 examples/s]
Map: 100%|██████████| 38749/38749 [00:44<00:00, 867.02 examples/s] 
Map: 100%|██████████| 9688/9688 [00:11<00:00, 849.23 examples/s]
Map: 100%|██████████| 38749/38749 [00:46<00:00, 839.71 examples/s] 
Map: 100%|██████████| 9688/9688 [00:11<00:00, 853.10 examples/s] 
Map: 100%|██████████| 38750/38750 [00:43<00:00, 881.34 examples/s] 
Map: 100%|██████████| 9687/9687 [00:10<00:00, 890.85 examples/s] 
Map: 100%|██████████| 38750/38750 [00:44<00:00, 869.02 examples/s] 
Map: 100%|██████████| 9687/9687 [00:10<00:00, 886.07 examples/s] 
Map: 100%|██████████| 38750/38750 [00:42<00:00, 901.41 examples/s] 
Map: 100%|██████████| 9687/9687 [00:10<00:00, 894.51 examples/s] 


DatasetDict({
    soft_train: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 85478
    })
    soft_val: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 2192
    })
    ft_train_fold0: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 38749
    })
    ft_val_fold0: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 9688
    })
    ft_train_fold1: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 38749
    })
    ft_val_fold1: Dataset({
        features: ['labels', 'logits_qwencd_cali', 'logits_qwen32_cali', 'input_ids', 'attention_mask'],
        num_rows: 9688
    })
    ft_

In [9]:
# Sanity check: decode one tokenized validation example (special tokens kept visible)
# and print its label next to it.
i = 3
example = tokenized_dataset["soft_val"][i]
print(tokenizer.decode(example["input_ids"], skip_special_tokens=False))
print("**label:", example["labels"])

<|User Prompt|>
in the following WHILE of an app that is a python chatbot made with langchain, I have the problem that when selecting options 5 and 6, the system does not respond when I enter the text #NAME_1, which is the call to chat again and set the user as active.
I need to solve why #NAME_1 does not work when I choose options 5 and 6.

while True:
    id_conversacion = input("Ingresa tu ID de usuario: ").strip()

    # Add the user to the list of active users
    if id_conversacion in usuarios["inactivos"]:
        usuarios["inactivos"].pop(id_conversacion, None)
    usuarios["activos"].append(id_conversacion)
    guardar_usuarios(usuarios)

    if id_conversacion not in history:
        history[id_conversacion] = []
        break
    print("Bienvenido NAME_2!")

while True:
    print()
    opcion = obtener_opcion()

    if opcion == "#NAME_1":
        chatbot_pausado[id_conversacion] = not chatbot_pausado[id_conversacion]
        if chatbot_pausado[id_conversacion]:
            

In [10]:
# Persist every tokenized split (soft train/val plus all fine-tuning folds) to disk.
tokenized_output_dir = "/group-volume/binfeng/wsdm/stage_qft/dataset/tokenized_qwen32b_final"
tokenized_dataset.save_to_disk(tokenized_output_dir)

Saving the dataset (1/4 shards):  30%|██▉       | 25370/85478 [00:00<00:00, 74380.05 examples/s] 

Saving the dataset (4/4 shards): 100%|██████████| 85478/85478 [00:01<00:00, 56604.68 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2192/2192 [00:00<00:00, 56741.70 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38749/38749 [00:00<00:00, 62896.66 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9688/9688 [00:00<00:00, 71856.50 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38749/38749 [00:00<00:00, 62284.18 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9688/9688 [00:00<00:00, 80544.26 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38750/38750 [00:00<00:00, 65805.03 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9687/9687 [00:00<00:00, 76349.21 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 38750/38750 [00:00<00:00, 64180.24 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 9687/9687 [00:00<00:00, 73983.97 examples/s]
Saving the dataset (2/2 shards): 100

In [3]:
# Download the files of the Qwen2.5-32B-Instruct repository into a local folder.
# As the cell's last expression, the returned local path is displayed as output.
# NOTE(review): this import sits outside the notebook's top import cell; the execution
# counts suggest this cell was run in a separate session — confirm before moving it.
from huggingface_hub import snapshot_download
snapshot_download(
  repo_id = "Qwen/Qwen2.5-32B-Instruct",
  local_dir = "/group-volume/binfeng/hf_models/Qwen2.5-32B-Instruct",
)

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

Fetching 27 files: 100%|██████████| 27/27 [08:32<00:00, 18.99s/it]


'/group-volume/binfeng/hf_models/Qwen2.5-32B-Instruct'

In [4]:
# Download only the files matching the UD-IQ1_S pattern from the DeepSeek-R1 GGUF
# repository; the returned local path is displayed as the cell output.
# Relies on `snapshot_download` having been imported by the preceding cell.
snapshot_download(
  repo_id = "unsloth/DeepSeek-R1-GGUF",
  local_dir = "/group-volume/binfeng/hf_models/DeepSeek-R1-GGUF",
  allow_patterns = ["*UD-IQ1_S*"], # Select quant type UD-IQ1_S for 1.58bit
)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Fetching 3 files: 100%|██████████| 3/3 [13:34<00:00, 271.59s/it]


'/group-volume/binfeng/hf_models/DeepSeek-R1-GGUF'