In [1]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
# Register tqdm's pandas integration (this is what makes `progress_apply`
# available on DataFrames/Series).
tqdm.pandas()

# Change the working directory to the stage directory of the project.
# NOTE(review): the later `from utils import *` cell presumably resolves
# `utils` relative to this directory, so this call has to run first — confirm.
os.chdir("/group-volume/binfeng/wsdm/stage_distill")


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [2]:
# Hugging Face model id handed to `AutoTokenizer.from_pretrained`.
MODEL_PATH = "Qwen/Qwen2.5-14B-Instruct"
# Passed to `format_text` as `max_len` and to the tokenizer as `max_length`,
# i.e. the token length every example is padded/truncated to.
MAX_LENGTH = 2000
# Passed to `format_text` as `max_prompt_len`.
# NOTE(review): presumably a cap on the prompt portion only — confirm in utils.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Load the tokenizer published with the base model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

# Persist the configured tokenizer; the tuple of written file paths returned
# by this call is what the cell displays.
tokenizer.save_pretrained("/group-volume/binfeng/wsdm/tokenizer/qwen14b")

('/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen14b/tokenizer.json')

## Prepare Data

In [4]:
from utils import *
from sklearn.model_selection import StratifiedKFold, KFold

# Fine-tuning set: load it, then discard every row that has a missing value.
ft = pd.read_parquet("/group-volume/binfeng/wsdm/stage_distill/datap/ft48k.parquet")
ft.dropna(inplace=True)

# Render each (prompt, response A, response B) row into a single input string
# and turn the recorded winner into the training label.
ft["text"] = ft.apply(
    lambda row: format_text(
        tokenizer,
        row["prompt"],
        row["response_a"],
        row["response_b"],
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    ),
    axis=1,
)
ft["label"] = ft.apply(lambda row: format_label(row["winner"]), axis=1)

# Only the first of the 100 language-stratified folds is consumed, so one
# fold's worth of rows is held out for validation.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=10)
train_index, val_index = next(iter(skf.split(ft, ft["language"])))
ft_train = ft.iloc[train_index]
ft_val = ft.iloc[val_index]
print(len(ft_train), len(ft_val))


47952 485




In [5]:
# Larger 135k set: load it, then discard every row that has a missing value.
ppt = pd.read_parquet("/group-volume/binfeng/wsdm/stage_distill/datap/ppt135k.parquet")
ppt.dropna(inplace=True)

# Same formatting as the fine-tuning set: one input string per row plus the
# label derived from the recorded winner.
ppt["text"] = ppt.apply(
    lambda row: format_text(
        tokenizer,
        row["prompt"],
        row["response_a"],
        row["response_b"],
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    ),
    axis=1,
)
ppt["label"] = ppt.apply(lambda row: format_label(row["winner"]), axis=1)

# Only the first of the 1000 shuffled folds is consumed, so one fold's worth
# of rows is held out for validation (no stratification here).
kf = KFold(n_splits=1000, shuffle=True, random_state=10)
train_index, val_index = next(iter(kf.split(ppt)))
ppt_train = ppt.iloc[train_index]
ppt_val = ppt.iloc[val_index]
print(len(ppt_train), len(ppt_val))


135588 136


## Dataloader

In [6]:
def tokenizer_func(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's tokenizer.

    Each sequence is padded or truncated to exactly ``MAX_LENGTH`` tokens and
    the encoding is returned as NumPy arrays. Used as the mapping function
    for ``Dataset.map``.
    """
    encode_options = {
        "padding": 'max_length',
        "max_length": MAX_LENGTH,
        "truncation": True,
        "return_tensors": 'np',
    }
    return tokenizer(example["text"], **encode_options)


# Columns every split carries into tokenization: the formatted input text, the
# hard label, and the two stored logit columns.
# NOTE(review): the logit columns are presumably teacher outputs used for
# distillation — confirm against the training code.
DISTILL_COLUMNS = ["text", "label", "logits_qwencd", "logits_qwen32"]


def _to_dataset(frame):
    """Convert one pandas split into a ``Dataset`` with only ``DISTILL_COLUMNS``.

    ``preserve_index=False`` keeps the pandas index out of the Arrow table.
    Without it, ``from_pandas`` adds an ``__index_level_0__`` column only when
    the index is not a ``RangeIndex``, so a later hard-coded removal of that
    column would raise as soon as a split happened to have a default index.
    """
    return Dataset.from_pandas(frame[DISTILL_COLUMNS], preserve_index=False)


ppt_train_dataset = _to_dataset(ppt_train)
ppt_val_dataset = _to_dataset(ppt_val)
ft_train_dataset = _to_dataset(ft_train)
ft_val_dataset = _to_dataset(ft_val)
raw_dataset = DatasetDict({
    'ppt135k_train': ppt_train_dataset,
    'ppt135k_val': ppt_val_dataset,
    'ft48k_train': ft_train_dataset,
    'ft48k_val': ft_val_dataset,
})

# Tokenize in batches, rename the target column to `labels`, and drop the raw
# text now that it has been encoded.
tokenized_dataset = raw_dataset.map(tokenizer_func, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
tokenized_dataset


Map: 100%|██████████| 135588/135588 [01:34<00:00, 1432.87 examples/s]
Map: 100%|██████████| 136/136 [00:00<00:00, 1472.10 examples/s]
Map: 100%|██████████| 47952/47952 [00:35<00:00, 1339.44 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1358.12 examples/s]


DatasetDict({
    ppt135k_train: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 135588
    })
    ppt135k_val: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 136
    })
    ft48k_train: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    ft48k_val: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [9]:
# Spot check: the stored `logits_qwen32` value of the first row in the
# fine-tuning training split.
tokenized_dataset["ft48k_train"][0]["logits_qwen32"]

[1.9086914, -1.5205078]

In [15]:
# Decode one validation example back to text (special tokens are kept, so any
# padding stays visible) and print its label next to it.
i = 0
sample = tokenized_dataset["ft48k_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

<|User Prompt|>
I'm looking to self-host a fully integrated DB platform that supports the database, authentication (oauth) for customers of the system, and granular permissions with an API to access DB data. Can you compare platforms like Supabase, appwrite, budibase, directus, surrealDB, parse, couchbase, and any other suggestions. Particularly on how easy it is to self host and what feature parity self hosting has. Please note if the backend is something generic that could be easily migrated away from.

scale: No huge amounts of data, just medium size catalogues of products for online stores.
features: DO NOT need realtime or spatial, but full text search would be nice
ease of use: Ideally something with a GUI or WebUI for graphical monitoring/management
cost: free/open source is ideal. no concerns on hardware costs, that's already covered

It should be able to host multiple sites and have customer oauth for each

Put the comparison into a table

<|Response A|>
Certainly! Below is a 

In [13]:
# Persist every split of the tokenized DatasetDict under a single directory.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_qwen14b_distill")

Saving the dataset (3/3 shards): 100%|██████████| 135588/135588 [00:01<00:00, 112465.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 136/136 [00:00<00:00, 19768.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 47952/47952 [00:00<00:00, 133743.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 55947.12 examples/s]
