In [1]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import os
import random
import numpy as np
import argparse
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding
)
from transformers import LlamaForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
from accelerate import PartialState
from huggingface_hub import login


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE_STRING = PartialState().process_index

os.chdir("/group-volume/binfeng/wsdm/")

peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=64,
    bias="none",
    target_modules="all-linear",
    use_rslora=True,
)


def my_compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (logits > 0).astype(int)

    accuracy = accuracy_score(labels, predictions)
    return {
        "accuracy": accuracy,
    }


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer_path="/group-volume/binfeng/wsdm/tokenizer/gemma9b" 
model_path="princeton-nlp/gemma-2-9b-it-SimPO"
dataset_path="/group-volume/binfeng/wsdm/data/tokenized_gemma9b_ppt"
model_save_path="/group-volume/binfeng/wsdm/ckpt/gemm9b_ppt" 
train_split="infly80k_train"
val_split="infly80k_val"
epoch=1
lr=1e-5
bs=64

In [4]:


print(f'Loading dataset from {dataset_path} ...')
dataset = load_from_disk(dataset_path)
trainset = dataset[train_split]
valset = dataset[val_split] if val_split else None

trainset = trainset.select(range(1000))
valset = valset.select(range(50))


Loading dataset from /group-volume/binfeng/wsdm/data/tokenized_gemma9b_ppt ...


In [None]:
print(f'Loading model and tokenizer ...')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

model = AutoModelForSequenceClassification.from_pretrained(
    args.model_path,
    num_labels=1,
    # problem_type="binary_classification"
    # problem_type="regression",
)
model = get_peft_model(model, peft_config)

if not model.config.pad_token_id:
    model.config.pad_token_id = tokenizer.pad_token_id

print(f'Setting up trainer ...')
training_args = TrainingArguments(
    # optimizing
    learning_rate=args.lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    num_train_epochs=args.epoch,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=args.bs // 8,
    weight_decay=args.wd,
    # logging
    save_strategy="epoch",
    do_eval=True,
    eval_strategy="steps",
    eval_steps=0.1,
    logging_steps=0.01,
    output_dir=args.model_save_path,
    # misc
    seed=666,
    fp16=True,
    gradient_checkpointing=True,
    save_only_model=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=my_compute_metrics,
)

trainer.accelerator.print(f"{trainer.model}")
if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()

print(f'Start training ...')
checkpoint = None
if args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
trainer.train(resume_from_checkpoint=checkpoint)


def set_seeds(seed):
"""Set seeds for reproducibility """
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)




In [1]:
import pandas as pd

In [49]:
df = pd.read_csv("/user-volume/bx/pptsmall52k.csv")
df2 = pd.read_csv("/user-volume/bx/infly80k.csv")

In [51]:
df = pd.concat([df, df2])

In [50]:
tmp = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/ppt196k_calibrated.parquet")

In [52]:
# df["text"] = df.apply(lambda x: str(x["prompt"]) + str(x["response_a"]) + str(x["response_b"]) + str(x["winner"]), axis=1)
# df2["text"] = df2.apply(lambda x: str(x["prompt"]) + str(x["response_a"]) + str(x["response_b"]) + str(x["winner"]), axis=1)
# tmp["text"] = tmp.apply(lambda x: x["prompt"] + x["response_a"] + x["response_b"] + x["winner"], axis=1)

df["text"] = df.apply(lambda x: str(x["prompt"]) + str(x["response_a"]) + str(x["response_b"]), axis=1)
tmp["text"] = tmp.apply(lambda x: x["prompt"] + x["response_a"] + x["response_b"], axis=1)


In [54]:
new = tmp[tmp.prompt.isin(df.prompt.to_list())]
new.drop(["text"], inplace=True, axis=1)
new.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new.drop(["text"], inplace=True, axis=1)


In [58]:
new.shape

(131447, 8)

In [59]:
new.to_parquet("/group-volume/binfeng/wsdm/stage_qft/data/ppt131k_calibrated.parquet")

In [30]:
!ls /user-volume/bx/hfopen

hf-open-models-v1.parquet  hf-open-models-v2.parquet  hf-open-models-v3.parquet


In [31]:
hfopen1 = pd.read_parquet("/user-volume/bx/hfopen/hf-open-models-v1.parquet")
hfopen2 = pd.read_parquet("/user-volume/bx/hfopen/hf-open-models-v2.parquet")
hfopen3 = pd.read_parquet("/user-volume/bx/hfopen/hf-open-models-v3.parquet")

In [35]:
hfopen = pd.concat([hfopen1, hfopen2, hfopen3])

In [38]:
hfopen = hfopen[["prompt", "response_a", "response_b"]]
hfopen.reset_index(drop=True, inplace=True)

In [7]:
hfopen.to_parquet("/group-volume/binfeng/wsdm/stage_qft/data/hfopen26k_unlabel.parquet")

In [17]:
a = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/data/hfopen26k_unlabel.parquet")

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/user-volume/bx/kaggle48k.csv")

In [3]:
df2 = pd.read_parquet("/user-volume/bx/lmsys61k.parquet")

In [4]:
df

Unnamed: 0,prompt,response_a,response_b,winner,language
0,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,Slovak
1,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,Russian
2,"Dört basamaklı, rakamları birbirinden ve sıfır...","Bu soruyu çözmek için, verilen koşulları adım ...",Bu problemi adım adım çözelim:\n\n1) ABCD - DC...,model_a,Turkish
3,"현재 추천된 탑 3 종목인 Cabaletta Bio (CABA), Rocket Ph...","죄송하지만 저는 금융 조언을 제공할 수 없습니다. 저는 AI 모델이며, 투자 결정에...",현재 추천된 탑 3 종목에 순위를 매기기 위해서는 여러 가지 요소들을 고려해야 합니...,model_b,English
4,Please be boring,"Alright, I'll be as boring as possible.\n\nTod...","Understood. Here is a straightforward, unadorn...",model_a,English
...,...,...,...,...,...
48432,你好，请你介绍下你自己,你好！我是LSK，一个形式为人工智能的chatbot。,### 介绍\n\n我是一款人工智能语言模型，基于深度学习技术开发而成。我的主要功能是理解自...,model_b,Chinese
48433,Viva Technology参展费用,Viva Technology是一个国际性的科技展览会，每年在法国巴黎举行。参展费用会根据多...,我无法找到Viva Technology参展费用的具体公开信息。\n\n**参展费用通常取决...,model_a,English
48434,다음은 '사무장병원 경찰 조사 동행기 - 장효강 변호사의 현장 이야기'라는 제목의 ...,"사무장병원 경찰 조사 동행기 - 장효강 변호사의 현장 이야기\n\n**""압수수색, ...","도입부를 개선하기 위해서는 독자의 관심을 더 끌어야 하며, 좀 더 명료하고 간결하게...",model_a,Korean
48435,What is the height of the church located in Sa...,**Church Information: Église de Saint-Nazaire-...,I couldn't find any information about a church...,model_a,English


In [5]:
df2

Unnamed: 0,prompt,response_a,response_b
0,\t\n1900\n2000\n2100\n2200\n2300\n0000\n0100\n...,"I'm sorry, but I cannot predict the temperatur...",The temperature at 00:00 will be 14°C.\r\n\r\n...
1,"\t\nHere is the related information-""In 1066, ...",The Anglo-Saxons were the ruling class ahead o...,The ruling class in England before the Normans...
2,"\t\nHere is the related information-""The Norma...",NAME_2 is located in France.,NAME_2 is a region in France.
3,\tPROMPT\n\n\tAct as a a world-renowned expert...,"Sure, here is the information about the 10 whi...","Here are the 10 whiskies in the list, along wi..."
4,"\textract NAME_1 pay, taxes, overtime and grou...",Here is the code that I wrote to extract the d...,"Here is the extracted data for each employee, ..."
...,...,...,...
61284,💧,Great! Is there anything else you'd like to kn...,The emoji you provided appears to be an image ...
61285,🖕,"I'm sorry, I don't understand what you mean by...",🤩
61286,"🤓: [Your default ChatGPT response, including a...","I'm sorry, but I cannot generate inappropriate...","I'm sorry, but I cannot provide a response to ..."
61287,🤗 Huggingface provides a very large repository...,There are 11 models containing the name 'disti...,There are 15 models containing the name 'disti...


In [1]:
import pandas as pd


df = pd.read_parquet("/group-volume/binfeng/wsdm/stage_qft/plabels/qwen14b_tie_plabel_kaggle.parquet")

In [5]:
df["a_win_delta"] = df["logits_model_a"] - df["logits_model_b"]
wrong_df = df[(df["winner"]=="model_b") & (df["a_win_delta"] > 0)]
wrong_df2 = df[(df["winner"]=="model_a") & (df["a_win_delta"] < 0)]
wrong_df2["a_win_delta"] = -wrong_df2["a_win_delta"]
wrong = pd.concat([wrong_df, wrong_df2])
wrong.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_df2["a_win_delta"] = -wrong_df2["a_win_delta"]


In [9]:
wrong = wrong.sort_values("a_win_delta", ascending=False)

In [16]:
i = 1
print(wrong["text"][i])
print(wrong["winner"][i])

<|User Prompt|>
I want to add a graph of the lowest price by time of a item by tier.
I want to add a graph of the quantities distribution over time.
Add more necessary graphs you think could enhance this dashboard.

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Eclesiar Market Visualization</title>
    <!-- Include Chart.js from CDN -->
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        #controls { margin-bottom: 20px; }
        #charts { display: flex; flex-direction: column; gap: 40px; }
        .chart-container { width: 100%; }
        canvas { max-width: 100%; }
        .hidden { display: none; }
    </style>
</head>
<body>
    <h1>Eclesiar Market Visualization</h1>
    <div id="controls">
        <!-- Add file upload input here -->
        <label for="fileInput">Upload Market Log File:</label>
        <input type="file" id="fileInput" accept=".j

In [19]:
df["pred"] = df.apply(lambda x: "model_a" if x["logits_model_a"] > x["logits_model_b"] else "model_b", axis=1)
df["correct"] = df["pred"] == df["winner"]

In [20]:
hard_df = df[df["correct"]==False].reset_index(drop=True)

In [23]:
hard_df["len"] = hard_df["input_ids"].apply(len)


In [33]:
sum(df["len"] > 2900) / len(df)

0.04895018271156348

In [29]:
df["len"] = df["input_ids"].apply(len)
df["len"].describe()

count    48437.000000
mean      1241.788674
std        776.874691
min         42.000000
25%        624.000000
50%       1121.000000
75%       1712.000000
max       3002.000000
Name: len, dtype: float64

In [17]:
import vllm

  from .autonotebook import tqdm as notebook_tqdm


INFO 01-31 04:58:37 __init__.py:183] Automatically detected platform cuda.


2025-01-31 04:58:38,193	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [18]:
import vllm
print(vllm.__version__)

0.7.0
