In [1]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Change the working directory to the directory containing the script
WORK_DIR = "/group-volume/binfeng/wsdm/stage_ppt"
os.chdir(WORK_DIR)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained below.
MODEL_PATH = "Skywork/Skywork-Reward-Gemma-2-27B-v0.2"

# Length budgets.
# MAX_PROMPT_LENGTH is forwarded to format_text as `max_prompt_len`.
# MAX_LENGTH is forwarded to format_text as `max_len` and is also the
# tokenizer's padding/truncation length (`max_length`) in tokenizer_func.
MAX_PROMPT_LENGTH = 400
MAX_LENGTH = 2000

## Tokenizer

In [3]:
# Directory the configured tokenizer is written to.
TOKENIZER_DIR = "/group-volume/binfeng/wsdm/tokenizer/sky27b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Pad on the right, reusing the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Last expression of the cell: the tuple of written file paths is displayed.
tokenizer.save_pretrained(TOKENIZER_DIR)

('/group-volume/binfeng/wsdm/tokenizer/sky27b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/sky27b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/sky27b/tokenizer.json')

## Prepare Data

In [4]:
# Import only the helpers this cell uses (instead of `from utils import *`)
# so it is obvious where each name comes from.
from utils import format_label, format_text

# Number of leading rows held out as the validation split.
VAL_SIZE = 100

# Load the raw pairs and drop rows with any missing value. The index is then
# renumbered so it stays a contiguous RangeIndex: a gappy index left behind by
# dropna would otherwise be carried into the Hugging Face datasets as an extra
# `__index_level_0__` column by Dataset.from_pandas.
data = (
    pd.read_csv("/user-volume/bx/ppt127k.csv")
    .dropna()
    .reset_index(drop=True)
)

# Build the model input text for every (prompt, response_a, response_b) row.
# Iterating over the three columns directly avoids constructing a Series per
# row, which is what DataFrame.apply(axis=1) does.
data["text"] = [
    format_text(tokenizer, prompt, response_a, response_b,
                max_len=MAX_LENGTH, max_prompt_len=MAX_PROMPT_LENGTH)
    for prompt, response_a, response_b in zip(
        data["prompt"], data["response_a"], data["response_b"]
    )
]

# Map each `winner` value to its label.
data["label"] = data["winner"].map(format_label)
print(data["label"].nunique())

# split: the first VAL_SIZE rows (in file order, no shuffling) are validation,
# everything after them is training data.
data_val = data.iloc[:VAL_SIZE]
data_train = data.iloc[VAL_SIZE:]

Token indices sequence length is longer than the specified maximum sequence length for this model (4900 > 4096). Running this sequence through the model will result in indexing errors


2


## Tokenize and Save Dataset

In [6]:
def tokenizer_func(example):
    """Tokenize the ``"text"`` field of ``example`` to a fixed length.

    The text is passed to the module-level ``tokenizer`` with padding and
    truncation to ``MAX_LENGTH`` and NumPy output.

    Returns:
        dict with the tokenizer's ``'input_ids'`` and ``'attention_mask'``.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='np',
    )
    # Keep only the two fields needed downstream.
    return {field: encoded[field] for field in ('input_ids', 'attention_mask')}

In [7]:
# Convert each split to a Hugging Face Dataset. `preserve_index=False` keeps
# the pandas index out of the result, so no `__index_level_0__` column can
# appear even if the frames do not carry a plain RangeIndex (e.g. after rows
# were dropped upstream).
train_dataset = Dataset.from_pandas(data_train[["text", "label"]], preserve_index=False)
val_dataset = Dataset.from_pandas(data_val[["text", "label"]], preserve_index=False)

raw_dataset = DatasetDict({
    'ppt127k_train': train_dataset,
    'ppt127k_val': val_dataset
})

# Tokenize both splits in batches, then rename the target column from
# "label" to "labels".
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
)
tokenized_dataset


Map: 100%|██████████| 127209/127209 [01:06<00:00, 1913.13 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1933.13 examples/s]


DatasetDict({
    ppt127k_train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 127209
    })
    ppt127k_val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [8]:
# Spot-check one validation example: decode its token ids back to text
# (special tokens kept visible) and show the label stored with it.
i = 6
sample = tokenized_dataset["ppt127k_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

<bos>## User Prompt
Come up with some phrases that a sentient mechanical bull would say. Make the bull talk with a heavy southern accent, and use heavy inflections like \\"ya\\", \\"in'\\" instead of \\"ing\\", \\"this here\\", and any other typical southern\\/redneck lingo. Try to include words like \\"saddle\\", \\"leather\\", \\"torque\\", and other language to give him a masculine, rugged and bad-ass feel. The bull is generally friendly and encouraging, however he has a hidden mischievous, domineering and sadistic streak. Describe in the style of before, during, after completion and after being thrown, and assign a fitting difficulty for each phrase (low, medium, high, full throttle). Group the phrases by context. Lastly, give him a fitting name.

## Response A
Name: Ironhide Rustler \\n\\nBefore Ride:\\n\\nLow Difficulty:\\n1. \\"Well, ain't ya lookin' perky? Just strap on this here saddle, partner. We're in for a smooth ride, ya hear?\\"\\n2. \\"This here's a easy trot, nothin' m

In [10]:
# Write both tokenized splits to disk.
TOKENIZED_DATASET_DIR = "/group-volume/binfeng/wsdm/data/tokenized_sky27b_ppt"
tokenized_dataset.save_to_disk(TOKENIZED_DATASET_DIR)

Saving the dataset (4/4 shards): 100%|██████████| 127209/127209 [00:06<00:00, 18379.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 873.79 examples/s]
