In [1]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
# Enable tqdm's pandas integration (registers `progress_apply` on pandas objects).
# The cells visible here call plain `.apply`, so this only matters where
# `progress_apply` is used.
tqdm.pandas()

# Change the working directory to the directory containing the script.
# NOTE(review): presumably this is what lets the later `from utils import *`
# pick up the stage-local `utils` module — confirm. The absolute path ties the
# notebook to this particular volume layout.
os.chdir("/group-volume/binfeng/wsdm/stage_ppt")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint directory; the tokenizer below is loaded from this path.
MODEL_PATH = "/group-volume/binfeng/wsdm/ckpt/qwencd32b_ppt/checkpoint-1973"
# Passed to format_text as `max_len` and to the tokenizer as `max_length`
# (every encoded example is padded/truncated to this many tokens).
MAX_LENGTH = 2000
# Passed to format_text as `max_prompt_len`; presumably caps the prompt's share
# of the overall budget — confirm against utils.format_text.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Load the tokenizer from the checkpoint directory. It is handed to format_text
# when building the input text and is called directly by tokenizer_func.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

## Prepare Data

In [5]:
from utils import *
from sklearn.model_selection import StratifiedKFold

# Load the raw comparison data and discard every row that has a missing value
# in any column. The in-place drop is kept so the column assignments below
# operate on the same frame object.
data = pd.read_csv("/user-volume/bx/kaggle48k.csv")
data.dropna(inplace=True)

# Build the model input text and the label for each row.
# format_text / format_label are not defined in this notebook; presumably they
# come from the star-import of `utils` — confirm there for exact semantics.
data["text"] = data.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    ),
    axis=1,
)
data["label"] = data.apply(lambda row: format_label(row.winner), axis=1)
print(data["label"].nunique())

# Stratify on language and keep only the first of the 100 folds, i.e. roughly
# a 99% / 1% train/validation split with a fixed seed.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=10)
train_index, val_index = next(skf.split(data, data["language"]))
data_train = data.iloc[train_index]
data_val = data.iloc[val_index]
print(len(data_train), len(data_val))

2
47952 485




## Tokenize Dataset

In [6]:
def tokenizer_func(example):
    """Encode the ``"text"`` field with the notebook-level tokenizer.

    Intended for ``Dataset.map(..., batched=True)``, in which case ``example``
    is a batch dict and ``example["text"]`` is a list of strings.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens,
    and the tokenizer is asked for NumPy arrays.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        tokenizer output; any other fields the tokenizer returns are dropped.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        return_tensors='np',
    )
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}


# Convert each pandas split into a Hugging Face Dataset, keeping only the
# model input text and its label.
train_dataset, val_dataset = (
    Dataset.from_pandas(split[["text", "label"]])
    for split in (data_train, data_val)
)
raw_dataset = DatasetDict({
    'kaggle48k_train': train_dataset,
    'kaggle48k_val': val_dataset
})

# Tokenize in batches, rename the target column to "labels", and drop the
# pandas index column that from_pandas carried over as '__index_level_0__'.
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__'])
)
tokenized_dataset


Map: 100%|██████████| 47952/47952 [00:34<00:00, 1394.38 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1344.39 examples/s]


DatasetDict({
    kaggle48k_train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    kaggle48k_val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [7]:
# Sanity check: decode one validation example back to text (special tokens
# kept visible) and show its label.
i = 1
val_example = tokenized_dataset["kaggle48k_val"][i]
print(tokenizer.decode(val_example["input_ids"], skip_special_tokens=False))
print("**label:", val_example["labels"])

## User Prompt
A Porygon creates a Portal between the Pokemon word and the Digimon world.

## Response A
**Title: Porygon's Digital Dimension: A Portal Between Worlds**

**Setting:**
In a vibrant digital landscape, where pixelated skies meet the 8-bit ground, Porygon finds itself pondering the limits of its virtual existence. With its sleek, polygonal body and a mind capable of processing vast amounts of information, Porygon wonders if there’s more to the digital world than just battling and training. 

**The Creation of the Portal:**
After a series of intense training sessions and countless battles in the Pokémon world, Porygon discovers an ancient algorithm hidden within its programming. This algorithm whispers of a realm beyond what it knows—the Digimon world, a place filled with creatures born from digital coding, yet possessing a spirit of their own.

Fueled by curiosity and the desire for adventure, Porygon begins to manipulate the code of its own reality. It harnesses its abilit

In [8]:
# Persist both tokenized splits (train and val) to disk so they can be reloaded
# without repeating the tokenization step.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_qwencd32b_ft")

Saving the dataset (1/2 shards):  75%|███████▌  | 35976/47952 [00:00<00:00, 103547.55 examples/s]

Saving the dataset (2/2 shards): 100%|██████████| 47952/47952 [00:00<00:00, 89853.29 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 56483.06 examples/s]


In [9]:
# Display the column schema (names and dtypes) of the tokenized training split.
tokenized_dataset["kaggle48k_train"].features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}