In [1]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
tqdm.pandas()

# Switch the kernel's working directory to the project folder (hard-coded
# cluster path), so relative paths used afterwards resolve against it.
WORK_DIR = "/group-volume/binfeng/wsdm/stage_ppt"
os.chdir(WORK_DIR)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face model identifier; the tokenizer is loaded from it.
MODEL_PATH = "Qwen/Qwen2.5-32B-Instruct"
# Token budget per example: passed to format_text as max_len and to the
# tokenizer as max_length (the padding/truncation target).
MAX_LENGTH = 2000
# Passed to format_text as max_prompt_len; presumably caps the prompt part
# of the formatted text -- TODO confirm against utils.format_text.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [5]:
# Load the tokenizer for MODEL_PATH, reuse its EOS token as the padding token,
# and write the configured tokenizer to disk.
TOKENIZER_SAVE_DIR = "/group-volume/binfeng/wsdm/tokenizer/qwen32b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)

('/group-volume/binfeng/wsdm/tokenizer/qwen32b/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/qwen32b/tokenizer.json')

## Prepare Data

In [4]:
from utils import *


def _render_text(row):
    """Build the model input text for one CSV row via ``format_text``."""
    return format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    )


# Load the raw comparisons and discard every row that has a missing value.
data = pd.read_csv("/user-volume/bx/ppt127k.csv")
data.dropna(inplace=True)

# Derive the model input text (row-wise) and the label (from `winner`).
data["text"] = data.apply(_render_text, axis=1)
data["label"] = data["winner"].map(format_label)
print(data["label"].nunique())

# split: the first 1000 rows are held out for validation, the rest is training
data_val = data.iloc[:1000]
data_train = data.iloc[1000:]

2


## Tokenize and Build Dataset

In [6]:
def tokenizer_func(example):
    """Tokenize ``example["text"]`` into fixed-length NumPy arrays.

    Every sequence is truncated or padded to exactly ``MAX_LENGTH`` tokens
    (``padding='max_length'`` together with ``truncation=True``).

    Args:
        example: Mapping with a ``"text"`` entry; when used with
            ``Dataset.map(..., batched=True)`` this is a batch of texts.

    Returns:
        dict holding only the ``input_ids`` and ``attention_mask`` entries of
        the tokenizer output.
    """
    encoded = tokenizer(
        example["text"],
        return_tensors='np',
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )

    # Forward just the two fields this function exposes; any other tokenizer
    # outputs are dropped.
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

In [7]:
# Wrap the train/validation frames as Hugging Face datasets.
# preserve_index=False keeps the pandas index out of the dataset. Without it,
# a non-trivial index (e.g. after dropna) is stored as an extra
# '__index_level_0__' column while a plain RangeIndex is not, so a later
# unconditional remove_columns(['__index_level_0__']) would raise whenever
# no rows had been dropped.
train_dataset = Dataset.from_pandas(data_train[["text", "label"]], preserve_index=False)
val_dataset = Dataset.from_pandas(data_val[["text", "label"]], preserve_index=False)
raw_dataset = DatasetDict({
    'ppt127k_train': train_dataset,
    'ppt127k_val': val_dataset
})

# Tokenize both splits in batches, then rename the target column to "labels".
tokenized_dataset = raw_dataset.map(tokenizer_func, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset


Map: 100%|██████████| 126309/126309 [01:27<00:00, 1443.45 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1507.62 examples/s]


DatasetDict({
    ppt127k_train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 126309
    })
    ppt127k_val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [8]:
# Spot-check one validation example: decode its token ids back to text
# (special tokens kept visible) and print its label.
i = 1
sample = tokenized_dataset["ppt127k_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

## User Prompt
What is the biggest factor in which kinds of candidates have a possibility of winning a major office in the U.S.?

## Response A
There are several factors that can influence the likelihood of a candidate winning a major office in the United States. Some of the biggest factors include:\n\n1. Political experience and background: Candidates with prior political experience, particularly at the federal or state level, often have an advantage in terms of name recognition and understanding of the political process.\n2. Fundraising ability: Major political campaigns require significant financial resources to reach voters and get their message out. Candidates who are able to raise large amounts of money from donors, either through personal wealth or through a strong network of supporters, are often more competitive.\n3. Party support: In the United States, major political parties play a significant role in selecting and supporting candidates for office. Candidates who are able to

In [9]:
# Persist the tokenized train/validation splits to disk.
TOKENIZED_DATA_DIR = "/group-volume/binfeng/wsdm/data/tokenized_qwen32b_ppt"
tokenized_dataset.save_to_disk(TOKENIZED_DATA_DIR)

Saving the dataset (4/4 shards): 100%|██████████| 126309/126309 [00:01<00:00, 91133.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 86961.02 examples/s]


In [10]:
# Display the feature schema (column names and types) of the tokenized training split.
tokenized_dataset["ppt127k_train"].features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}