In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from trl import RewardTrainer, RewardConfig

# TODO:
# 1. Load the dataset from 'answers.csv'
# 2. Clean and normalize the text data 
# 3. Tokenize the prompts and answers
# 4. Create a custom Dataset class for the reward model
# 5. Set up a DataLoader for batching
# 6. Initialize the TinyLlama model for sequence classification
# 7. Configure the RewardTrainer with hyperparameters
# 8. Train the reward model

# === Phase 1: Dataset Preparation ===

# 1. Load & Inspect
# Read the raw table, then print a preview and the per-column count of
# missing values so data problems are visible before any processing.
df = pd.read_csv('answers.csv')
head_rows = df.head()
missing_counts = df.isna().sum()
print("First 5 rows:\n", head_rows, "\n")
print("Missing values:\n", missing_counts, "\n")

# 2. Clean & Normalize
# Strip leading/trailing whitespace from both text columns.
for text_col in ('prompt', 'answer'):
    df[text_col] = df[text_col].str.strip()

# — Research on lowercasing —
# TinyLlama’s tokenizer is case-sensitive (it uses SentencePiece), so we typically do NOT lowercase—
# preserving case usually yields better performance.

# 3. Tokenization
# Checkpoint identifier shared by the tokenizer and the model loaded later.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set padding token if not already set:
# fall back to the end-of-sequence token so the tokenizer always has a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_row(row):
    """Tokenize one row as ``"<PROMPT> " + prompt + " <RESPONSE> " + answer``.

    ``row`` must support ``row['prompt']`` and ``row['answer']`` lookups that
    yield strings. The joined text is passed to the module-level ``tokenizer``
    with truncation enabled and the attention mask requested; the tokenizer's
    result is returned unchanged.
    """
    segments = ("<PROMPT> ", row['prompt'], " <RESPONSE> ", row['answer'])
    return tokenizer("".join(segments), truncation=True, return_attention_mask=True)

# Tokenize every row, then split the per-row encodings into parallel lists
# (one entry per row) alongside the rank column.
tokenized = df.apply(tokenize_row, axis=1)
input_ids = []
attention_mask = []
for encoding in tokenized:
    input_ids.append(encoding["input_ids"])
    attention_mask.append(encoding["attention_mask"])
ranks = df["rank"].tolist()

# 4. Dataset Class
class RewardDataset(Dataset):
    """Map-style dataset over pre-tokenized examples and their ranks.

    Holds three parallel sequences (token ids, attention masks, ranks) and
    converts a single example to ``torch.long`` tensors on each access.
    """

    def __init__(self, input_ids, attention_mask, ranks):
        """Keep references to the three parallel per-example sequences."""
        self.ranks = ranks
        self.attn_mask = attention_mask
        self.input_ids = input_ids

    def __len__(self):
        """Return the number of examples, taken from the length of ``ranks``."""
        return len(self.ranks)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of int64 tensors.

        Keys are ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; the
        label is the example's rank.
        """
        fields = (
            ("input_ids", self.input_ids),
            ("attention_mask", self.attn_mask),
            ("labels", self.ranks),
        )
        return {
            key: torch.tensor(values[idx], dtype=torch.long)
            for key, values in fields
        }

# Wrap the parallel token/mask/rank lists in the dataset defined above.
dataset = RewardDataset(
    input_ids=input_ids,
    attention_mask=attention_mask,
    ranks=ranks,
)

# 5. Batching & Collation
# The collator is built from the tokenizer and used as the DataLoader's
# collate function; examples are served in batches of 4.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(
    dataset=dataset,
    collate_fn=collator,
    batch_size=4,
)

# === Phase 2: Reward Model Training ===
# Load the checkpoint with a single-output classification head
# (num_labels=1), i.e. one scalar score per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
)

# The Llama sequence-classification head uses config.pad_token_id to find the
# last non-padding token of each sequence; if it is unset, batches larger than
# one are rejected. Keep it in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

# Short training run: 100 optimizer steps, log every 10, checkpoint every 25.
reward_config = RewardConfig(
    output_dir="reward_model",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    max_steps=100,
    logging_steps=10,
    save_steps=25,
)

# NOTE(review): RewardTrainer is documented for pairwise preference data
# (a chosen and a rejected response per example). `dataset` yields one
# input_ids/attention_mask/labels item per answer, so it likely needs to be
# converted into chosen/rejected pairs built from the ranks -- confirm against
# the installed TRL version.
# NOTE(review): newer TRL releases rename the `tokenizer` argument to
# `processing_class` -- confirm against the installed TRL version.
trainer = RewardTrainer(
    args=reward_config,  # the trainer takes its config via `args`, not `config`
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
)

trainer.train()

# Persist the trained model and its tokenizer side by side so the directory
# can be reloaded later with from_pretrained('reward_model').
trainer.model.save_pretrained('reward_model')
tokenizer.save_pretrained('reward_model')

  from .autonotebook import tqdm as notebook_tqdm


First 5 rows:
                                               prompt  \
0                        Tell me a joke about robots   
1                        Tell me a joke about robots   
2                        Tell me a joke about robots   
3                        Tell me a joke about robots   
4  Summarize the following paragraph: 'Climate ch...   

                                              answer  rank  
0  Why did the robot go to therapy? It had too ma...     1  
1  Knock knock. Who’s there? Robot. Robot who? Ro...     4  
2  Robots are taking over but they still can’t ma...     3  
3     A robot walks into a bar... and orders a byte.     2  
4  Climate change is the long-term alteration of ...     2   

Missing values:
 prompt    0
answer    0
rank      0
dtype: int64 



Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: RewardConfig.__init__() got an unexpected keyword argument 'max_train_steps'