In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import RewardTrainer, RewardConfig

# Pipeline overview:
# 1. Load the dataset from 'answers.csv'
# 2. Clean and normalize the text data
# 3. Convert the ranked answers into pairwise (chosen, rejected) preferences
# 4. Load the tokenizer and make sure it has a padding token
# 5. Wrap the preference pairs in a HuggingFace Dataset (the trainer handles batching)
# 6. Initialize the TinyLlama model for sequence classification
# 7. Configure the RewardTrainer with hyperparameters
# 8. Train and save the reward model

# === Phase 1: Dataset Preparation ===

# 1. Read the ranked answers and print a quick summary
df = pd.read_csv('answers.csv')
print("First 5 rows:\n", df.head(), "\n")
print("Missing values:\n", df.isna().sum(), "\n")

# 2. Trim surrounding whitespace from both text columns in one pass
df[['prompt', 'answer']] = df[['prompt', 'answer']].apply(
    lambda column: column.str.strip()
)

# 3. Convert ranking data to pairwise preference format
def create_preference_pairs(df):
    """Convert per-prompt answer rankings into pairwise preference rows.

    Answers are grouped by ``prompt``. Within each group, every pair of
    answers with different ``rank`` values yields one row whose ``chosen`` is
    the answer with the lower (better) rank and whose ``rejected`` is the
    other answer. Answers that tie on rank produce no pair.

    Args:
        df: DataFrame with ``prompt``, ``answer`` and ``rank`` columns.

    Returns:
        DataFrame with exactly the columns ``chosen`` and ``rejected``. The
        columns are present even when no pair could be built (empty input,
        a single answer per prompt, or only tied ranks).
    """
    from itertools import combinations

    # NOTE(review): only the answer text is emitted; the prompt is not part of
    # ``chosen``/``rejected`` -- confirm that is what the trainer should score.
    pairs = []
    for _, group in df.groupby('prompt'):
        # Ascending sort puts the best (lowest) rank first, so in every
        # combination the first element is at least as good as the second.
        ranked = group.sort_values('rank')
        candidates = zip(ranked['answer'].tolist(), ranked['rank'].tolist())
        for (better, better_rank), (worse, worse_rank) in combinations(candidates, 2):
            if better_rank < worse_rank:  # ties carry no preference signal
                pairs.append({'chosen': better, 'rejected': worse})

    # Explicit columns keep the schema stable when ``pairs`` is empty.
    return pd.DataFrame(pairs, columns=['chosen', 'rejected'])

# Create preference dataset
preference_df = create_preference_pairs(df)
print(f"Created {len(preference_df)} preference pairs")

# Fail fast with a clear message: an empty dataset would otherwise only
# surface further down as an IndexError when the first sample is printed.
if preference_df.empty:
    raise ValueError(
        "No preference pairs could be created: each prompt needs at least "
        "two answers with different ranks."
    )

# 4. Tokenization Setup
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fall back to the EOS token for padding when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Same fallback for the numeric id, in case it is still unset
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# 5. Create HuggingFace Dataset from the (chosen, rejected) rows
dataset = Dataset.from_pandas(preference_df)

print(f"Dataset created with {len(dataset)} samples")
print("Dataset columns:", dataset.column_names)
print("Sample:", dataset[0])

# === Phase 2: Model Setup ===

# Single-output classification head, used here for reward modeling
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    pad_token_id=tokenizer.pad_token_id,  # pass the tokenizer's pad id explicitly
    num_labels=1,
)

# Resize the embeddings only when the model and tokenizer vocab sizes differ
tokenizer_vocab_size = len(tokenizer)
model_vocab_size = model.config.vocab_size
if tokenizer_vocab_size != model_vocab_size:
    model.resize_token_embeddings(len(tokenizer))

# === Phase 3: Training Configuration ===

reward_config = RewardConfig(
    # Output, logging and checkpointing
    output_dir="./reward_model_output",
    logging_steps=10,
    save_steps=25,
    # Optimisation schedule
    learning_rate=1e-5,
    max_steps=100,
    per_device_train_batch_size=1,  # kept at 1 to avoid padding issues
    # Device and precision: full precision on CPU
    use_cpu=True,  # switch to False when a GPU is available
    fp16=False,
    bf16=False,
    # Data loading
    dataloader_pin_memory=False,
    dataloader_drop_last=False,  # keep incomplete batches
    remove_unused_columns=False,
    # Reward-model specifics
    max_length=512,  # maximum sequence length
    disable_dropout=True,
)

# === Phase 4: Training ===

# Wire the model, config, tokenizer and preference data into the trainer
trainer = RewardTrainer(
    args=reward_config,
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
)

print("Starting training...")
trainer.train()

# === Phase 5: Save Model ===

# Model weights and tokenizer files go into the same directory
print("Saving model...")
trainer.model.save_pretrained('./reward_model')
tokenizer.save_pretrained('./reward_model')

print("Training completed successfully!")

# === Optional: Test the trained model ===
# def test_reward_model():
#     """Quick test of the trained reward model"""
#     from transformers import pipeline
    
#     # Load the trained model
#     reward_model = pipeline(
#         "text-classification",
#         model="./reward_model",
#         tokenizer="./reward_model",
#         device=-1  # Use CPU
#     )
    
#     # Test with sample texts
#     test_texts = [
#         "This is a great answer!",
#         "This is a poor answer."
#     ]
    
#     for text in test_texts:
#         result = reward_model(text)
#         print(f"Text: '{text}' -> Reward: {result[0]['score']:.4f}")

# Uncomment to test the model after training
# test_reward_model()

  from .autonotebook import tqdm as notebook_tqdm


First 5 rows:
                                               prompt  \
0                        Tell me a joke about robots   
1                        Tell me a joke about robots   
2                        Tell me a joke about robots   
3                        Tell me a joke about robots   
4  Summarize the following paragraph: 'Climate ch...   

                                              answer  rank  
0  Why did the robot go to therapy? It had too ma...     1  
1  Knock knock. Who’s there? Robot. Robot who? Ro...     4  
2  Robots are taking over but they still can’t ma...     3  
3     A robot walks into a bar... and orders a byte.     2  
4  Climate change is the long-term alteration of ...     2   

Missing values:
 prompt    0
answer    0
rank      0
dtype: int64 



Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: RewardConfig.__init__() got an unexpected keyword argument 'max_train_steps'