### Set up imports and environment variables.

In [None]:
%%capture
! rm -rf Interpreting-Reward-Models || true
! git clone https://github.com/apartresearch/Interpreting-Reward-Models.git
! cd Interpreting-Reward-Models && pip install .

In [None]:
import os
import huggingface_hub
import torch
import wandb

from datasets import Dataset, load_dataset
from huggingface_hub import hf_hub_download, upload_file, upload_folder, HfApi
from torch.optim import Adam
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    LlamaTokenizer,
    pipeline
)

from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments
from trl import AutoModelForCausalLMWithValueHead, DPOTrainer
from trl import DPOTrainer

In [None]:
# Hugging Face Hub repository that the final cells of this notebook create
# (if needed) and upload the trained model into.
repo_id = 'amirabdullah19852020/interpreting_reward_models'
# Register tqdm's pandas integration.
tqdm.pandas()
# Log in to the Hugging Face Hub.
huggingface_hub.login()

In [None]:
# Log in to Weights & Biases (wandb).
wandb.login()

In [None]:
from reward_analyzer import get_hh
from reward_analyzer.configs.rlhf_training_config import DPOTrainingConfig

### Set up DPO arguments.

In [None]:
from dataclasses import dataclass, field
from typing import Dict, Optional

import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments

from trl import DPOTrainer

In [None]:
# Base checkpoint to fine-tune. The same string is later reused as the local
# save directory and as part of the upload path on the Hub.
model_name="EleutherAI/gpt-neo-125m"

In [None]:
def train_anthropic_model(model_name):
    """Fine-tune ``model_name`` with DPO and return the trainer.

    Two copies of the checkpoint are loaded: the policy being trained (moved
    to CUDA) and a reference copy (loaded onto the CPU). Train and eval data
    come from ``get_hh``; hyperparameters that are not hard-coded below are
    read from ``DPOTrainingConfig``.

    Args:
        model_name: Checkpoint identifier, passed to ``DPOTrainingConfig`` as
            ``model_name_or_path``.

    Returns:
        The ``DPOTrainer`` instance, after ``train()`` has finished.
    """
    config = DPOTrainingConfig(model_name_or_path=model_name)

    # Policy model: this is the copy that gets optimised.
    policy = AutoModelForCausalLM.from_pretrained(config.model_name_or_path).cuda()

    if config.ignore_bias_buffers:
        # torch distributed hack: list every boolean buffer under the
        # attribute DDP consults for parameters/buffers to ignore.
        policy._ddp_params_and_buffers_to_ignore = [
            buffer_name
            for buffer_name, buffer_tensor in policy.named_buffers()
            if buffer_tensor.dtype == torch.bool
        ]

    # Reference model: a second, independent load of the same checkpoint.
    reference = AutoModelForCausalLM.from_pretrained(config.model_name_or_path).cpu()

    tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
    if tokenizer.pad_token is None:
        # Fall back to the EOS token when the tokenizer defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token

    train_split = get_hh("train", sanity_check=config.sanity_check)
    # The eval split is always requested with sanity_check=True, independent
    # of the configured value used for training.
    eval_split = get_hh("test", sanity_check=True)

    trainer_args = TrainingArguments(
        per_device_train_batch_size=config.per_device_train_batch_size,
        max_steps=config.max_steps,
        remove_unused_columns=False,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        learning_rate=config.learning_rate,
        push_to_hub=True,
        hub_model_id=config.huggingface_hub_name,
        evaluation_strategy="steps",
        logging_first_step=True,
        logging_steps=10,
        eval_steps=2000,
        output_dir="./test",
        optim="adamw_hf",
        warmup_steps=150,
        report_to=config.report_to,
        bf16=True,
        gradient_checkpointing=config.gradient_checkpointing,
    )

    dpo_trainer = DPOTrainer(
        policy,
        reference,
        args=trainer_args,
        beta=config.beta,
        train_dataset=train_split,
        eval_dataset=eval_split,
        tokenizer=tokenizer,
        max_length=config.max_length,
        max_target_length=config.max_target_length,
        max_prompt_length=config.max_prompt_length,
        generate_during_eval=True,
    )
    dpo_trainer.train()
    return dpo_trainer

In [None]:
# Train, then write the fine-tuned weights to a local directory named after
# the model; the upload cell later reads them back from f'./{model_name}'.
dpo_trainer = train_anthropic_model(model_name)
# The trainer is bound to `dpo_trainer`; the previous `dpo_trained` was an
# undefined name and raised NameError before anything was saved.
dpo_trainer.model.save_pretrained(model_name)

### Set up the Hugging Face Hub repo and upload the trained model.

In [None]:
# Re-import and log in again so this section can be run without first
# re-running the setup cells at the top of the notebook.
import huggingface_hub
huggingface_hub.login()

In [None]:
from datetime import datetime

# Timestamp taken when this cell runs (naive local time).
current_datetime = datetime.now()

# ISO 8601 text form of that timestamp; it becomes the last component of the
# upload path so that each run lands in its own folder.
isoformatted_datetime = datetime.isoformat(current_datetime)

In [None]:
api = HfApi()
# `token` was never assigned in the cells above, so the call below raised
# NameError. None tells the Hub client to use the credentials cached by
# huggingface_hub.login().
token = None
# Create the target repo if it does not exist yet (exist_ok=True makes this a
# no-op otherwise). repo_type=None selects the default repo type.
repo_url = api.create_repo(repo_id=repo_id, repo_type=None, exist_ok=True, token=token)

In [None]:
# Upload the locally saved model directory into a timestamped folder of the
# Hub repo. No explicit token is passed: the original `token=token` referred
# to a name that is never assigned in this notebook, and without it the client
# uses the credentials cached by huggingface_hub.login().
api.upload_folder(
    repo_id=repo_url.repo_id,
    folder_path=f'./{model_name}',
    path_in_repo=f'models/{model_name}/{isoformatted_datetime}',
    repo_type=None
)