In [1]:
!git clone https://github.com/AlexZheng-UCLA/LLM-evaluation.git

fatal: destination path 'LLM-evaluation' already exists and is not an empty directory.


In [2]:
%cd LLM-evaluation
!pip install -r requirements.txt

/content/LLM-evaluation
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.auto import tqdm
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import mean_absolute_error

In [4]:
from google.colab import drive

# Make Google Drive available under /content/gdrive for the save/load cells below.
drive.mount(mountpoint='/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Model and Tokenizer

In [5]:
# Load the pretrained reward model and its matching tokenizer.
model_name = "OpenAssistant/reward-model-deberta-v3-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sanity check: score one (question, answer) pair with the untouched model.
question = "Explain nuclear fusion like I am five"
answer = "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. It is a very important process in the universe, as it is the source of energy for stars and galaxies. Nuclear fusion is also a key process in the production of energy for nuclear power plants."
inputs = tokenizer(question, answer, return_tensors='pt')
score = model(**inputs).logits[0].detach().cpu()
print(score)

tensor([0.5816])


## Dataset and Dataloader

In [6]:
# dataset 
class OasstDataset(Dataset):
    """Map-style dataset backed by a CSV of prompt/answer pairs with a quality score.

    The CSV must provide the columns ``prompt_text``, ``answer_text`` and
    ``quality``. Each item is a dict holding the tokenizer's outputs converted
    to tensors, plus a one-element float tensor under ``labels``.
    """

    def __init__(self, filename, tokenizer):
        """Read ``filename`` into memory and keep ``tokenizer`` for per-item encoding."""
        self.tokenizer = tokenizer
        self.data = pd.read_csv(filename)

    def __len__(self):
        """Number of rows in the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx`` as a (prompt, answer) pair and attach its label."""
        record = self.data.iloc[idx]
        encoded = self.tokenizer(
            record['prompt_text'],
            record['answer_text'],
            padding='max_length',
            max_length=512,
            truncation=True,
        )

        # The tokenizer returns plain lists here; the model needs tensors.
        item = {}
        for name, values in encoded.items():
            item[name] = torch.tensor(values)

        # Scalar quality -> float tensor of shape (1,).
        item['labels'] = torch.tensor(record['quality']).float().unsqueeze(0)
        return item

# Build the validation and training datasets from the repository's CSV files.
# The paths are relative, so the working directory must be the repository root.
eval_dataset_full = OasstDataset('dataset/oasst1_quality_val.csv', tokenizer)
train_dataset_full = OasstDataset('dataset/oasst1_quality_train.csv', tokenizer)

print(f"train_dataset size: {len(train_dataset_full)}")
print(f"eval_dataset size: {len(eval_dataset_full)}")

train_dataset size: 52127
eval_dataset size: 2745


In [7]:
import random

# Draw reproducible random subsets of the training data for quicker experiments.
# A dedicated seeded generator keeps the selected rows identical across runs
# without touching the global `random` state that other cells sample from.
subset_rng = random.Random(42)

small_indices = subset_rng.sample(range(len(train_dataset_full)), 1000)
train_dataset_small = Subset(train_dataset_full, small_indices)

medium_indices = subset_rng.sample(range(len(train_dataset_full)), 5000)
train_dataset_medium = Subset(train_dataset_full, medium_indices)

# Kept for compatibility: this name previously ended up holding the 5000-sample indices.
train_indices = medium_indices

## Training Config

In [8]:
# Training configuration
batch_size = 8
num_epochs = 10
path_to_save_model = "LLM_evaluator_full_dataset"

# Training batches are drawn from the full training set in shuffled order.
train_dataloader = DataLoader(train_dataset_full, batch_size=batch_size, shuffle=True)

# AdamW with the "linear" learning-rate schedule and no warm-up steps;
# the schedule length is the total number of optimizer steps.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# TensorBoard writer created with default arguments.
writer = SummaryWriter()

## Training and Evaluation

In [9]:
class EarlyStopping:
    """Stop training once a validation error stops improving.

    The monitored score is treated as an error/loss, i.e. *lower is better*
    (the training loop feeds it the validation mean absolute error). A score
    counts as an improvement only if it is lower than the best score seen so
    far by more than ``delta``. Each improvement saves a checkpoint and resets
    the patience counter; each non-improvement increments the counter, and
    ``early_stop`` becomes ``True`` once the counter reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum decrease of the score that counts as an improvement.
        path: Stored on the instance for compatibility. The checkpoint itself
            is written to the notebook-level ``path_to_save_model``.
    """

    def __init__(self, patience=2, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # lowest validation score seen so far
        self.early_stop = False

    def __call__(self, val_score, model):
        """Record ``val_score`` and checkpoint ``model`` if the score improved."""
        score = val_score

        # Lower is better: require a decrease of more than `delta`.
        improved = self.best_score is None or score < self.best_score - self.delta

        if improved:
            # Save before updating best_score so the message shows old --> new.
            self.save_checkpoint(score, model)
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_score, model):
        """Save ``model`` to ``path_to_save_model`` and report the improvement."""
        model.save_pretrained(path_to_save_model)
        # On the very first checkpoint there is no previous best to report.
        previous = float('inf') if self.best_score is None else self.best_score
        print(f'Validation loss decreased ({previous:.6f} --> {val_score:.6f}).  Saving model ...')


early_stopping = EarlyStopping(patience=2, delta=0.0001)

In [None]:
# Fine-tune the model. After every epoch, measure the mean absolute error (MAE)
# on a random 100-sample validation subset and hand it to the early-stopping helper.
progress_bar = tqdm(range(num_training_steps))
model.train()

# Per-epoch buffers; both are reset at the end of every epoch.
loss_values = []
accuracy_values = []  # per-batch validation MAE (an error, lower is better)

for epoch in range(num_epochs):
    # Training
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss_value = loss.item()  # plain float for logging
        loss_values.append(loss_value)  # record the loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # Write loss to TensorBoard for every step (as a float, not a live tensor)
        writer.add_scalar('Loss/train', loss_value, epoch*len(train_dataloader)+i)

    # Calculate and print mean loss
    mean_loss = sum(loss_values) / len(loss_values)
    print(f'Mean loss at epoch {epoch}: {mean_loss}')
    loss_values = []  # Reset for the next epoch

    # Evaluation
    model.eval()

    # Create a random subset of 100 samples from the evaluation dataset.
    # NOTE(review): a new subset is drawn every epoch, so epoch-to-epoch MAE
    # comparisons (and therefore early stopping) are noisy.
    eval_subset_indices = random.sample(range(len(eval_dataset_full)), 100)
    eval_subset = Subset(eval_dataset_full, eval_subset_indices)
    eval_dataloader_subset = DataLoader(eval_subset, batch_size=batch_size)

    for i, batch in enumerate(eval_dataloader_subset):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Flatten to 1-D explicitly; a bare .squeeze() would collapse a
        # single-sample batch to a 0-d array and break the metric call.
        predictions = outputs.logits.reshape(-1).cpu().numpy()
        labels = batch["labels"].reshape(-1).cpu().numpy()
        mae = mean_absolute_error(labels, predictions)

        accuracy_values.append(mae)

        # Write the validation MAE to TensorBoard for every step
        writer.add_scalar('Accuracy/val', mae, epoch*len(eval_dataloader_subset)+i)

    # Calculate and print the mean of the per-batch MAE values
    mean_accuracy = sum(accuracy_values) / len(accuracy_values)
    print(f'Mean Abosulute Error at epoch {epoch}: {mean_accuracy}')
    accuracy_values = []  # Reset for the next epoch

    # early stopping
    early_stopping(mean_accuracy, model)
    if early_stopping.early_stop:
        print(f"Early stopping at epoch -- {epoch}")
        break
    # Switch back to training mode
    model.train()

# Always store the tokenizer next to the model checkpoint, not only when early
# stopping fires; otherwise the saved directory cannot be reloaded as a
# model + tokenizer pair after a run that completes every epoch.
tokenizer.save_pretrained(path_to_save_model)

writer.close()

  0%|          | 0/65160 [00:00<?, ?it/s]

In [None]:
import os
import glob
all_dirs = glob.glob(os.path.join("runs", "*"))

# Sort the directories based on modification time (most recent first)
sorted_dirs = sorted(all_dirs, key=os.path.getmtime, reverse=True)
newest_logdir = sorted_dirs[0]
print(newest_logdir)

%load_ext tensorboard
%tensorboard --logdir newest_logdir

## Save to Google Drive

In [None]:
des_dir = f"/content/gdrive/Shareddrives/CS_263_Shared"
if not os.path.exists(des_dir):
  os.mkdir(des_dir)
!cp -r /content/LLM-evaluation/{path_to_save_model} {des_dir}

### Quick examination

In [None]:
# Re-score the same example on the CPU with the current (fine-tuned) weights.
model.to("cpu")
# The training loop can leave the model in train mode (dropout active), which
# would make this score non-deterministic; switch to inference mode explicitly.
model.eval()
question, answer = "Explain nuclear fusion like I am five", "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. It is a very important process in the universe, as it is the source of energy for stars and galaxies. Nuclear fusion is also a key process in the production of energy for nuclear power plants."
inputs = tokenizer(question, answer, return_tensors='pt')
with torch.no_grad():  # no gradients are needed for a single forward pass
    score = model(**inputs).logits[0].cpu()
print(score)

## Testing

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class OasstDataset(Dataset):
    """Map-style dataset backed by a CSV of prompt/answer pairs with a quality score.

    The CSV must provide the columns ``prompt_text``, ``answer_text`` and
    ``quality``. Each item is a dict holding the tokenizer's outputs converted
    to tensors, plus a one-element float tensor under ``labels``.
    """

    def __init__(self, filename, tokenizer):
        """Read ``filename`` into memory and keep ``tokenizer`` for per-item encoding."""
        self.tokenizer = tokenizer
        self.data = pd.read_csv(filename)

    def __len__(self):
        """Number of rows in the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx`` as a (prompt, answer) pair and attach its label."""
        record = self.data.iloc[idx]
        encoded = self.tokenizer(
            record['prompt_text'],
            record['answer_text'],
            padding='max_length',
            max_length=512,
            truncation=True,
        )

        # The tokenizer returns plain lists here; the model needs tensors.
        item = {}
        for name, values in encoded.items():
            item[name] = torch.tensor(values)

        # Scalar quality -> float tensor of shape (1,).
        item['labels'] = torch.tensor(record['quality']).float().unsqueeze(0)
        return item

# Load a fine-tuned checkpoint from Google Drive and score the validation CSV.
# NOTE(review): this loads the "small_dataset" checkpoint while the training
# config in this notebook saves to "LLM_evaluator_full_dataset" - confirm this is intended.
model_name = "/content/gdrive/Shareddrives/CS_263_Shared/LLM_evaluator_small_dataset"
model, tokenizer = AutoModelForSequenceClassification.from_pretrained(model_name), AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# Create Dataloader (unshuffled, so predictions stay aligned with the CSV rows)
test_dataset_full = OasstDataset('/content/LLM-evaluation/dataset/oasst1_quality_val.csv', tokenizer)
test_dataloader = DataLoader(test_dataset_full, batch_size=8, shuffle=False)

# One predicted score per sample, in dataset order
predictions_list = []

# Iterate over batches
for batch in tqdm(test_dataloader, desc="Evaluating"):
    # Send batch to device
    batch = {k: v.to(device) for k, v in batch.items()}
    # Calculate outputs
    with torch.no_grad():
        outputs = model(**batch)
    # Keep the score of EVERY sample in the batch; indexing logits[0] would
    # retain only the first sample and silently drop the rest.
    predictions_list.extend(outputs.logits.reshape(-1).cpu().tolist())


In [None]:
# Print a count and a short preview instead of dumping every prediction
# into the cell output.
print(f"{len(predictions_list)} predictions; first 10: {predictions_list[:10]}")

## Upload to Hugging Face

In [None]:
# Log in with the Hugging Face CLI before creating the repo and pushing below.
# Enter credentials interactively; never hardcode a token in the notebook.
!huggingface-cli login

In [None]:
!huggingface-cli repo create {path_to_save_model}

[90mgit version 2.25.1[0m
[90mgit-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5)[0m

You are about to create [1mAlexZheng/LLM_evaluator_medium_dataset[0m
Proceed? [Y/n] 

In [None]:
# Set the global git identity used for the commit created in the next cell.
# NOTE(review): this hardcodes a personal e-mail address and user name; change
# them before running under a different account.
!git config --global user.email "AlexZheng-UCLA@gmail.com"
!git config --global user.name "AlexZheng-UCLA"

In [None]:
# Enter the checkpoint directory. %cd changes the notebook's working directory,
# so relative paths used in other cells will resolve from here if they are re-run.
%cd {path_to_save_model}

# Initialize a git repository in this directory
!git init

# Add the model and tokenizer files to the git repository
!git add .

# Commit the changes
!git commit -m "Initial commit"

# Push the model and tokenizer to the Hugging Face Model Hub.
# NOTE(review): no branch/refspec is given and git-lfs is not set up here;
# confirm the push lands on the Hub's default branch and that large weight
# files are accepted.
!git push https://huggingface.co/AlexZheng/{path_to_save_model}