In [1]:
import torch
torch.__version__

'2.2.1+cu121'

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    TrainerCallback,
)
from datasets import load_dataset
import evaluate
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
import numpy as np

# Importing dp_transformers modules
import dp_transformers
from dp_transformers import TrainingArguments as DPTrainingArguments, PrivacyArguments
from dp_transformers.dp_utils import OpacusDPTrainer
from torch.utils.data import DataLoader
from tqdm import tqdm

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:

id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

In [4]:
# this is different than the guide can be removed
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
import torch
from transformers import AutoModelForSequenceClassification

In [6]:
device = "cuda"

In [7]:
if not torch.cuda.is_available():
    raise RuntimeError("No GPU found. A GPU is needed for quantization.")


In [8]:
model = AutoModelForSequenceClassification.from_pretrained("/model-weights/Llama-2-7b-hf",
                                                           num_labels=3,
                                                           label2id=label2id,
                                                           id2label=id2label,
                                                           quantization_config=bnb_config,)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /model-weights/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenizer = AutoTokenizer.from_pretrained("/model-weights/Llama-2-7b-hf", return_tensors="pt", 
                                          model_max_length=150)

tokenizer.pad_token = tokenizer.eos_token

In [10]:
orig_dataset = load_dataset("parquet", data_files="data_balanced.parquet")

dataset = orig_dataset.rename_column("Lable", "label").rename_column("Text", "text")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

def preprocess_fn(examples):
    return tokenizer(examples["text"], truncation=True)

toked = dataset.map(preprocess_fn, batched=True).remove_columns("text")

def change_labels(example):
  label = example["label"]
  n_label = 0 if label == "negative" else 1 if label == "neutral" else 2
  example["label"] = n_label
  return example

toked = toked.map(change_labels)

In [11]:
orig_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Lable'],
        num_rows: 9093
    })
})

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8183
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 910
    })
})

In [13]:
toked

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 8183
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 910
    })
})

In [14]:
class DataCollatorForPrivateWithPadding(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer=tokenizer)

    def __call__(self, examples):
        batch = super().__call__(examples)


        # Huggingface's default way of constructing position_ids is not compatible with Opacus
        # since Opacus is not able to deduce the batch size from the input. Here we manually
        # generate a position_ids tensor which has the same values as Huggingface's default tensor
        # but it is constructed in a way that is compatile with Opacus by using expand_as.
        if "position_ids" not in batch:
            input_ids = batch["input_ids"]
            batch["position_ids"] = torch.arange(
                input_ids.shape[1], dtype=torch.long, device=input_ids.device
            ).repeat(input_ids.shape[0], 1)
        return batch

In [15]:
# data_collator = dp_transformers.DataCollatorForPrivateCausalLanguageModeling(tokenizer)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator = DataCollatorForPrivateWithPadding(tokenizer)

In [16]:
accuracy = evaluate.load("accuracy")


In [17]:
def compute_metrics(eval_pred):
    print(f"eval preds: {eval_pred}")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:

# this is different than the guide (llama is a bit different, should define pad token)
model.config.pad_token_id = model.config.eos_token_id
model.config.use_cache = False 

In [19]:
# this is different than the guide can be removed
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=['v_proj', 'down_proj', 'up_proj', 'q_proj', 'gate_proj', 'k_proj', 'o_proj'],
)
#model.add_adapter(peft_config) can be removed
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [20]:
class SaveScoreCallback(TrainerCallback):  
    def __init__(self, model, trainer) -> None:
        super().__init__()
        self.model = model
        self.trainer = trainer

    def on_save(self, 
                args, 
                state,
                control,
                **kwargs ):
        with open(f"{args.output_dir}/checkpoint-{state.global_step}/privacy.log", 'a') as f:
            f.write(f"prv epsilon: {self.trainer.get_prv_epsilon()}\n")
            f.write(f"rpd epsilon: {self.trainer.get_rdp_epsilon()}\n")
        self.model.save_pretrained(f"{args.output_dir}/checkpoint-{state.global_step}/")
        fname = f"{args.output_dir}/checkpoint-{state.global_step}/score.original_module.pt"
        torch.save(model.model.score.original_module.state_dict(), fname)

In [21]:
import datasets
import dp_transformers
import transformers
import sys
import logging

from dataclasses import dataclass, field, asdict
from peft import get_peft_model, LoraConfig

from dp_transformers.grad_sample.transformers import conv_1d

In [22]:
# Differential Privacy arguments
privacy_args = PrivacyArguments(
    target_epsilon= 8,
    per_sample_max_grad_norm = 1.0
)


In [23]:
# Adjusted TrainingArguments for differential privacy
dp_training_args = DPTrainingArguments(
    output_dir="tweet_shield_dp_trained_model",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    # save_strategy="steps",
    # save_steps=10,
    remove_unused_columns=False,
    # label_names=[0,1,2],
)

In [24]:
dl = DataLoader(toked["test"], collate_fn=data_collator,batch_size=1)

In [25]:
class EvaluationCallback(TrainerCallback):
    def __init__(self, model, eval_dataset, batch_size=6, steps=50):
        super().__init__()
        self.steps = steps
        self.model = model
        self.eval_dataset = eval_dataset
        self.batch_size = batch_size

    def set_trainer(self, trainer):
        self.trainer = trainer

    def on_step_begin(self, args, state, control, **kwargs):
        if state.global_step % self.steps == 0:
            loss, accuracy = self.compute_metrics()
            self.log_metrics(loss, accuracy, state.epoch)
            control.should_log = True
        
    def compute_metrics(self):
        with torch.no_grad():
            dl = DataLoader(
                self.eval_dataset,
                collate_fn=DataCollatorForPrivateWithPadding(tokenizer),
                batch_size=self.batch_size
            )
            
            tot_correct = 0
            num = 0
            sum_loss = 0.0

            for batch in tqdm(dl):
                batch_size = batch["labels"].shape[0]
                num += batch_size
                
                output = self.model(**batch)
    
                sum_loss += output["loss"] * batch_size
                
                probs = self.model(**batch)["logits"].softmax(dim=-1)
                preds = torch.argmax(probs, dim=-1)
                
                tot_correct += (batch["labels"] == preds).sum()


            return sum_loss/num, tot_correct/num

    
    def log_metrics(self, loss, acc, epoch):
        logs = {}
        logs["validation_loss"] = loss
        logs["accuracy"] = acc
        print(f"""validation_loss: {loss}
        validation_acc: {acc}
        epoch: {epoch}""")

In [None]:
evaluation_callback = EvaluationCallback(model, toked["test"], steps=1363)

trainer = OpacusDPTrainer(
# trainer = Trainer(
    model=model,
    args=dp_training_args,
    train_dataset=toked["train"],
    # eval_dataset=toked["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    privacy_args=privacy_args,

)
evaluation_callback.set_trainer(trainer)
trainer.add_callback(evaluation_callback)
trainer.add_callback(SaveScoreCallback(model, trainer))
# ignore_keys = getattr(trainer.model._module.config, "keys_to_ignore_at_inference", [])
trainer.train()

trainer.save_model()


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 152/152 [01:45<00:00,  1.44it/s]


validation_loss: 2.225940704345703
        validation_acc: 0.36043956875801086
        epoch: 0




Step,Training Loss
1,2.4449


In [None]:
print(f"prv epsilon: {trainer.get_prv_epsilon()}")
print(f"rpd epsilon: {trainer.get_rdp_epsilon()}")