In [None]:
!pip install transformers accelerate datasets "ray[air]>=1.13"

In [None]:
!pip install sentencepiece ray mlflow py7zr

In [None]:
!pip install ipywidgets>=8 evaluate rouge_score jsonlines

: 

In [1]:
from huggingface_hub import notebook_login, HfFolder

notebook_login()

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /future/u/atemjohn/home/.cache/huggingface/token
Login successful


In [2]:
import torch
import pandas as pd
import mlflow
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [3]:
device = "cuda" if torch.cuda.is_available else "cpu"

### Setup Ray

In [4]:
from pprint import pprint
import ray

if not ray.is_initialized():
  ray.init()

2023-04-03 10:22:57,140	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [5]:
pprint(ray.cluster_resources())

{'CPU': 256.0,
 'GPU': 8.0,
 'accelerator_type:A100': 1.0,
 'memory': 735364656128.0,
 'node:172.24.75.20': 1.0,
 'object_store_memory': 200000000000.0}


### Load Dataset

In [6]:
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict

In [7]:
dataset_folder = "./" #"/content/drive/MyDrive/colab/datasets/summarization/comparisons"

In [8]:
val_dataset = load_dataset("json", data_files={"validation": f"{dataset_folder}/tldr_validation.json"})["validation"]
train_dataset = Dataset.from_pandas(ray.data.read_json(f"{dataset_folder}/tldr_train.json").to_pandas())

raw_datasets = DatasetDict(
    {
        "train": train_dataset,  # .shuffle().select(range(50000)),
        "valid": val_dataset,  # .shuffle().select(range(500))
    }
)

Found cached dataset json (/future/u/atemjohn/home/.cache/huggingface/datasets/json/default-832f6b7e05bb2241/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Read progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.38s/it]
Read progress: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 860.02it/s]


In [9]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['info', 'split', 'summaries', 'choice', 'worker', 'batch', 'extra'],
        num_rows: 92858
    })
    valid: Dataset({
        features: ['info', 'split', 'summaries', 'choice', 'worker', 'batch', 'extra'],
        num_rows: 50715
    })
})

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BloomModel

model_checkpoint = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [12]:
# model = BloomModel.from_pretrained(model_checkpoint, torch_dtype="auto", device_map="auto")

### Preprocess Dataset

In [13]:
max_length = 512

In [14]:
import pandas as pd
import numpy as np

def preprocess_function(samples):

    infos = list(samples["info"].values)
    summaries = list(samples["summaries"].values)
    choices = list(samples["choice"].values)

    anchors = [info["post"] for info in infos]
    positives = []
    negatives = []

    for i, values in enumerate(summaries):
        positives.append(values[choices[i]]["text"])
        negatives.append(values[choices[i]-1]["text"])

    return pd.DataFrame.from_dict({})


def contrastive_preprocess_function(samples):
    
    infos = samples["info"]
    summaries = samples["summaries"]
    choices = samples["choice"]

    anchors = [info["post"] for info in infos]
    positives = []
    negatives = []

    for i, values in enumerate(summaries):
        positives.append(values[choices[i]]["text"])
        negatives.append(values[choices[i]-1]["text"])

    anch_tokens = tokenizer(anchors, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    pos_tokens = tokenizer(positives, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    neg_tokens = tokenizer(negatives, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")

    model_inputs = {
        "anchor": anch_tokens.input_ids,
        "anchor_mask": anch_tokens.attention_mask,
        "positive": pos_tokens.input_ids,
        "positive_mask": pos_tokens.attention_mask,
        "negative": neg_tokens.input_ids,
        "negative_mask": neg_tokens.attention_mask
    }

    return model_inputs

In [15]:
raw_datasets = raw_datasets.map(
    contrastive_preprocess_function, batched=True, num_proc=4
).remove_columns(["info", "split", "summaries", "choice", "worker", "batch", "extra"])

Map (num_proc=4):   0%|          | 0/92858 [00:00<?, ? examples/s]

Loading cached processed dataset at /future/u/atemjohn/home/.cache/huggingface/datasets/json/default-832f6b7e05bb2241/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-5ce8311aa4eb4a66_*_of_00004.arrow


### Setup Training Loop

In [16]:
model_name = "bloomz-contrastive-finetuned"
finetuned_model_name = "bloomz-finetuned"

In [1]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    """
    Remember to use AutoModel for generation here
    """

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    print(preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["rouge"] = result_rouge['rougeL'].mid.fmeasure
    result["gen_len"] = np.mean(prediction_lens)
    result["meteor"] = meteor_result["meteor"]
    result = {k: round(v, 4) for k, v in result.items()}
    return result
    
    return result


[nltk_data] Downloading package punkt to
[nltk_data]     /future/u/atemjohn/home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


AttributeError: module 'evaluate' has no attribute 'load'

In [18]:
from transformers import Trainer

class ContrastiveLossFinetune(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
      negatives = inputs["negative"].to(device)
      positives = inputs["positive"].to(device)
      anchors = inputs["anchor"].to(device)
      neg_mask = inputs["negative_mask"].to(device)
      pos_mask = inputs["positive_mask"].to(device)
      anch_mask = inputs["anchor_mask"].to(device)

      # forward pass
      anchor_outs =  F.normalize(model(anchors, attention_mask=anch_mask, output_hidden_states=True).last_hidden_state[:, -1, :], dim=1)
      negatives_outs = F.normalize(model(negatives, attention_mask=anch_mask, output_hidden_states=True).last_hidden_state[:, -1, :], dim=1)
      positives_outs = F.normalize(model(positives, attention_mask=anch_mask, output_hidden_states=True).last_hidden_state[:, -1, :], dim=1)

      #compute scores
      scores = (anchor_outs @ positives_outs.T) * torch.exp(torch.tensor(0.07))

      # compute custom loss (suppose one has 3 labels with different weights)
      labels = torch.arange(anchor_outs.size()[0], dtype=torch.long).to(device)

      loss = F.cross_entropy(scores, labels)

      return loss

class RegularFinetuner(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
      labels = inputs["labels"].to(device)
      input_ids = inputs["input_ids"].to(device)
      attention_mask = inputs["attention_mask"].to(device)

      labels = torch.tensor([
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels
      ]).to(device)

      # forward pass
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs.loss

      return (loss, outputs) if return_outputs else loss


In [44]:
from transformers import AutoTokenizer, TrainingArguments, BloomModel
from huggingface_hub import HfFolder
from torch import nn

batch_size = 8 #32

# Hugging Face repository id
repository_id = model_name

args = TrainingArguments(
    output_dir=repository_id,
    evaluation_strategy="epoch",
    save_strategy="epoch",

    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=500,

    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    num_train_epochs=1,
    remove_unused_columns=False,
    weight_decay=0.01,
    disable_tqdm=True,  # declutter the output a little

    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token()
)

TypeError: __init__() got an unexpected keyword argument 'predict_with_generate'

In [None]:
contrastive_trainer = ContrastiveLossFinetuner(
    model,
    args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["valid"],
)
result = contrastive_trainer.train()

In [53]:
from transformers import BloomForCausalLM

casual_lm = BloomForCausalLM.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at /future/u/atemjohn/home/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/25f241f41c04f08d658a1dd3b49ad41390109a8e/config.json
Model config BloomConfig {
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "seq_length": 2048,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "transformers_version": "4.26.1",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

loading weights file pytorch_model.bin from c

In [21]:
raw_datasets_2 = raw_datasets
raw_datasets_2 = raw_datasets_2.remove_columns(["negative", "negative_mask", "positive_mask"])

In [22]:
raw_datasets_2

DatasetDict({
    train: Dataset({
        features: ['anchor', 'anchor_mask', 'positive'],
        num_rows: 92858
    })
    valid: Dataset({
        features: ['anchor', 'anchor_mask', 'positive'],
        num_rows: 50715
    })
})

In [23]:
raw_datasets_2 = raw_datasets_2.rename_column("anchor", "input_ids")
raw_datasets_2 = raw_datasets_2.rename_column("positive", "labels")
raw_datasets_2 = raw_datasets_2.rename_column("anchor_mask", "attention_mask")

In [48]:
raw_validation = raw_datasets_2["valid"]

In [49]:
raw_validation = raw_validation.train_test_split(test_size = 0.002)

In [50]:
raw_validation

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50613
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

In [54]:
# Hugging Face repository id
repository_id = finetuned_model_name

args_2 = TrainingArguments(
    output_dir=repository_id,
    evaluation_strategy="steps",
    save_strategy="steps",

    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,

    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1,

    num_train_epochs=1,
    remove_unused_columns=False,
    weight_decay=0.01,
    disable_tqdm=True,  # declutter the output a little

    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token()
)

trainer = RegularFinetuner(
    casual_lm,
    args_2,
    train_dataset=raw_validation["train"],
    eval_dataset=raw_validation["test"],
    compute_metrics=compute_metrics
)

using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices


In [55]:
eval_result = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 102
  Batch size = 64


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [None]:
# Save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

In [42]:
import gc
gc.collect()

1961

In [43]:
torch.cuda.empty_cache()