In [1]:
!pip install -q scikit-multilearn
!pip install -q datasets
!pip install -q peft==0.8.2
!pip install accelerate
!pip install -U bitsandbytes



In [2]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)


def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d


# define which metrics to compute for evaluation
def compute_metrics(p):
    predictions, labels = p
    f1_micro = f1_score(labels, predictions > 0, average = 'micro')
    f1_macro = f1_score(labels, predictions > 0, average = 'macro')
    f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }


# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss



In [3]:

# set random seed
random.seed(0)

# load data
with open('/content/train.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile, delimiter=','))
    header_row = data.pop(0)

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:]) for row in data]))

num_examples = 15
idx = idx[0:num_examples]
text = text[0:num_examples]
labels = labels[0:num_examples]

labels = np.array(labels, dtype=int)

# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]




## Ejemplo de texto y clase

In [4]:
x_val[0]

'Title: Deep Structured Generative Models\n\nAbstract: Deep generative models have shown promising results in generating realistic\nimages, but it is still non-trivial to generate images with complicated\nstructures. The main reason is that most of the current generative models fail\nto explore the structures in the images including spatial layout and semantic\nrelations between objects. To address this issue, we propose a novel deep\nstructured generative model which boosts generative adversarial networks (GANs)\nwith the aid of structure information. In particular, the layout or structure\nof the scene is encoded by a stochastic and-or graph (sAOG), in which the\nterminal nodes represent single objects and edges represent relations between\nobjects. With the sAOG appropriately harnessed, our model can successfully\ncapture the intrinsic structure in the scenes and generate images of\ncomplicated scenes accordingly. Furthermore, a detection network is introduced\nto infer scene struct

In [5]:
y_val[0]

array([0, 0, 0, 1, 0, 0])

In [6]:
# create hf dataset
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})

# model name
model_name = 'mistralai/Mistral-7B-v0.1'

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 8, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [7]:

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

In [9]:

# train
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted
1,No log,1.405188,0.0,0.0,0.0
2,No log,0.610935,0.0,0.0,0.0
3,No log,0.446498,0.0,0.0,0.0
4,No log,0.48585,0.0,0.0,0.0
5,No log,0.540224,0.4,0.111111,0.333333
6,No log,0.666552,0.4,0.111111,0.333333
7,No log,0.77697,0.4,0.111111,0.333333
8,No log,0.794268,0.4,0.111111,0.333333
9,No log,0.800468,0.4,0.111111,0.333333


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted
1,No log,1.405188,0.0,0.0,0.0
2,No log,0.610935,0.0,0.0,0.0
3,No log,0.446498,0.0,0.0,0.0
4,No log,0.48585,0.0,0.0,0.0
5,No log,0.540224,0.4,0.111111,0.333333
6,No log,0.666552,0.4,0.111111,0.333333
7,No log,0.77697,0.4,0.111111,0.333333
8,No log,0.794268,0.4,0.111111,0.333333
9,No log,0.800468,0.4,0.111111,0.333333
10,No log,0.807556,0.4,0.111111,0.333333


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


TrainOutput(global_step=20, training_loss=0.29776930809020996, metrics={'train_runtime': 703.0307, 'train_samples_per_second': 0.185, 'train_steps_per_second': 0.028, 'total_flos': 2258580523253760.0, 'train_loss': 0.29776930809020996, 'epoch': 10.0})

In [10]:

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)




('multilabel_mistral/tokenizer_config.json',
 'multilabel_mistral/special_tokens_map.json',
 'multilabel_mistral/tokenizer.model',
 'multilabel_mistral/added_tokens.json',
 'multilabel_mistral/tokenizer.json')

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id, device_map="cuda:0")

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Texto de ejemplo para probar el modelo
sample_text = "Este es un ejemplo de texto para la prueba del modelo."

# Tokenizar el texto
inputs = tokenizer(sample_text, return_tensors="pt")

# Realizar la inferencia
outputs = model(**inputs)

# Obtener las predicciones
predictions = torch.sigmoid(outputs.logits)

# Mostrar las predicciones
print(predictions)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 157.06 MiB is free. Process 223896 has 14.59 GiB memory in use. Of the allocated memory 14.30 GiB is allocated by PyTorch, and 163.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF