In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd

## Data Preprocessing

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.prompt[0]

'["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]'

In [4]:
df.response_a[0]

'["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\\n\\nHere are some arguments in favor of and against such policies:\\n\\n**Arguments in favor:**\\n\\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\\n\\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\\n\\n3. **Equality of Opportunity:** Setting targets for female representation in management can help ensure that women have equal opportunities to advance in their careers.\\n\\n4. **R

In [5]:
new_rows = []

# Iterate over the DataFrame rows
for index, row in df.iterrows():
    if row['winner_model_a'] == 1 or row['winner_tie'] == 1:
        new_row = row.copy()
        new_row['prompt'] = row['response_a']
        new_row['response_a'] = row['prompt']
        new_rows.append(new_row)
    if row['winner_model_b'] == 1 or row['winner_tie'] == 1:
        new_row = row.copy()
        new_row['prompt'] = row['response_b']
        new_row['response_b'] = row['prompt']
        new_rows.append(new_row)

# Create a DataFrame from the new rows
new_df = pd.DataFrame(new_rows)

In [6]:
df_expanded = pd.concat([df, new_df], ignore_index=True)

In [7]:
df_expanded.to_csv("train_reversal.csv", index=False)

In [8]:
new_rows = []

# Iterate over the DataFrame rows
for index, row in df_expanded.iterrows():
    new_row = row.copy()
    new_row['response_a'] = row['response_b']
    new_row['response_b'] = row['response_a']
    new_row['winner_model_b'] = row['winner_model_a']
    new_row['winner_model_a'] = row['winner_model_b']    
    new_rows.append(new_row)

df_swap = pd.DataFrame(new_rows)

In [9]:
df_expanded_swap = pd.concat([df_swap, df_expanded], ignore_index=True)

In [10]:
def get_label(row):
    if row["winner_tie"] == 1:
        return 0
    elif row["winner_model_a"] == 1:
        return 1
    return 2

In [11]:
df_expanded_swap["text"] = df_expanded_swap.apply(lambda x: [x.response_a, x.response_b], axis=1)
df_expanded_swap["label"] = df_expanded_swap.apply(get_label, axis=1)


In [12]:
X = df_expanded_swap[["text"]]
y = df_expanded_swap[["label"]]

In [13]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [14]:
X_test, X_eval, y_test, y_eval = train_test_split(X_dev, y_dev, test_size=0.5, random_state=42, stratify=y_dev)


In [15]:
#model_path = 'microsoft/deberta-v3-small'
model_path = 'google/gemma-2b'
#model_path = 'state-spaces/mamba-130m-hf'
tokenizer = AutoTokenizer.from_pretrained(model_path, add_eos_token=True)


In [16]:
tokenizer.special_tokens_map_extended

{'bos_token': AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'eos_token': AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}

In [17]:
tokenizer.additional_special_tokens_ids

[106, 107]

In [18]:
hf_dataset_train = Dataset.from_pandas(X_train)
hf_dataset_eval = Dataset.from_pandas(X_eval)

In [19]:
tokenizer(' <pad> '.join(hf_dataset_eval['text'][0]))

{'input_ids': [2, 3681, 11576, 235248, 235315, 19734, 235274, 235274, 671, 5580, 3356, 3595, 235307, 235248, 0, 10890, 235285, 749, 780, 791, 476, 59232, 13856, 611, 674, 235265, 2456, 708, 1767, 31629, 26387, 1105, 573, 235248, 235315, 19734, 235274, 235274, 48995, 15502, 235269, 578, 573, 2506, 5820, 1721, 780, 148853, 10244, 689, 213330, 11023, 44353, 604, 573, 5562, 576, 674, 1744, 163394, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
dataset_train = hf_dataset_train.map(lambda x: tokenizer(' <eos> '.join(x['text']), truncation=False), num_proc=10)
dataset_eval = hf_dataset_eval.map(lambda x: tokenizer(' <eos> '.join(x['text']), truncation=False), num_proc=10)

Map (num_proc=10):   0%|          | 0/212344 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/26543 [00:00<?, ? examples/s]

In [21]:
dataset_eval["input_ids"][0].count(1)

2

In [22]:
dataset_eval["input_ids"][0][-3:]

[1744, 163394, 1]

In [23]:
dataset_train = dataset_train.remove_columns(['text', '__index_level_0__'])

In [24]:
dataset_eval = dataset_eval.remove_columns(['text', '__index_level_0__'])

In [25]:
dataset_train = dataset_train.add_column('labels', y_train['label'])

In [26]:
dataset_eval = dataset_eval.add_column('labels', y_eval['label'])

## Training

In [38]:
dataset_eval.save_to_disk(f"kaggle_eval_{model_path.split()[-1]}")
dataset_train.save_to_disk(f"kaggle_train_{model_path.split()[-1]}")

Saving the dataset (0/1 shards):   0%|          | 0/26543 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/212344 [00:00<?, ? examples/s]

In [39]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

In [28]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

In [41]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

In [30]:
#model_path = 'microsoft/deberta-v3-small'
model_path = 'google/gemma-2b'
#model_path = 'state-spaces/mamba-130m-hf'
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [38]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 3, device_map='auto', quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
model = prepare_model_for_kbit_training(model)

In [42]:

model = get_peft_model(model, lora_config)

In [43]:
dataset_eval = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")
dataset_train = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")

In [44]:
def preprocess_logits(predictions, labels):
    return torch.argmax(predictions, axis=-1)

In [45]:
def compute_metrics(model_output):
    metric_f1 = evaluate.load('f1')

    predictions, references = model_output

    return metric_f1.compute(predictions=predictions, references=references, average='macro')


In [46]:
trainer_params = {
    'lr_scheduler_type' : 'linear',
    'optim' : 'adamw_torch',
    'save_strategy' : 'no',
    'evaluation_strategy': 'epoch',
    'output_dir': 'model/',
    'overwrite_output_dir' : True,
    'learning_rate' : 1e-5,
    'num_train_epochs' : 4,
    'weight_decay' : 0.01,
    'per_device_train_batch_size': 1,
    'per_device_eval_batch_size': 1,
    'warmup_ratio': 0.1,
    'push_to_hub': False,
    'fp16' : True,
    'report_to' : 'none',
    'gradient_accumulation_steps' : 1, # Esto por si peta por batchsize
}

training_args = TrainingArguments(**trainer_params)

In [47]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset_train, # TODO
    eval_dataset = dataset_eval, # TODO
    compute_metrics = compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits,
    data_collator = DataCollatorWithPadding(tokenizer), # TODO: tokenizer missing
)

In [48]:
trainer_output = trainer.train()

  0%|          | 0/106172 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


# MAMBO

In [None]:
from mamba.head import MambaClassificationHead
from mamba.model import MambaTextClassification
from utils.utils import preprocess_function, compute_metrics
from mamba.trainer import MambaTrainer

import os
import random
import numpy as np
from huggingface_hub import login
from datasets import load_dataset
from transformers import Trainer
from transformers import AutoTokenizer, TrainingArguments