In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd

## Data Preprocessing

In [16]:
df = pd.read_csv("train.csv")

In [17]:
df.prompt[0]

'["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]'

In [18]:
df.response_a[0]

'["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\\n\\nHere are some arguments in favor of and against such policies:\\n\\n**Arguments in favor:**\\n\\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\\n\\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\\n\\n3. **Equality of Opportunity:** Setting targets for female representation in management can help ensure that women have equal opportunities to advance in their careers.\\n\\n4. **R

In [19]:
new_rows = []

# Iterate over the DataFrame rows
for index, row in df.iterrows():
    if row['winner_model_a'] == 1 or row['winner_tie'] == 1:
        new_row = row.copy()
        new_row['prompt'] = row['response_a']
        new_row['response_a'] = row['prompt']
        new_rows.append(new_row)
    if row['winner_model_b'] == 1 or row['winner_tie'] == 1:
        new_row = row.copy()
        new_row['prompt'] = row['response_b']
        new_row['response_b'] = row['prompt']
        new_rows.append(new_row)

# Create a DataFrame from the new rows
new_df = pd.DataFrame(new_rows)

In [20]:
df_expanded = pd.concat([df, new_df], ignore_index=True)

In [21]:
df_expanded.to_csv("train_reversal.csv", index=False)

In [22]:
new_rows = []

# Iterate over the DataFrame rows
for index, row in df_expanded.iterrows():
    new_row = row.copy()
    new_row['response_a'] = row['response_b']
    new_row['response_b'] = row['response_a']
    new_row['winner_model_b'] = row['winner_model_a']
    new_row['winner_model_a'] = row['winner_model_b']    
    new_rows.append(new_row)

df_swap = pd.DataFrame(new_rows)

In [23]:
df_expanded_swap = pd.concat([df_swap, df_expanded], ignore_index=True)

In [24]:
def get_label(row):
    if row["winner_tie"] == 1:
        return 0
    elif row["winner_model_a"] == 1:
        return 1
    return 2

In [25]:
df_expanded_swap["text"] = df_expanded_swap.apply(lambda x: [x.response_a, x.response_b], axis=1)
df_expanded_swap["label"] = df_expanded_swap.apply(get_label, axis=1)


In [None]:
X = df_expanded_swap[["text"]]
y = df_expanded_swap[["label"]]

In [None]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
X_test, X_eval, y_test, y_eval = train_test_split(X_dev, y_dev, test_size=0.5, random_state=42, stratify=y_dev)


In [None]:
#model_path = 'microsoft/deberta-v3-small'
#model_path = 'google/gemma-2b'
model_path = 'state-spaces/mamba-130m-hf'
tokenizer = AutoTokenizer.from_pretrained(model_path, add_eos_token=True)


In [None]:
tokenizer.special_tokens_map_extended

In [None]:
tokenizer.additional_special_tokens_ids

In [None]:
hf_dataset_train = Dataset.from_pandas(X_train)
hf_dataset_eval = Dataset.from_pandas(X_eval)

In [14]:
hf_dataset_eval = Dataset.from_pandas(X_test)

NameError: name 'Dataset' is not defined

In [None]:
tokenizer(' <|endoftext|> '.join(hf_dataset_eval['text'][0]))

In [None]:
dataset_train = hf_dataset_train.map(lambda x: tokenizer(' <|endoftext|> '.join(x['text']), truncation=False), num_proc=10)
dataset_eval = hf_dataset_eval.map(lambda x: tokenizer(' <|endoftext|> '.join(x['text']), truncation=False), num_proc=10)

In [None]:
dataset_eval["input_ids"][0].count(1)

In [None]:
dataset_eval["input_ids"][0][-3:]

In [None]:
dataset_train = dataset_train.remove_columns(['text', '__index_level_0__'])

In [None]:
dataset_eval = dataset_eval.remove_columns(['text', '__index_level_0__'])

In [None]:
dataset_train = dataset_train.add_column('labels', y_train['label'])

In [None]:
dataset_eval = dataset_eval.add_column('labels', y_eval['label'])

## Training

In [None]:
dataset_eval.save_to_disk(f"kaggle_eval_{model_path.split()[-1]}")
dataset_train.save_to_disk(f"kaggle_train_{model_path.split()[-1]}")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

In [None]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

In [None]:
#model_path = 'microsoft/deberta-v3-small'
model_path = 'google/gemma-2b'
#model_path = 'state-spaces/mamba-130m-hf'
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 3, device_map='auto', quantization_config=bnb_config)

In [None]:
model = prepare_model_for_kbit_training(model)

In [None]:

model = get_peft_model(model, lora_config)

In [None]:
dataset_eval = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")
dataset_train = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")

In [None]:
def preprocess_logits(predictions, labels):
    return torch.argmax(predictions, axis=-1)

In [None]:
def compute_metrics(model_output):
    metric_f1 = evaluate.load('f1')

    predictions, references = model_output

    return metric_f1.compute(predictions=predictions, references=references, average='macro')


In [None]:
trainer_params = {
    'lr_scheduler_type' : 'linear',
    'optim' : 'adamw_torch',
    'save_strategy' : 'no',
    'evaluation_strategy': 'epoch',
    'output_dir': 'model/',
    'overwrite_output_dir' : True,
    'learning_rate' : 1e-5,
    'num_train_epochs' : 4,
    'weight_decay' : 0.01,
    'per_device_train_batch_size': 1,
    'per_device_eval_batch_size': 1,
    'warmup_ratio': 0.1,
    'push_to_hub': False,
    'fp16' : True,
    'report_to' : 'none',
    'gradient_accumulation_steps' : 1, # Esto por si peta por batchsize
}

training_args = TrainingArguments(**trainer_params)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset_train, # TODO
    eval_dataset = dataset_eval, # TODO
    compute_metrics = compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits,
    data_collator = DataCollatorWithPadding(tokenizer), # TODO: tokenizer missing
)

In [None]:
trainer_output = trainer.train()

In [2]:
!pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Using cached widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Using cached ipywidgets-8.1.2-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
Using cached widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.2 jupyterlab-widgets-3.0.10 widgetsnbextension-4.0.10


# MAMBO

In [1]:
from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM
import torch
from hf_mamba_classification import MambaForSequenceClassification
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset, load_from_disk

In [2]:
model_path = 'state-spaces/mamba-130m-hf'

In [3]:
num_labels = 3  # the number of labels

In [4]:

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MambaForSequenceClassification.from_pretrained(
    model_path, 
    num_labels=num_labels, 
    use_cache=False  # This needs to be passed when using eval and training Mamba for sequence classification otherwise it will raise an error
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of MambaForSequenceClassification were not initialized from the model checkpoint at state-spaces/mamba-130m-hf and are newly initialized: ['classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
dataset_eval = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")
dataset_train = load_from_disk(f"kaggle_train_{model_path.split()[-1]}")
dataset_train = load_from_disk(f"kaggle_eval_{model_path.split()[-1]}")


In [7]:
def preprocess_logits(predictions, labels):
    return torch.argmax(predictions, axis=-1)

In [8]:
def compute_metrics(model_output):
    metric_f1 = evaluate.load('f1')

    predictions, references = model_output

    return metric_f1.compute(predictions=predictions, references=references, average='macro')


In [9]:
trainer_params = {
    'lr_scheduler_type' : 'linear',
    'optim' : 'adamw_torch',
    'save_strategy' : 'no',
    'evaluation_strategy': 'epoch',
    'output_dir': 'model/',
    'overwrite_output_dir' : True,
    'learning_rate' : 1e-5,
    'num_train_epochs' : 4,
    'weight_decay' : 0.01,
    'per_device_train_batch_size': 1,
    'per_device_eval_batch_size': 1,
    'warmup_ratio': 0.1,
    'push_to_hub': False,
    'fp16' : True,
    'report_to' : 'none',
    'gradient_accumulation_steps' : 1, # Esto por si peta por batchsize
}

training_args = TrainingArguments(**trainer_params)

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=1e-5
)


In [11]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset_train, # TODO
    eval_dataset = dataset_eval, # TODO
    compute_metrics = compute_metrics,
    preprocess_logits_for_metrics = preprocess_logits,
    data_collator = DataCollatorWithPadding(tokenizer), # TODO: tokenizer missing
)

In [12]:
trainer_output = trainer.train()

  0%|          | 0/19908 [00:00<?, ?it/s]

{'loss': 1.1003, 'grad_norm': 34.82256317138672, 'learning_rate': 9.996986136226643e-06, 'epoch': 0.0}
{'loss': 1.1005, 'grad_norm': 12.611607551574707, 'learning_rate': 9.991963029937714e-06, 'epoch': 0.0}
{'loss': 1.0993, 'grad_norm': 13.06774616241455, 'learning_rate': 9.986939923648786e-06, 'epoch': 0.0}
{'loss': 1.0861, 'grad_norm': 26.48930549621582, 'learning_rate': 9.981916817359856e-06, 'epoch': 0.01}
{'loss': 1.1095, 'grad_norm': 34.82231903076172, 'learning_rate': 9.976893711070927e-06, 'epoch': 0.01}
{'loss': 1.0887, 'grad_norm': 33.01508712768555, 'learning_rate': 9.971870604781998e-06, 'epoch': 0.01}
{'loss': 1.083, 'grad_norm': 24.58842658996582, 'learning_rate': 9.966847498493068e-06, 'epoch': 0.01}
{'loss': 1.1019, 'grad_norm': 35.88905715942383, 'learning_rate': 9.962326702833032e-06, 'epoch': 0.01}
{'loss': 1.0827, 'grad_norm': 32.37993240356445, 'learning_rate': 9.957303596544104e-06, 'epoch': 0.01}
{'loss': 1.0948, 'grad_norm': 31.899131774902344, 'learning_rate': 

In [27]:
results = trainer.evaluate(dataset_set)

# Print evaluation results
print(results)

KeyboardInterrupt: 