In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd

## Data Preprocessing

In [11]:
# Read the raw training table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [12]:
# Peek at the raw `prompt` value stored under index label 0.
df["prompt"][0]

'["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]'

In [13]:
# Peek at the raw `response_a` value stored under index label 0.
df["response_a"][0]

'["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\\n\\nHere are some arguments in favor of and against such policies:\\n\\n**Arguments in favor:**\\n\\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\\n\\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\\n\\n3. **Equality of Opportunity:** Setting targets for female representation in management can help ensure that women have equal opportunities to advance in their careers.\\n\\n4. **R

In [14]:
def _swap_prompt_with(row, response_col):
    """Return a copy of `row` whose `prompt` and `response_col` values are exchanged.

    Every other column (including the winner flags) is carried over unchanged.
    """
    swapped = row.copy()
    swapped['prompt'], swapped[response_col] = row[response_col], row['prompt']
    return swapped


# "Reversal" augmentation: for each winning (or tied) response, emit an extra
# row in which that response and the prompt trade places. A tie yields two
# extra rows (one per response); rows are appended in the original row order.
new_rows = []
for index, row in df.iterrows():
    is_tie = row['winner_tie'] == 1
    if is_tie or row['winner_model_a'] == 1:
        new_rows.append(_swap_prompt_with(row, 'response_a'))
    if is_tie or row['winner_model_b'] == 1:
        new_rows.append(_swap_prompt_with(row, 'response_b'))

# Collect the augmented rows into their own frame.
new_df = pd.DataFrame(new_rows)

In [16]:
# Original rows first, then the reversal-augmented rows, renumbered 0..N-1.
df_expanded = pd.concat(objs=[df, new_df], ignore_index=True)

In [17]:
# Persist the augmented table (without the index column) for later reuse.
df_expanded.to_csv(path_or_buf="train_reversal.csv", index=False)

In [18]:
# Mirror every example so both presentation orders are available: the two
# responses trade places and so do the matching winner flags. `prompt` and
# `winner_tie` are left untouched.
#
# This is built column-wise with `assign` instead of copying each row inside
# an `iterrows` loop: the resulting values are the same, without the per-row
# Python overhead. Every right-hand side is read from `df_expanded` itself,
# so each swap uses the original (pre-swap) values.
df_swap = df_expanded.assign(
    response_a=df_expanded['response_b'],
    response_b=df_expanded['response_a'],
    winner_model_a=df_expanded['winner_model_b'],
    winner_model_b=df_expanded['winner_model_a'],
)

In [19]:
# Mirrored rows first, then the un-mirrored ones, renumbered 0..N-1.
df_expanded_swap = pd.concat(objs=[df_swap, df_expanded], ignore_index=True)

In [20]:
def get_label(row):
    """Collapse the winner flags of `row` into a single class id.

    Returns 0 when `winner_tie` is 1, 1 when `winner_model_a` is 1,
    and 2 in every other case. The tie flag takes precedence.
    """
    if row["winner_tie"] == 1:
        return 0
    return 1 if row["winner_model_a"] == 1 else 2

In [21]:
def _response_pair(row):
    """Return the two candidate responses of `row` as a 2-element list."""
    return [row["response_a"], row["response_b"]]


# Model input: the [response_a, response_b] pair. Target: the class id from `get_label`.
# Both columns are added to `df_expanded_swap` in place.
df_expanded_swap["text"] = df_expanded_swap.apply(_response_pair, axis=1)
df_expanded_swap["label"] = df_expanded_swap.apply(get_label, axis=1)


In [22]:
# Keep features and targets as single-column DataFrames (not Series).
X = df_expanded_swap.loc[:, ["text"]]
y = df_expanded_swap.loc[:, ["label"]]

In [23]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# NOTE(review): the split is done after augmentation, so mirrored/reversed
# copies of one original comparison can end up on both sides of the split.
# Consider splitting before augmenting if a leak-free dev set is needed.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [24]:
# Halve the held-out rows into a test part and an eval part, again stratified
# on the label with a fixed seed.
X_test, X_eval, y_test, y_eval = train_test_split(
    X_dev,
    y_dev,
    test_size=0.5,
    random_state=42,
    stratify=y_dev,
)


In [25]:
# Checkpoint used for both the tokenizer and (later) the classifier.
# Alternative that was tried/considered:
#model_path = 'microsoft/deberta-v3-small'
model_path = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)


In [30]:
# Display the vocabulary id of the tokenizer's separator token.
tokenizer.sep_token_id

102

In [26]:
# Wrap the train and eval feature frames as Hugging Face datasets.
hf_dataset_train, hf_dataset_eval = (
    Dataset.from_pandas(frame) for frame in (X_train, X_eval)
)

In [27]:
def _tokenize_pairs(batch):
    """Tokenize a batch of `text` entries, truncating each to at most 512 tokens.

    Each `text` entry is the 2-element [response_a, response_b] list.
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Tokenize both splits in batches across 10 worker processes.
dataset_train = hf_dataset_train.map(_tokenize_pairs, batched=True, num_proc=10)
dataset_eval = hf_dataset_eval.map(_tokenize_pairs, batched=True, num_proc=10)

Map (num_proc=10):   0%|          | 0/212344 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/26543 [00:00<?, ? examples/s]

In [31]:
# Sanity check: how many times token id 102 (the separator id shown above)
# occurs in the first eval example.
dataset_eval[0]["input_ids"].count(102)

2

In [None]:
# Drop the raw text and the pandas index column; only tokenizer outputs remain.
# Not idempotent: a second run fails because the columns are already gone.
dataset_train = dataset_train.remove_columns(column_names=['text', '__index_level_0__'])

In [33]:
# Drop the raw text and the pandas index column; only tokenizer outputs remain.
# Not idempotent: a second run fails because the columns are already gone.
dataset_eval = dataset_eval.remove_columns(column_names=['text', '__index_level_0__'])

In [34]:
# Attach the gold labels positionally.
# NOTE(review): this relies on `dataset_train` still being in `X_train` row
# order, which matches `y_train` because both came from the same split.
# `add_column` is documented to take a list or NumPy array, so pass the
# underlying array rather than the pandas Series.
dataset_train = dataset_train.add_column('labels', y_train['label'].to_numpy())

In [35]:
# Attach the gold labels positionally.
# NOTE(review): this relies on `dataset_eval` still being in `X_eval` row
# order, which matches `y_eval` because both came from the same split.
# `add_column` is documented to take a list or NumPy array, so pass the
# underlying array rather than the pandas Series.
dataset_eval = dataset_eval.add_column('labels', y_eval['label'].to_numpy())

## Training

In [36]:
# Persist the tokenized splits so training can start from a fresh kernel.
for split_dataset, target_dir in (
    (dataset_eval, "kaggle_eval_trunc"),
    (dataset_train, "kaggle_train_trunc"),
):
    split_dataset.save_to_disk(target_dir)

Saving the dataset (0/1 shards):   0%|          | 0/26543 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/212344 [00:00<?, ? examples/s]

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_from_disk
from sklearn.model_selection import train_test_split
import torch
import evaluate
import pandas as pd

In [2]:
# Reload the tokenized splits written by the preprocessing section.
dataset_eval, dataset_train = (
    load_from_disk(saved_dir)
    for saved_dir in ("kaggle_eval_trunc", "kaggle_train_trunc")
)

In [3]:
# Same checkpoint as in preprocessing; the tokenizer is needed again here for
# the padding collator.
model_path = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)

In [4]:
# Sequence classifier with three output classes, matching the 0/1/2 ids
# produced by `get_label`, loaded directly onto the CUDA device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    device_map='cuda',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def preprocess_logits(predictions, labels):
    """Reduce raw model outputs to predicted class ids before metrics are computed.

    Passed to the `Trainer` as `preprocess_logits_for_metrics`, so only the
    class ids (not the full logits) are kept for `compute_metrics`.

    Args:
        predictions: A logits tensor whose last dimension indexes the classes,
            or a tuple whose first element is that tensor.
        labels: Unused; present because the hook is called with two arguments.

    Returns:
        A tensor with the index of the largest logit along the last dimension.
    """
    # Depending on the model configuration the output can be a tuple holding
    # extra tensors after the logits; only the logits are needed here.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    return torch.argmax(predictions, dim=-1)

In [6]:
def compute_metrics(model_output):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        model_output: A `(predictions, references)` pair. With
            `preprocess_logits` installed on the trainer, `predictions` are
            already class ids rather than logits.

    Returns:
        Whatever the `f1` metric's `compute` returns for `average='macro'`.
    """
    # Load the metric once and keep it on the function object, so later
    # evaluation passes do not resolve and load the metric script again.
    metric_f1 = getattr(compute_metrics, "_metric_f1", None)
    if metric_f1 is None:
        metric_f1 = evaluate.load('f1')
        compute_metrics._metric_f1 = metric_f1

    predictions, references = model_output

    return metric_f1.compute(predictions=predictions, references=references, average='macro')


In [7]:
# Hyper-parameters forwarded unchanged to `TrainingArguments`.
trainer_params = dict(
    lr_scheduler_type='linear',
    optim='adamw_torch',
    save_strategy='no',
    # NOTE(review): newer transformers releases appear to rename this option
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',
    output_dir='model/',
    overwrite_output_dir=True,
    learning_rate=8e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    warmup_ratio=0.1,
    push_to_hub=False,
    fp16=True,
    report_to='none',
    gradient_accumulation_steps=1,  # kept here in case the run crashes because of the batch size
)

training_args = TrainingArguments(**trainer_params)



In [8]:
# Collator that pads the examples of each batch using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer)

# NOTE(review): the original cell carried bare TODO markers on the two
# datasets and a "tokenizer missing" TODO on the collator, although the
# tokenizer is passed to it - confirm nothing else is pending.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_eval,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits,
)

In [9]:
# Launch fine-tuning and keep the value returned by `train()`.
# In the saved run this cell was interrupted (KeyboardInterrupt), so
# `trainer_output` was never assigned there.
trainer_output = trainer.train()

  0%|          | 0/8848 [00:00<?, ?it/s]

KeyboardInterrupt: 