In [38]:
!sh train.sh

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aiotlab3/.cache/huggingface/token
Login successful


In [39]:
from datasets import Dataset, DatasetDict

train_dataset = pd.read_csv('../train.csv')
valid_dataset = pd.read_csv('../val.csv')
test_dataset = pd.read_csv('../test.csv')
    
dataset = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [40]:
labels = set()
for label_list in dataset['train']['labels']:
    for label in label_list.split():
        labels.add(label)

for label_list in dataset['validation']['labels']:
    for label in label_list.split():
        labels.add(label)

for label_list in dataset['test']['labels']:
    for label in label_list.split():
        labels.add(label)

labels = sorted(list(labels))  
id2label = {idx : label for idx, label in enumerate(labels)}
label2id = {label : idx for idx, label in enumerate(labels)}

In [41]:
def labels_to_one_hot(label_list, label2id):
    num_labels = len(label2id)
    one_hot = np.zeros(num_labels)
    for label in label_list.split():
        label_id = label2id[label]
        one_hot[label_id] = 1
    return one_hot

In [42]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

In [43]:
import torch
from torch.utils.data import dataset
from datasets import load_dataset, load_metric

class MultiLabelDataset(pd.DataFrame):
    def __init__(self, 
                 df=None, 
                 tokenizer=None, 
                 max_length=1024, 
                 max_samples=None,
                 is_test=False
        ):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        
        if max_samples is not None:
            self.df = self.df.sample(n=max_samples, random_state=42).reset_index(drop=True)

    def __len__(self):
        return len(self.df)


    def __getitem__(self, index):
        text = self.df.loc[index, "text"]
        text = text.replace('</s>','<s>')
        labels = self.df.loc[index, "labels"]
        # process caption
        model_inputs = self.tokenizer(
            text, 
            max_length=self.max_length,  
            truncation=True,
        )
        model_inputs = {k : v[0] for k,v in model_inputs.items()}
        
        num_labels = len(label2id)
        one_hot = np.zeros(num_labels)
        for label in labels.split():
            label_id = label2id[label]
            one_hot[label_id] = 1
        
        model_inputs["labels"] = torch.tensor(one_hot, dtype=torch.float)
        
        return model_inputs

In [44]:
train_ds = MultiLabelDataset(
    df=train_dataset,
    tokenizer=tokenizer
)
val_ds = MultiLabelDataset(
    df=valid_dataset,
    tokenizer=tokenizer
)
test_ds = MultiLabelDataset(
    df=test_dataset,
    tokenizer=tokenizer
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/aiotlab3/anaconda3/envs/readsum/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_18223/2280212892.py", line 1, in <module>
    train_ds = MultiLabelDataset(
  File "/tmp/ipykernel_18223/3262584881.py", line 13, in __init__
    self.df = df.reset_index(drop=True)
  File "/home/aiotlab3/anaconda3/envs/readsum/lib/python3.9/site-packages/pandas/core/generic.py", line 6230, in __setattr__
    existing = getattr(self, name)
  File "/home/aiotlab3/anaconda3/envs/readsum/lib/python3.9/site-packages/pandas/core/generic.py", line 6201, in __getattr__
    and self._info_axis._can_hold_identifiers_and_holds_name(name)
  File "/home/aiotlab3/anaconda3/envs/readsum/lib/python3.9/site-packages/pandas/core/generic.py", line 6201, in __getattr__
    and self._info_axis._can_hold_identifiers_and_holds_name(name)
  File "/home/aiotlab3/anacond

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir = '../checkpoints',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",
    run_name='bart-rec'
)

In [None]:
import torch
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

def multi_label_metrics(predictions, labels, threshold=0.2):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()