In [50]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader


In [51]:
# Load the SST-2 task of the GLUE benchmark (train / validation / test splits).
raw_datasets = load_dataset('glue','sst2')
# `tokenizer` is used by the preprocessing and collation cells and `model` by the forward-pass
# cells further down, so both must be created here; leaving them commented out makes the
# notebook fail with NameError on a fresh kernel (Restart & Run All).
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Display the loaded dataset: its splits, column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [9]:
# For each GLUE task, the name(s) of the dataset column(s) that hold the text input.
# Tasks with a single text column use None as the second entry.
task_to_keys = dict(
    cola=("sentence", None),
    mnli=("premise", "hypothesis"),
    mrpc=("sentence1", "sentence2"),
    qnli=("question", "sentence"),
    qqp=("question1", "question2"),
    rte=("sentence1", "sentence2"),
    sst2=("sentence", None),
    stsb=("sentence1", "sentence2"),
    wnli=("sentence1", "sentence2"),
)

# Task processed by this notebook, and the text column names that go with it.
task_name = 'sst2'
sentence1_key, sentence2_key = task_to_keys[task_name]
def preprocess_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Reads the module-level `tokenizer`, `sentence1_key` and `sentence2_key`.

    Args:
        examples: mapping from column name to the values of that column for the batch.

    Returns:
        The tokenizer output for the batch, with an extra "labels" entry copied from
        `examples["label"]` when that column is present.
    """
    # Single-text tasks pass one argument to the tokenizer, paired-text tasks pass two.
    if sentence2_key is None:
        text_args = (examples[sentence1_key],)
    else:
        text_args = (examples[sentence1_key], examples[sentence2_key])

    # No padding at this stage; sequences are only truncated to at most 128 tokens.
    encoded = tokenizer(*text_args, padding=False, max_length=128, truncation=True)

    # The column is renamed to "labels" because that is the name the model expects.
    if "label" in examples:
        encoded["labels"] = examples["label"]
    return encoded

# Tokenize every split in batches. All raw columns except 'idx' are removed, so each
# processed example keeps its 'idx' value next to the fields returned by preprocess_function.
processed_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
    remove_columns=[col for col in raw_datasets["train"].column_names if col != 'idx'],
)
processed_datasets

DatasetDict({
    train: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1821
    })
})

In [10]:
# Inspect the first processed training example.
processed_datasets['train'][0]

{'idx': 0,
 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': 0}

In [15]:
# Examples were tokenized with padding=False, so the collator is responsible for padding
# when it assembles each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Shuffled training batches of 8 examples each.
train_dataloader = DataLoader(
    processed_datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

In [26]:
# Iterate over the entire dataloader once; after the loop, `batch` holds the last batch yielded.
# NOTE(review): if the goal is only to grab one batch for inspection,
# `batch = next(iter(train_dataloader))` would avoid the full pass — confirm intent.
for batch in train_dataloader:
    pass
# batch

In [25]:
# Drop the 'idx' entry before the forward pass: it is bookkeeping carried along with each
# example and is not passed to the model.
sample = {key: value for key, value in batch.items() if key != 'idx'}
# The bare `sample` expression that used to sit here was removed: only the last expression
# of a cell is displayed, so it had no effect.
model(**sample)

SequenceClassifierOutput(loss=tensor(0.7099, grad_fn=<NllLossBackward>), logits=tensor([[-0.0429,  0.1558],
        [-0.0502,  0.1516],
        [-0.0003,  0.1548],
        [-0.0066,  0.1364],
        [-0.0312,  0.1416]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [34]:
# 'idx' values and labels of the examples in the current batch, as plain Python lists.
batch['idx'].tolist(), batch['labels'].tolist()

([34990, 1629, 3846, 53871, 46015], [0, 0, 0, 1, 0])

In [33]:
# Logits the model produces for the batch, converted to nested Python lists (one row per example).
model(**sample).logits.tolist()

[[-0.042868416756391525, 0.15579353272914886],
 [-0.050198763608932495, 0.1516065001487732],
 [-0.0002577579580247402, 0.15483489632606506],
 [-0.00658339262008667, 0.136440709233284],
 [-0.031188856810331345, 0.14158132672309875]]

In [2]:
from selection_utils import read_training_dynamics

In [3]:
# The resulting `training_dynamics` is a dict keyed by sample id.
# For example, training_dynamics[0] looks like:
# {'gold': 0,
#  'logits': [[1.4952679872512817, -1.5543408393859863],
#   [1.5071253776550293, -1.2763938903808594],
#   [1.8439220190048218, -1.7771254777908325]]}
#
# NOTE(review): this path says 'distilbert-base-cased', while the checkpoint named near the
# top of the notebook is 'distilbert-base-uncased' — confirm which one is intended.
training_dynamics = read_training_dynamics('dy_log/sst2/distilbert-base-cased')

2022-07-05 15:22:48,969 - INFO - selection_utils - Reading 10 files from dy_log/sst2/distilbert-base-cased/training_dynamics ...
  0%|          | 0/10 [00:00<?, ?it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_0.jsonl


 20%|██        | 2/10 [00:00<00:03,  2.15it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_1.jsonl
*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_2.jsonl


 30%|███       | 3/10 [00:01<00:03,  2.23it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_3.jsonl


 40%|████      | 4/10 [00:01<00:02,  2.42it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_4.jsonl


 50%|█████     | 5/10 [00:02<00:02,  2.29it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_5.jsonl


 60%|██████    | 6/10 [00:02<00:01,  2.45it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_6.jsonl


 70%|███████   | 7/10 [00:03<00:01,  2.23it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_7.jsonl


 80%|████████  | 8/10 [00:03<00:00,  2.35it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_8.jsonl


 90%|█████████ | 9/10 [00:03<00:00,  2.41it/s]

*** Current Reading: dy_log/sst2/distilbert-base-cased/training_dynamics/dynamics_epoch_9.jsonl


100%|██████████| 10/10 [00:04<00:00,  2.26it/s]
2022-07-05 15:22:53,410 - INFO - selection_utils - Read training dynamics for 67349 train instances.
