"""
todo: wish list but not essential. I wish I could have passed the data set as a streaming data set, to avoid tokenizing all the
data set before doing anything with it. e.g., the data sets might be huge, especially for training/fine-tuning.
GPT4 suggested this: https://chat.openai.com/share/495de296-71c2-4f5e-83e2-3b22d038e8bc which seems reasonable.
It also would have made all the data set interfaces consistent in training vs computing data set metrics.
"""
import time
from pathlib import Path
import datetime
import json
from torch import nn
import numpy as np
import random
from datasets import load_dataset
from datasets import interleave_datasets
from diversity.task2vec import Task2Vec
import diversity.task_similarity as task_similarity
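
# A minimal, hedged sketch of the streaming wish in the module docstring: HF
# streaming datasets apply `.map` lazily, so examples are tokenized only as
# batches are consumed instead of tokenizing the whole dataset up front. The
# dataset path/name here are illustrative and this helper is not used below.
def lazy_tokenized_stream(tokenizer, path: str = 'c4', name: str = 'en', max_length: int = 128):
    def _preprocess(examples):
        return tokenizer(examples["text"], padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    dataset = load_dataset(path, name, streaming=True, split="train").with_format("torch")
    return dataset.map(_preprocess, batched=True)
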
def get_diversity_coefficient(dataset,
                              map: callable,  # to ease passing whatever args you want to batch.map for any dataset
probe_network: nn.Module,
tokenizer = None,
batch_size: int = 512,
num_batches: int = 600,
seed: int = 0,
buffer_size: int = 500_000,
distance = 'cosine',
verbose: bool = False,
debug: bool = False,
shuffle: bool = True, # False for faster debugging/testing but it won't be shuffled
) -> dict:
"""
Compute the diversity coefficient of a dataset using a probe network.
Return all results in a dictionary since it's often useful to store them to avoid recomputing them.
If you want the diveristy coefficient and it's confidence interval (ci), use the following:
div_coeff, div_coeff_ci = results['div_coeff'], results['div_coeff_ci']
"""
print(f'{shuffle=}')
    if num_batches < 3:
        print(f'Warning: num_batches should be >= 3, but got {num_batches=}; otherwise you only get one comparison, so a single distance value')
# - Compute embeddings
embeddings, losses = [], []
for batch_num in range(num_batches):
print(f'--> {batch_num=}\n')
# - Get batch
shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed) if shuffle else dataset
raw_text_batch = shuffled_dataset.take(batch_size)
# raw_text_batch = shuffled_dataset.take(batch_size) if streaming else shuffled_dataset.select(range(batch_size))
tokenized_batch = map(raw_text_batch)
if verbose:
print(f'{raw_text_batch=}')
print(f'{tokenized_batch=}')
# time_start = time.time()
# print(f'{next(iter(raw_text_batch))=}')
# print(f'{next(iter(tokenized_batch))=}')
# print(f'Time it took: {time.time() - time_start} seconds \a\n')
# - Get Task2Vec embedding for batch
if not debug:
embedding, loss = Task2Vec(probe_network, classifier_opts={'seed': seed}).embed(tokenized_batch)
else:
embedding, loss = Task2Vec(probe_network, classifier_opts={'break_early': True, 'seed': seed}).embed(tokenized_batch, epochs=1) # only for debugging
print(f'{loss=}\n{embedding=}\n') if verbose else None
# - Collect results
embeddings.append(embedding)
losses.append(loss)
# - Compute diversity coefficient
distance_matrix = task_similarity.pdist(embeddings, distance=distance)
div_coeff, div_coeff_ci = task_similarity.stats_of_distance_matrix(distance_matrix)
# -- Return results
results: dict = {'div_coeff': div_coeff, 'div_coeff_ci': div_coeff_ci,
'embeddings': embeddings,
'distance_matrix': distance_matrix,
'losses': losses,
"num_batches": num_batches}
return results
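
# Hedged usage sketch for get_diversity_coefficient (illustrative; it mirrors
# the tests below, which use a GPT-2 probe network on a streaming dataset):
#   dataset = load_dataset('c4', 'en', streaming=True, split='train').with_format('torch')
#   results = get_diversity_coefficient(dataset, my_map, probe_network, num_batches=3)
#   div_coeff, div_coeff_ci = results['div_coeff'], results['div_coeff_ci']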
def cross_diversity_coefficient(dataset_target,
dataset_source,
map_target: callable,
map_source: callable,
probe_network: nn.Module,
tokenizer = None,
batch_size: int = 512,
num_batches: int = 100,
seed = 0,
buffer_size: int = 500_000,
distance = 'cosine',
verbose: bool = False,
debug: bool = False,
shuffle: bool = True, # False for faster debugging/testing but it won't be shuffled
) -> dict:
""" """
# - Compute embedding of target
losses: list[dict] = []
embeddings: list[dict] = []
cross_distances = []
for batch_num in range(num_batches):
# - Get target shuffled data
shuffled_dataset = dataset_target.shuffle(buffer_size=buffer_size, seed=seed) if shuffle else dataset_target
raw_text_batch = shuffled_dataset.take(batch_size)
tokenized_batch = map_target(raw_text_batch)
# - Get Task2Vec embedding for batch
if not debug:
embedding_target, loss_target = Task2Vec(probe_network, classifier_opts={'seed': seed}).embed(tokenized_batch)
else:
embedding_target, loss_target = Task2Vec(probe_network, classifier_opts={'break_early': True, 'seed': seed}).embed(tokenized_batch, epochs=1) # only for debugging
print(f'{loss_target=}\n{embedding_target=}\n') if verbose else None
# - Get source shuffled data
shuffled_dataset = dataset_source.shuffle(buffer_size=buffer_size, seed=seed) if shuffle else dataset_source
raw_text_batch = shuffled_dataset.take(batch_size)
        tokenized_batch = map_source(raw_text_batch)
# - Get Task2Vec embedding for batch
if not debug:
embedding_source, loss_source = Task2Vec(probe_network, classifier_opts={'seed': seed}).embed(tokenized_batch)
else:
embedding_source, loss_source = Task2Vec(probe_network, classifier_opts={'break_early': True, 'seed': seed}).embed(tokenized_batch, epochs=1) # only for debugging
print(f'{loss_source=}\n{embedding_source=}\n') if verbose else None
# - Append results to save later
embeddings.append({'embedding_target': embedding_target, 'embedding_source': embedding_source})
losses.append({'loss_target': loss_target, 'loss_source': loss_source})
# - Compute cross diversity coefficient
embeddings_target = [embedding['embedding_target'] for embedding in embeddings]
embeddings_source = [embedding['embedding_source'] for embedding in embeddings]
cross_distance_matrix = task_similarity.cross_pdist(embeddings_target, embeddings_source, distance=distance)
cross_div_coeff, cross_div_coeff_ci = task_similarity.stats_cross_distance_matrix(cross_distance_matrix)
# -- Return results
results: dict = {'cross_div_coeff': cross_div_coeff, 'cross_div_coeff_ci': cross_div_coeff_ci,
'embeddings': embeddings,
                     'distance_matrix': cross_distance_matrix,
'losses': losses,
"num_batches": num_batches}
return results
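
# Hedged usage sketch for cross_diversity_coefficient (illustrative; see
# cross_div_test below): passing the same dataset as both target and source
# should give a smaller cross diversity than two different datasets.
#   results = cross_diversity_coefficient(dataset_target, dataset_source, map, map, probe_network)
#   cross_div_coeff, cross_div_coeff_ci = results['cross_div_coeff'], results['cross_div_coeff_ci']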
def get_tokenized_batch(batch):
return next(iter(batch))
def preprocess(examples):
    # NOTE: relies on a module-level `tokenizer` being defined by the caller before use.
    return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def my_map(batch):
return batch.map(preprocess, batched=True, remove_columns=[])
def print_examples_from_dataset(dataset, preprocess=preprocess, map=my_map, batch_size=100):
    print('\n---- Examples from dataset ----')
    batch = dataset.take(batch_size)
    # batch = map(batch)
    for example in list(batch):
        print(f'{example["text"]=}')
# -- Tests, Examples
def test_get_batch_from_dataset():
from transformers import GPT2Tokenizer, GPT2LMHeadModel
max_seq_length = 128
batch_size = 512
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
# -- Get batch from dataset
# path, name = 'c4', 'en'
path, name = "wikitext", 'wikitext-103-v1'
dataset = load_dataset(path, name, streaming=True, split="train").with_format("torch")
batch = dataset.take(batch_size)
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=max_seq_length, truncation=True, return_tensors="pt")
print(f'{batch=}')
print(f'{next(iter(batch))=}')
remove_columns = ["text", "timestamp", "url"] if path == "c4" else []
tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
print(f'{tokenized_batch=}')
print(f'{next(iter(tokenized_batch))=}')
# Running /lfs/ampere1/0/brando9/beyond-scale-language-data-diversity/src/diversity/div_coeff.py
# batch=<datasets.iterable_dataset.IterableDataset object at 0x7ff29031fd60>
# tokenized_batch=<datasets.iterable_dataset.IterableDataset object at 0x7ff2901889d0>
# Time it took: 0.9143445491790771 seconds
def alycias_original_colab_code():
""" https://colab.research.google.com/drive/1pL1JmE2LuRg5ClsZ7htFyzEAJ7iMEcys#scrollTo=aRI6a_27Tzd0 """
# -- Get probe network
from diversity.task2vec import Task2Vec
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get data set & data batch (as a data set obj)
dataset = load_dataset("c4", "en", streaming=True, split="train").with_format("torch")
remove_columns = ["text", "timestamp", "url"]
max_seq_length = 128
batch_size = 512
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=max_seq_length, truncation=True, return_tensors="pt")
batch = dataset.take(batch_size)
tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
# embedding, loss = Task2Vec(probe_network).embed(tokenized_batch)
# embedding, loss = Task2Vec(probe_network).embed(tokenized_batch, epochs=1)
from diversity.task_similarity import pdist, plot_distance_matrix, \
stats_of_distance_matrix
    num_batches = 3
    if num_batches < 3:
        raise ValueError(f'num_batches must be >= 3, but got {num_batches=}; otherwise you only get one comparison, so a single distance value')
buffer_size = 500_000
seed = 0
embeddings = []
for batch_num in range(num_batches):
print(f'--> {batch_num=}\n')
shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed)
batch = shuffled_dataset.take(batch_size)
tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
# embeddings.append(Task2Vec(probe_network).embed(tokenized_batch)[0])
# embeddings.append(Task2Vec(probe_network).embed(tokenized_batch, epochs=1)[0]) # only for debugging
embeddings.append(Task2Vec(probe_network, classifier_opts={'break_early': True}).embed(tokenized_batch, epochs=1)[0]) # only for debugging
print(f'{len(embeddings)=}')
distance_matrix = pdist(embeddings, distance='cosine')
print(f'{distance_matrix=}')
div_coeff, conf_interval = stats_of_distance_matrix(distance_matrix)
print(f'Diversity: {(div_coeff, conf_interval)=}')
plot_distance_matrix(embeddings)
import numpy as np
from pathlib import Path
output_dir = Path('./').expanduser()
np.save(output_dir / 'distance_matrix.npy', distance_matrix)
    results: dict = {'embeddings': embeddings,
'distance_matrix': distance_matrix,
"num_batches": num_batches}
np.save(output_dir / 'results.npy', results)
def test_diversity_coefficient():
print(f'Running function: {test_diversity_coefficient=}')
batch_size = 512
# -- Get probe network
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get data set
dataset = load_dataset("c4", "en", streaming=True, split="train").with_format("torch")
remove_columns = ["text", "timestamp", "url"]
print(f'{dataset=}')
batch = dataset.take(batch_size)
print(f'{next(iter(batch))=}')
# - Prepare functions to tokenize batch
time_start = time.time()
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):
return batch.map(preprocess, batched=True, remove_columns=remove_columns)
    tokenized_batch = map(batch)
print(f'{next(iter(tokenized_batch))=}')
print(f'Time it took: {time.time() - time_start} seconds \a\n')
from torch.utils.data import Dataset, DataLoader
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
for i, d in enumerate(dataset):
assert isinstance(d, dict)
# dd = dataset[i]
# assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {}
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
num_workers=loader_opts.get('num_workers', 0), drop_last=False)
print(f'{next(iter(data_loader))=}')
# -- Compute diversity coefficient
# results: dict = get_diversity_coefficient(dataset, map, probe_network)
# results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=3) # only for debugging
results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=3, verbose=True, debug=True) # only for debugging
div_coeff, div_coeff_ci = results['div_coeff'], results['div_coeff_ci']
print(f'{div_coeff=} {div_coeff_ci=}')
# -- Save results or not
save_results = True
if save_results:
today = datetime.datetime.now().strftime('%Y-m%m-d%d-t%H_%M')
output_dir = Path(f'~/data/div_coeff/{today}').expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
np.save(output_dir / f'distance_matrix{today}.npy', results['distance_matrix'])
np.save(output_dir / f'results{today}.npy', results)
# Save results as a pretty-printed JSON
results = {key: str(value) for key, value in results.items()}
with open(output_dir / f'results{today}.json', 'w') as f:
json.dump(results, f, indent=4)
def test_interleaved_data_set_2_data_loader():
""" https://colab.research.google.com/drive/1QWDhA6Q64qijXYnwIGn63Aq9Eg5qt8tQ#scrollTo=Wjyy6QYimvIm """
remove_columns = []
# -- Get probe network
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get data set
from datasets import interleave_datasets
path, name = ['c4', 'wikitext'], ['en', 'wikitext-103-v1']
probabilities = [1.0/len(path)] * len(path)
batch_size = 512
datasets = [load_dataset(path, name, streaming=True, split="train").with_format("torch") for path, name in zip(path, name)]
[print(f'{dataset.description=}') for dataset in datasets]
dataset = interleave_datasets(datasets, probabilities)
print(f'{dataset=}')
batch = dataset.take(batch_size)
column_names = next(iter(batch)).keys()
print(f'{column_names=}')
# - Prepare functions to tokenize batch
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
remove_columns = column_names
print(f'{remove_columns=}')
def map(batch):
return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(batch)
print(f'{next(iter(tokenized_batch))=}')
# -- Get data loader
from torch.utils.data import DataLoader, Dataset
data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False)
batch = next(iter(data_loader))
print(f'{batch=}')
# - test 2
batch = dataset.take(batch_size)
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):
return batch.map(preprocess, batched=True, remove_columns=[])
tokenized_batch = map(batch)
print(f'{next(iter(tokenized_batch))=}')
def collate_tokenize(data):
text_batch = [element["text"] for element in data]
tokenized = tokenizer(text_batch, padding='longest', max_length=128, truncation=True, return_tensors='pt')
return tokenized
data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False, collate_fn=collate_tokenize)
batch = next(iter(data_loader))
print(f'{batch=}')
print('Done!\a')
def cross_div_test():
""" """
batch_size = 512
# -- Get probe network
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get data set
path, name = 'c4', 'en'
dataset_target = load_dataset(path, name, streaming=True, split="train").with_format("torch")
raw_text_batch = dataset_target.take(batch_size)
print(f'{next(iter(raw_text_batch))=}')
path, name = "wikitext", 'wikitext-103-v1'
dataset_source = load_dataset(path, name, streaming=True, split="train").with_format("torch")
raw_text_batch = dataset_source.take(batch_size)
print(f'{next(iter(raw_text_batch))=}')
column_names = next(iter(raw_text_batch)).keys()
print(f'{column_names=}')
# - Prepare functions to tokenize batch
remove_columns = column_names # remove all text columns so tensors in default collate_fn don't crash
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):
return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(raw_text_batch)
print(f'{next(iter(tokenized_batch))=}')
# -- Compute diversity coefficient
    results: dict = cross_diversity_coefficient(dataset_target, dataset_target, map, map, probe_network, num_batches=2, verbose=True, debug=True, shuffle=False)  # deliberately target vs. target (same dataset) as a sanity check; only for debugging
cross_div_coeff, cross_div_coeff_ci = results['cross_div_coeff'], results['cross_div_coeff_ci']
print(f'{cross_div_coeff=} {cross_div_coeff_ci=}')
same_dataset_results = results
results: dict = cross_diversity_coefficient(dataset_target, dataset_source, map, map, probe_network, num_batches=2, verbose=True, debug=True, shuffle=False) # only for debugging
cross_div_coeff, cross_div_coeff_ci = results['cross_div_coeff'], results['cross_div_coeff_ci']
print(f'{cross_div_coeff=} {cross_div_coeff_ci=}')
different_dataset_results = results
    print('Test: same-dataset cross diversity should be smaller than the different-dataset value')
print(f'{same_dataset_results=}')
print(f'{different_dataset_results=}')
# -- Save results or not
save_results = True
if save_results:
today = datetime.datetime.now().strftime('%Y-m%m-d%d-t%H_%M')
output_dir = Path(f'~/data/div_coeff/{today}').expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
np.save(output_dir / f'distance_matrix{today}.npy', results['distance_matrix'])
np.save(output_dir / f'results{today}.npy', results)
# Save results as a pretty-printed JSON
results = {key: str(value) for key, value in results.items()}
with open(output_dir / f'results{today}.json', 'w') as f:
json.dump(results, f, indent=4)
def test_eos_pad():
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
raw_text_batch = 'a'
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
print(f'{tokenizer.eos_token=}')
print(f'{tokenizer.eos_token_id=}')
print(f'{tokenizer.pad_token=}')
print(f'{tokenizer.pad_token_id=}')
print(f'{raw_text_batch=}')
tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
print(f'{tokenize_batch=}')
print('Done')
# -- Experiments
def experiment_compute_diversity_coeff_single_dataset_then_combined_datasets_with_domain_weights():
"""
Get divs using pt ft, pt (rand, rand ft?)
- div c4
- div wt = wt-103
Then with unioned datasets
- div c4+wt, uniform [0.5, 0.5]
- # div c4+wt, data set size proportions (using GBs)
- div c4+wt, respect doremi
- div c4+wt, respect the pile
- div c4+wt, respect gpt3 weights
then repeat all with pt (no ft)
"""
import random
from diversity.data_mixtures import get_uniform_data_mixture_for_c4_wt103, get_doremi_based_data_mixture_for_c4_wt103, get_llama_v1_based_data_mixture_for_c4_wt103
probabilities = []
data_mixture_name = None
streaming = True
data_files = [None]
seed = 0
split = 'train'
# token = open(Path('~/data/hf_token.txt').expanduser()).read().strip() # put to load_dataset( ..., token=token)
# -- Setup wandb
import wandb
# - Dryrun
# mode = 'dryrun'; num_batches = 3
mode = 'dryrun'; num_batches = 3; seed = random.randint(0, 2**32 - 1)
# - Online (real experiment)
# mode='online'; num_batches = 600; seed = random.randint(0, 2**32 - 1)
# - c4 wt single
path, name = 'c4', 'en'
# path, name = "wikitext", 'wikitext-103-v1'
# path, name = 'Skylion007/openwebtext', None
# path, name = 'togethercomputer/RedPajama-Data-1T', 'default' # https://github.com/togethercomputer/RedPajama-Data/issues/70, https://github.com/togethercomputer/RedPajama-Data
# https://huggingface.co/datasets/cerebras/SlimPajama-627B
# - c4 wt mix
# path, name, data_files = ['c4', 'wikitext'], ['en', 'wikitext-103-v1'], [None, None]
# probabilities, data_mixture_name = get_uniform_data_mixture_for_c4_wt103()
# probabilities, data_mixture_name = get_doremi_based_data_mixture_for_c4_wt103()
# probabilities, data_mixture_name = get_llama_v1_based_data_mixture_for_c4_wt103()
# probabilities, data_mixture_name = [0.75, 0.25], '[0.75, 0.25]'
# probabilities, data_mixture_name = [0.25, 0.75], '[0.25, 0.75]'
# - pile, pile cc single
# path, name = 'EleutherAI/pile', 'all'
# path, name = 'conceptofmind/pile_cc', 'sep_ds'
# - 5 subsets of pile using hf data set viewer (parquet))
# from diversity.pile_subset_urls import urls_hacker_news, urls_nih_exporter, urls_pubmed, urls_uspto
# path, name, data_files = 'conceptofmind/pile_cc', 'sep_ds', [None]
# path, name, data_files, split = 'suolyer/pile_pile-cc', None, [None], 'validation' # https://huggingface.co/datasets/suolyer/pile_pile-cc
# path, name, data_files, split = 'brando/pile_cc', None, [None], 'validation' # https://huggingface.co/datasets/brando/pile_cc/tree/main
# path, name, data_files = 'parquet', 'hacker_news', urls_hacker_news
# path, name, data_files = 'parquet', 'nih_exporter', urls_nih_exporter
# path, name, data_files = 'parquet', 'pubmed', urls_pubmed
# path, name, data_files = 'parquet', 'uspto', urls_uspto
# - 5 subsets of the pile interleaved
# from diversity.pile_subset_urls import urls_hacker_news, urls_nih_exporter, urls_pubmed, urls_uspto
# from diversity.data_mixtures import get_uniform_data_mixture_5subsets_of_pile, get_doremi_data_mixture_5subsets_of_pile, get_llama_v1_data_mixtures_5subsets_of_pile
# path, name, data_files, split = ['suolyer/pile_pile-cc'] + ['parquet'] * 4, [None] + ['hacker_news', 'nih_exporter', 'pubmed', 'uspto'], [None] + [urls_hacker_news, urls_nih_exporter, urls_pubmed, urls_uspto], ['validation'] + ['train'] * 4
# ## path, name, data_files = ['conceptofmind/pile_cc'] + ['parquet'] * 4, ['sep_ds'] + ['hacker_news', 'nih_exporter', 'pubmed', 'uspto'], [None] + [urls_hacker_news, urls_nih_exporter, urls_pubmed, urls_uspto]
# # probabilities, data_mixture_name = get_uniform_data_mixture_5subsets_of_pile()
# # probabilities, data_mixture_name = get_llama_v1_data_mixtures_5subsets_of_pile(name)
# probabilities, data_mixture_name = get_doremi_data_mixture_5subsets_of_pile(name)
# - not changing
batch_size = 512
today = datetime.datetime.now().strftime('%Y-m%m-d%d-t%Hh_%Mm_%Ss')
run_name = f'{path} div_coeff_{num_batches=} ({today=} ({name=}) {data_mixture_name=} {probabilities=})'
print(f'\n---> {run_name=}\n')
# - Init wandb
debug: bool = mode == 'dryrun'
run = wandb.init(mode=mode, project="beyond-scale", name=run_name, save_code=True)
wandb.config.update({"num_batches": num_batches, "path": path, "name": name, "today": today, 'probabilities': probabilities, 'batch_size': batch_size, 'debug': debug, 'data_mixture_name': data_mixture_name, 'streaming': streaming, 'data_files': data_files, 'seed': seed})
# run.notify_on_failure() # https://community.wandb.ai/t/how-do-i-set-the-wandb-alert-programatically-for-my-current-run/4891
print(f'{debug=}')
print(f'{wandb.config=}')
# -- Get probe network
from datasets import load_dataset
from datasets.iterable_dataset import IterableDataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# if tokenizer.pad_token_id is None:
# tokenizer.pad_token = tokenizer.eos_token
# probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
# device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
# probe_network = probe_network.to(device)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
    # torch_dtype = torch.bfloat16
    torch_dtype = torch.float32
pretrained_model_name_or_path = 'meta-llama/Llama-2-7b-hf'
    bf16 = torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8  # compute capability >= 8 ==> bfloat16 available (currently unused; set torch_dtype above accordingly)
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path,
# quantization_config=quantization_config,
# device_map=device_map, # device_map = None https://github.com/huggingface/trl/blob/01c4a35928f41ba25b1d0032a085519b8065c843/examples/scripts/sft_trainer.py#L82
trust_remote_code=True,
torch_dtype=torch_dtype,
use_auth_token=True,
)
print(f'{pretrained_model_name_or_path=}')
# # https://github.com/artidoro/qlora/blob/7f4e95a68dc076bea9b3a413d2b512eca6d004e5/qlora.py#L347C13-L347C13
# tokenizer = AutoTokenizer.from_pretrained(
# pretrained_model_name_or_path,
# # cache_dir=args.cache_dir,
# padding_side="right",
# use_fast=False, # Fast tokenizer giving issues.
# # tokenizer_type='llama' if 'llama' in args.model_name_or_path else None, # Needed for HF name change
# # tokenizer_type='llama',
# trust_remote_code=True,
# use_auth_token=True,
# )
# from transformers import AutoTokenizer
# import transformers
# import torch
# model = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(model)
# tokenizer = AutoTokenizer.from_pretrained(model, torch_dtype=torch_dtype)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_auth_token=True)
# pipeline = transformers.pipeline(
# "text-generation",
# model=model,
# torch_dtype=torch.float16,
# device_map="auto",
# )
# -- Get data set
    def my_load_dataset(path, name, data_files=data_files, split=split):
        print(f'{path=} {name=} {streaming=} {data_files=}, {split=}')
        if path == 'json' or path == 'bin' or path == 'csv':
            # NOTE: data_files_prefix must be defined in the enclosing scope for this branch.
            print(f'{data_files_prefix + name=}')
            return load_dataset(path, data_files=data_files_prefix + name, streaming=streaming, split=split).with_format("torch")
        elif path == 'parquet':
            print(f'{data_files=}')
            return load_dataset(path, data_files=data_files, streaming=streaming, split=split).with_format("torch")
        else:
            # covers 'pile_cc'/'pile-cc' paths and all other hub datasets
            return load_dataset(path, name, streaming=streaming, split=split).with_format("torch")
# - get data set for real now
if isinstance(path, str):
dataset = my_load_dataset(path, name, data_files)
else:
# -Interleaving datasets
print('- Interleaving datasets')
        datasets = [my_load_dataset(p, n, dfs, split=s) for p, n, dfs, s in zip(path, name, data_files, split)]
# datasets = [my_load_dataset(path, name).with_format("torch") for path, name in zip(path, name)]
        if any('parquet' == p for p in path):  # needed for the parquet subsets: deleting all non-text columns lets interleave work here, though the same trick fails with c4 & wikitext, see https://discuss.huggingface.co/t/why-does-deleting-the-columns-before-giving-it-to-interleave-work-but-sometimes-it-does-not-work/50879
dataset_descriptions = [dataset.description for dataset in datasets] # print description if available
print(f'{dataset_descriptions=}')
# - make sure all datasets have the same columns to avoid interleave to complain
# all_columns = [col for dataset in datasets for col in dataset.column_names]
# hack due to inconsistent datasets
            all_columns = []
            for dataset in datasets:
                if dataset.column_names is None:
                    # streaming pile_cc doesn't declare its columns; assume the standard ('text', 'meta') fields
                    all_columns.extend(['text', 'meta'])
                    assert 'pile_cc' in dataset._info.dataset_name or 'pile-cc' in dataset._info.dataset_name, f'Err: {dataset._info.dataset_name=} {dataset.column_names=} {dataset._info.features=} {dataset._info.splits=} {dataset._info.description=}'
                    continue
                for col in dataset.column_names:
                    all_columns.append(col)
print(f'{all_columns=}')
# columns_to_remove = [col for dataset in datasets for col in dataset.column_names if col != 'text']
columns_to_remove = [col for col in all_columns if col != 'text']
columns_to_remove = list(set(columns_to_remove)) # remove duplicates
print(f'{columns_to_remove=}')
datasets = [dataset.remove_columns(columns_to_remove) for dataset in datasets]
# - interleave
print(f'{probabilities=}')
dataset_descriptions = [dataset.description for dataset in datasets] # print description if available
print(f'{dataset_descriptions=}')
dataset = interleave_datasets(datasets, probabilities)
# dataset = dataset.remove_columns(columns_to_remove)
print(f'{dataset=}')
print(f'{dataset.column_names=}')
print(f'{dataset=}')
print(f'{type(dataset)=}')
print(f'{split=}')
# datasets.iterable_dataset.IterableDataset
# datasets.arrow_dataset.Dataset
# dataset = IterableDataset(dataset) if type(dataset) != IterableDataset else dataset # to force dataset.take(batch_size) to work in non-streaming mode
raw_text_batch = dataset.take(batch_size) if streaming else dataset.select(range(batch_size))
print(f'{raw_text_batch=}')
print(f'{next(iter(raw_text_batch))=}')
column_names = next(iter(raw_text_batch)).keys()
print(f'{column_names=}')
# - Prepare functions to tokenize batch
def preprocess(examples):
return tokenizer(examples["text"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
remove_columns = column_names # remove all keys that are not tensors to avoid bugs in collate function in task2vec's pytorch data loader
def map(batch):
return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(raw_text_batch)
print(f'{next(iter(tokenized_batch))=}')
# -- Compute diversity coefficient
    print('-- Compute diversity coefficient')
print(f'{seed=}, {streaming=}')
# - Debug run
# results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=3, seed=seed, debug=True, shuffle=False) # (quick debug) hardcoded for debugging
# results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=3, seed=seed, debug=True, shuffle=True) # (slow debug) hardcoded for debugging
# results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=3, seed=seed, debug=False, shuffle=False) # (real) hardcoded for debugging
# - Real run
# assert not debug, f'Err: {debug=} for real run'
results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=num_batches, seed=seed, debug=debug, shuffle=False)
# results: dict = get_diversity_coefficient(dataset, map, probe_network, num_batches=num_batches, seed=seed, debug=debug, shuffle=True)
# - Log results
div_coeff, div_coeff_ci = results['div_coeff'], results['div_coeff_ci']
print(f'{div_coeff=} {div_coeff_ci=}')
wandb.log({'div_coeff': div_coeff, 'div_coeff_ci': div_coeff_ci})
# -- Save results or not
save_results = True
if save_results:
output_dir = Path(f'~/data/div_coeff/{today}').expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
np.save(output_dir / f'distance_matrix{today}.npy', results['distance_matrix'])
np.save(output_dir / f'results{today}.npy', results)
# Save results as a pretty-printed JSON
results = {key: str(value) for key, value in results.items()}
with open(output_dir / f'results{today}.json', 'w') as f:
json.dump(results, f, indent=4)
# - wandb save
base_path = str(output_dir.parent)
wandb.save(str(output_dir / f'distance_matrix{today}.npy'), base_path=base_path)
wandb.save(str(output_dir / f'results{today}.npy'), base_path=base_path)
wandb.save(str(output_dir / f'results{today}.json'), base_path=base_path)
wandb.save(__file__)
# -- Finish wandb
wandb.finish()
if __name__ == '__main__':
print(f'\n\n\n------------------- Running {__file__} -------------------')
    # -- Run tests and time it
    time_start = time.time()
# -- Run tests
# test_get_batch_from_dataset()
# alycias_original_colab_code()
# test_diversity_coefficient()
# cross_div_test()
# test_interleaved_data_set_2_data_loader()
test_eos_pad()
    experiment_compute_diversity_coeff_single_dataset_then_combined_datasets_with_domain_weights()
    # -- End tests; report how long it took in seconds, minutes, hours, days
    elapsed = time.time() - time_start
    print(f'Time it took to run {__file__}: {elapsed} seconds, {elapsed / 60} minutes, {elapsed / 3600} hours, {elapsed / 3600 / 24} days\a')