In [1]:
from benchmark import HFDatasets, show, Benchmark

# benchmark 모듈 사용 예제

+ `benchmark.py`
    + `Benchmark`: Benchmark config를 불러오고, 샘플링, 저장 등의 작업을 한다
    + `HFDatasets`: `hfdatasets.yaml`에 정의된 벤치마크 데이터셋을 `datasets.load_dataset`로 불러오고, 폴더 트리 등을 만든다
    + `show`: dict 등을 indent와 색상을 적용해 보기 좋게 출력한다
    + `GithubReader`: GitHub 저장소에 있는 jsonl 파일을 읽어온다

In [20]:
from benchmark import HFDatasets, Benchmark, show
from textwrap import shorten

## HFDatasets

In [8]:
hf_conf = HFDatasets() # create an instance; per the help output, `path` defaults to 'hfdatasets.yaml'

In [23]:
# help() prints the documentation itself and returns None, so wrapping it in
# str()/shorten() only produced the stray value 'None' without truncating
# anything. Call help() directly to show the class documentation.
help(hf_conf)

Help on HFDatasets in module benchmark object:

class HFDatasets(builtins.object)
 |  HFDatasets(path: str = 'hfdatasets.yaml')
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path: str = 'hfdatasets.yaml')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  get_all_names(self)
 |      yaml내 벤치마크들의 전체 이름 출력
 |  
 |  get_all_values(self, key: str)
 |      벤치마크들의 key로 전달된 값의 value들만 출력
 |  
 |  get_benchmark(self, name: str)
 |      벤치마크의 전체 정보 출력
 |  
 |  make_folder_tree(self, key: Union[list, str] = None, overwrite: bool = False)
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  all_names
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references

'None'

In [14]:
hf_conf.get_all_names() # list the names of every benchmark defined in the yaml

['mmlu',
 'mmlu_pro',
 'agi_eval',
 'commonsense_qa',
 'winogrande',
 'bigbenchhard',
 'arc',
 'trivia_qa']

In [16]:
hf_conf.get_all_values("url") # fetch only the given key ("url" here) for every benchmark

{'mmlu': 'https://huggingface.co/datasets/cais/mmlu',
 'mmlu_pro': 'https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro',
 'agi_eval': 'https://huggingface.co/hails',
 'commonsense_qa': 'https://huggingface.co/datasets/tau/commonsense_qa',
 'winogrande': 'https://huggingface.co/datasets/allenai/winogrande',
 'bigbenchhard': 'https://huggingface.co/datasets/maveriq/bigbenchhard',
 'arc': 'https://huggingface.co/datasets/allenai/ai2_arc',
 'trivia_qa': 'https://huggingface.co/datasets/mandarjoshi/trivia_qa'}

In [18]:
hf_conf.get_benchmark("mmlu") # show all information recorded for one benchmark

{'path': 'cais/mmlu',
 'name': 'all',
 'url': 'https://huggingface.co/datasets/cais/mmlu',
 'paper': 'https://arxiv.org/pdf/2009.03300',
 'annotation': None}

In [27]:
shorten(str(hf_conf.config), 200) # config of all benchmarks, truncated to 200 characters for display

"{'mmlu': {'path': 'cais/mmlu', 'name': 'all', 'url': 'https://huggingface.co/datasets/cais/mmlu', 'paper': 'https://arxiv.org/pdf/2009.03300', 'annotation': None}, 'mmlu_pro': {'path': 'TIGER- [...]"

In [29]:
show(hf_conf.get_benchmark("mmlu")) # show() prints the same dict with indentation and color

[38;5;249m{[39m
[38;5;249m    [39m[38;5;168m"path"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"cais/mmlu"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"name"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"all"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"url"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"https://huggingface.co/datasets/cais/mmlu"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"paper"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"https://arxiv.org/pdf/2009.03300"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"annotation"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;180mnull[39m
[38;5;249m}[39m



In [31]:
# Creates the tasks/<benchmark_name> folder and generates the file
# tasks/<benchmark_name>/README.md from README_TEMPLATE in template.py.
# With overwrite=True the log below reports each README.md being (re)initialized.
hf_conf.make_folder_tree(overwrite = True)

[32m2024-09-07 20:44:54.361[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/mmlu/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.362[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/mmlu_pro/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.364[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/agi_eval/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.365[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/commonsense_qa/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.366[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/winogrande/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.368[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mmake_folder_tree[0m:[36m245[0m - [1mtasks/bigbenchhard/README.md을 초기화합니다[0m
[32m2024-09-07 20:44:54.369[0m | [1mINFO    [0m | [36mben

## Benchmark

In [33]:
# Print the class documentation for Benchmark (constructor arguments and methods).
help(Benchmark)

Help on class Benchmark in module benchmark:

class Benchmark(builtins.object)
 |  Benchmark(benchmark_name: str, path: str = None, name: str = None, num_proc: int = 6, dataset=None, dataset_option: dict = None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, benchmark_name: str, path: str = None, name: str = None, num_proc: int = 6, dataset=None, dataset_option: dict = None)
 |      Benchmark를 EDA하는 클래스
 |      
 |      Args:
 |          path: dataset의 path(hf)
 |          name: config로 전달되는 path값, ARC-Challenge, ARC-easy, logiqa-en ... 등
 |          num_proc: 작업 프로세스 수
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  sample(self, split: str = None, dataset=None, category: str = None)
 |      dataset에서 sampling하기, 카테고리가 있으면 카테고리마다 1개씩 샘플링함
 |      Args:
 |          dataset: 샘플링할 데이터셋
 |          category: 구분하려는 카테고리
 |  
 |  save(self, split: str = None, samples: list = None, path: str = None, category: str = None)
 |      samples를 저장함
 |      Args:
 |          split

In [32]:
benchmark = Benchmark("mmlu", name = "all") # create a Benchmark instance for "mmlu" with config name "all"

Using the latest cached version of the dataset since cais/mmlu couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'all' at /Users/user/.cache/huggingface/datasets/cais___mmlu/all/0.0.0/c30699e8356da336a370243923dbaf21066bb9fe (last modified on Tue Sep  3 10:42:30 2024).
[32m2024-09-07 20:45:51.751[0m | [1mINFO    [0m | [36mbenchmark[0m:[36m__init__[0m:[36m74[0m - [1m
path: cais/mmlu
name: all
total_split: ['test', 'validation', 'dev', 'auxiliary_train']
prior_split: test
test: 14042
validation: 1531
dev: 285
auxiliary_train: 99842
[0m


In [35]:
benchmark.show() # display sample data

[38;5;249m{[39m
[38;5;249m    [39m[38;5;168m"question"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"This question refers to the following information.\n\"As a means of effecting this end I suggest for your consideration the propriety of setting apart an ample district west of the Mississippi, and without the limit of any State or Territory now formed, to be guaranteed to the Indian tribes as long as they shall occupy it. . . . There they may be secured in the enjoyment of governments of their own choice, subject to no other control from the United States than such as may be necessary to preserve peace on the frontier and between the several tribes. There the benevolent may endeavor to teach them the arts of civilization. . . .\n\"This emigration would be voluntary, for it would be as cruel and unjust to compel the aborigines to abandon the graves of their fathers and seek a home in a distant land. But they should be distinctly informed that if they remain within the limits of

In [39]:
# Sample from the dataset and look at the first sample.
# Per the original author's note, 20 samples are drawn when no category is given
# and show() is built on top of sample() — not verifiable from this notebook alone.
benchmark.sample()[0]

{'question': 'A 31-year-old woman, gravida 3, para 2, who is at 32 weeks\' gestation, is admitted to the hospital because of a 1-week history of progressive dyspnea and wheezing. She says her heart is "racing" and she is coughing up a small amount of bloodstreaked sputum. Medical history is significant for hypothyroidism, for which she takes levothyroxine. An intravenous catheter is placed. Vital signs are temperature 36.9°C (98.4°F), pulse 132/min, respirations 32/min, and blood pressure 135/78 mm Hg. Pulse oximetry on 100% oxygen via nasal cannula shows an oxygen saturation of 92%. Auscultation of the lungs discloses decreased breath sounds at the bases with expiratory crackles bilaterally. Cardiac examination discloses an irregularly irregular rhythm, an indistinct point of maximal impulse, and a loud S1. A grade 3/6, low-pitched, diastolic, rumbling murmur is audible at the apex; a distinct snapping sound precedes the murmur. Fetal heart rate is 144/min. Which of the following is t

In [42]:
benchmark.sample(category="subject")[0] # pass `category` to sample per category (here the "subject" column)

{'question': 'Statement 1 | Every integral domain with characteristic 0 is infinite. Statement 2 | Every integral domain with prime characteristic is finite.',
 'subject': 'abstract_algebra',
 'choices': ['True, True', 'False, False', 'True, False', 'False, True'],
 'answer': 1}

In [43]:
# Save the sampling result. The log below reports 3 samples per category,
# 171 samples in total, written to ./tasks/mmlu/mmlu-all.json.
# NOTE(review): the help text for sample() says 1 sample per category while the
# log says 3 — confirm which is intended.
benchmark.save(category="subject")

[32m2024-09-07 20:49:43.514[0m | [1mINFO    [0m | [36mbenchmark[0m:[36msample[0m:[36m153[0m - [1m카테고리가 있어서 카테고리마다 3개씩 샘플링합니다[0m
[32m2024-09-07 20:49:43.612[0m | [1mINFO    [0m | [36mbenchmark[0m:[36msample[0m:[36m176[0m - [1m총 171개의 샘플이 있습니다.[0m
[32m2024-09-07 20:49:43.612[0m | [1mINFO    [0m | [36mbenchmark[0m:[36msave[0m:[36m131[0m - [1m./tasks/mmlu/mmlu-all.json에 저장합니다.[0m
[32m2024-09-07 20:49:43.612[0m | [1mINFO    [0m | [36mbenchmark[0m:[36msave[0m:[36m132[0m - [1m171의 샘플이 저장됩니다.[0m


## GithubReader
+ GitHub 저장소에 있는 jsonl 파일 읽어오기

In [5]:
from benchmark import GithubReader

# Create a GithubReader for the GitHub user "rowanz" and repository "hellaswag".
github = GithubReader(user="rowanz", repo="hellaswag")

In [12]:
# Display a record from data/hellaswag_train.jsonl in the repository
# (rendered as colorized JSON in the output below).
github.show("data/hellaswag_train.jsonl")

[32m2024-09-13 14:35:00.297[0m | [1mINFO    [0m | [36mbenchmark[0m:[36mget_jsonl[0m:[36m300[0m - [1mdata/hellaswag-train.jsonl will be overwritten by data/hellaswag_train.jsonl[0m


[38;5;249m{[39m
[38;5;249m    [39m[38;5;168m"ind"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;173m3176[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"activity_label"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"Capoeira"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"ctx_a"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"A group of men are holding maracas in the their hands and playing to some reggae music."[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"ctx_b"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"one of the men"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"ctx"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"A group of men are holding maracas in the their hands and playing to some reggae music. one of the men"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"split"[39m[38;5;249m:[39m[38;5;249m [39m[38;5;108m"train"[39m[38;5;249m,[39m
[38;5;249m    [39m[38;5;168m"split_type"[39m[38;5;249m:[39m[38;5;249m [39