In [4]:
import json
import argparse
import os
import glob

from typing import List, Tuple, Set, DefaultDict
from nltk import ngrams
from typing import Dict
from tqdm import tqdm
from dataclasses import dataclass
from collections import defaultdict

from light_scenario import LightInstance, LightScenario, LightScenarioKey
from data_overlap_spec import DataOverlapStats, DataOverlapStatsKey, OverlapProtocolSpec
from light_tokenizer import LightTokenizer, DefaultTokenizer
from load_documents import get_document_iterator
from common.hierarchical_logger import hlog, htrack_block
from common.general import asdict_without_nones
from scenarios.scenario import ScenarioSpec


# The n values of the ngrams to be computed
N_VALUES: List[int] = [5, 9, 13]  # TODO: Pick the N values

# Names of the two parts of an instance that an overlap entry can refer to
# (see the `part` field of EntryDataOverlapKey).
# PART_INPUT is also the dict key used to read an instance's input when loading light scenarios.
# NOTE: PART_REF ("reference") is NOT the JSON key of an instance's references list;
# the loader reads that list from the literal key "references".
PART_INPUT: str = "input"
PART_REF: str = "reference"


@dataclass(frozen=True)
class EntryDataOverlapKey:
    """
    Immutable identifier for one part of one instance in a scenario.

    A key designates either the input or the references of a single instance.
    Because the dataclass is frozen, instances compare by value and are hashable,
    so they can be stored in sets and used as dictionary keys.
    """

    # The DataOverlapStatsKey this entry is associated with
    stats_key: DataOverlapStatsKey

    # Which part of the instance is meant: either PART_INPUT or PART_REF
    part: str

    # Id of the instance this entry belongs to
    instance_id: str


# type alias for overlap-related data structures

# An n-gram is represented as a variable-length tuple of strings.
Ngram = Tuple[str, ...]

# Two-level mapping: int -> n-gram -> set of entry keys.
# NOTE(review): the int key is presumably the n-gram length n (cf. N_VALUES) and the set
# presumably holds every entry that contains the n-gram — confirm against the code that builds it.
NgramIndex = Dict[int, Dict[Ngram, Set[EntryDataOverlapKey]]]

# Two-level mapping: entry key -> n-gram -> int.
# NOTE(review): the int is presumably an occurrence count for that n-gram — confirm.
NgramCounter = Dict[EntryDataOverlapKey, Dict[Ngram, int]]


def load_light_scenarios_from_jsonl(path: str) -> List[LightScenario]:
    """
    Create a list of light scenarios from a jsonl file, where each line is the JSON of one LightScenario object.

    Input file format:

    LightScenario JSON 1
    LightScenario JSON 2
    LightScenario JSON 3
    ...

    Each JSON object is expected to contain:
      - "scenario_key": an object with "scenario_spec" (keyword arguments for ScenarioSpec) and "split"
      - "instances": a list of objects, each with "input", "references" and "id"

    Whitespace-only lines (e.g. a trailing newline at the end of the file) are skipped.

    Args:
        path: Path of the jsonl file to read.

    Returns:
        One LightScenario per non-blank line, in file order. An empty list if the file
        contains no non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a required field is missing from a scenario or instance.
    """

    def create_light_instance_from_dict(instance_dict: dict) -> LightInstance:
        """Build a LightInstance from the dict form of a single instance."""
        return LightInstance(
            input=instance_dict[PART_INPUT], references=instance_dict["references"], id=instance_dict["id"]
        )

    light_scenarios: List[LightScenario] = []
    # Use a context manager so the file handle is always closed, and iterate the file
    # lazily instead of materialising every line with readlines().
    with open(path, "r") as light_scenario_file:
        for light_scenario_json in light_scenario_file:
            # json.loads raises on empty input, so tolerate blank lines explicitly.
            if not light_scenario_json.strip():
                continue
            light_scenario_dict: dict = json.loads(light_scenario_json)

            light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
            # The "scenario_spec" field is required here; per the original note, light scenarios
            # exported from helm carry it.
            scenario_spec = ScenarioSpec(**light_scenario_key_dict["scenario_spec"])
            light_scenario_key = LightScenarioKey(
                scenario_spec=scenario_spec, split=light_scenario_key_dict["split"]
            )
            light_instances: List[LightInstance] = [
                create_light_instance_from_dict(instance_dict)
                for instance_dict in light_scenario_dict["instances"]
            ]
            light_scenarios.append(LightScenario(scenario_key=light_scenario_key, instances=light_instances))
    return light_scenarios




In [6]:
# Input: jsonl export of light scenarios. Output: a sibling file that a later cell fills with the keys.
infile_name = 'scenario_data_new'
outfile_name = infile_name + '_keys_scenario3'

# Load every light scenario and keep only its key (scenario spec + split).
light_scenarios = load_light_scenarios_from_jsonl(infile_name)
scenario_keys = [light_scenario.scenario_key for light_scenario in light_scenarios]

In [8]:
# Sanity check: show the first scenario key, then its split on the next line.
print(scenario_keys[0], scenario_keys[0].split, sep="\n")

LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'sociology'}), split='train')
train


In [10]:
# Write one line per scenario key: "<scenario spec as JSON>; <split>".
with open(outfile_name, "w") as f:
    for scenario_key in scenario_keys:
        spec_json = json.dumps(asdict_without_nones(scenario_key.scenario_spec))
        f.write(f"{spec_json}; {scenario_key.split}\n")