In [14]:
import json
import argparse
import os
import glob

from typing import List, Tuple, Set, Any, Optional
from nltk import ngrams
from typing import Dict
from tqdm import tqdm
from dataclasses import dataclass

from light_scenario import LightInstance, LightScenario, LightScenarioKey
from light_tokenizer import LightTokenizer, DefaultTokenizer
from load_documents import get_document_iterator
from data_overlap_stats import (
    DataOverlapStats,
    DataOverlapStatsKey,
    PART_INPUT,
    PART_REF,
)
from common.hierarchical_logger import hlog, htrack_block
from common.general import asdict_without_nones, write
from scenarios.scenario import ScenarioSpec


# Sizes `n` of the n-grams to be computed. The list below is provisional (see the TODO).
N_VALUES: List[int] = [5, 9, 13]  # TODO: Pick the N values


@dataclass(frozen=True)
class EntryDataOverlapKey:
    """Unique key representing either the input or references of a single instance in a scenario.

    Declared as a frozen dataclass, so fields cannot be reassigned after construction and
    instances are hashable as long as their field values are. The NgramIndex and NgramCounter
    aliases in this file use this class as a set member and as a dict key respectively.
    """

    # Key of the DataOverlapStats object this entry is associated with.
    stats_key: DataOverlapStatsKey
    part: str
    """Either PART_INPUT or PART_REF"""
    # presumably the position of the instance within its scenario -- TODO confirm against the caller
    instance_id: int


# type alias for overlap-related data structures
# An n-gram: a tuple of strings of arbitrary length.
Ngram = Tuple[str, ...]
# int -> (n-gram -> set of entry keys); the int key is presumably the n-gram size n -- TODO confirm
NgramIndex = Dict[int, Dict[Ngram, Set[EntryDataOverlapKey]]]
# DataOverlapStats objects looked up by their DataOverlapStatsKey.
AllDataOverlapStats = Dict[DataOverlapStatsKey, DataOverlapStats]
# entry key -> (n-gram -> integer count)
NgramCounter = Dict[EntryDataOverlapKey, Dict[Ngram, int]]


def load_light_scenarios_from_jsonl(path: str) -> List[LightScenario]:
    """
    Create a list of light scenarios from a jsonl file, where each json represents a LightScenario object.

    Input file format:

    Scenario JSON 1
    Scenario JSON 2
    Scenario JSON 3
    ...

    Each line is a json and each json looks like:
    {
        "light_scenario_key": {
            "metadata": {
                "split": "SPLIT",
                "scenario_attribute_1": "ATTRIBUTE1",
                "scenario_attribute_2": "ATTRIBUTE2"
            }
        },
        "light_instances": [
            {
                "input": "INPUT_TEXT1",
                "references": [
                    "REFERENCE_TEXT_1",
                    "REFERENCE_TEXT_2"
                ]
            },
            {
                "input": "INPUT_TEXT2",
                "references": [
                    "REFERENCE_TEXT_3",
                    "REFERENCE_TEXT_4"
                ]
            }
        ]
    }

    Note that the values of light_scenario_key.metadata need to be hashable.

    Args:
        path: Path of the jsonl file to read.

    Returns:
        One LightScenario per non-blank line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened (for example FileNotFoundError).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line lacks "light_scenario_key", "metadata", "light_instances",
            or an instance lacks "input" or "references".
    """

    def create_light_instance_from_dict(instance_dict: dict) -> LightInstance:
        return LightInstance(input=instance_dict["input"], references=instance_dict["references"])

    light_scenarios: List[LightScenario] = []
    # The context manager guarantees the handle is closed, even if a line fails to parse.
    with open(path, "r") as jsonl_file:
        for light_scenario_json in jsonl_file:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no scenario;
            # passing them to json.loads would raise.
            if not light_scenario_json.strip():
                continue
            light_scenario_dict: dict = json.loads(light_scenario_json)

            light_scenario_metadata: dict = light_scenario_dict["light_scenario_key"]["metadata"]
            # if the light_scenarios are exported from helm, they will have a scenario_spec field
            if "scenario_spec" in light_scenario_metadata:
                # Replace the plain dict with a ScenarioSpec built from its entries.
                light_scenario_metadata["scenario_spec"] = ScenarioSpec(**light_scenario_metadata["scenario_spec"])
            light_scenario_key = LightScenarioKey(metadata=light_scenario_metadata)
            light_instances: List[LightInstance] = [
                create_light_instance_from_dict(instance_dict)
                for instance_dict in light_scenario_dict["light_instances"]
            ]
            light_scenarios.append(
                LightScenario(light_scenario_key=light_scenario_key, light_instances=light_instances)
            )
    return light_scenarios
light_scenarios = load_light_scenarios_from_jsonl('05_30_23_scenario_data.jsonl')
# Gather the key of every loaded scenario, preserving load order.
scenario_keys = [scenario.light_scenario_key for scenario in light_scenarios]
# Disabled: dump every scenario key as one JSON line.
# from common.general import asdict_without_nones
# with open('05_30_23_scenario_data.jsonl_keys2', "w") as f:
#     f.writelines(f"{json.dumps(asdict_without_nones(scenario_key))}\n" for scenario_key in scenario_keys)

In [21]:
# Peek at the first scenario key: its whole metadata, then the scenario_spec entry, then the split.
first_metadata = scenario_keys[0].metadata
print(first_metadata)
print(first_metadata['scenario_spec'])
first_metadata['split']

{'scenario_spec': ScenarioSpec(class_name='helm.benchmark.scenarios.math_scenario.MATHScenario', args={'subject': 'algebra', 'level': 3, 'use_official_examples': True, 'use_chain_of_thought': False}), 'split': 'test'}
ScenarioSpec(class_name='helm.benchmark.scenarios.math_scenario.MATHScenario', args={'subject': 'algebra', 'level': 3, 'use_official_examples': True, 'use_chain_of_thought': False})


'test'

In [24]:
# Evaluated for the lookup only: the value is discarded because this is not the cell's last expression.
scenario_keys[0].metadata['scenario_spec']
from common.general import asdict_without_nones
# Write one "<scenario_spec as JSON>; <split>" line per scenario key.
with open('05_30_23_scenario_data.jsonl_keys3_scenario_spec_plus_split', "w") as f:
    f.writelines(
        f"{json.dumps(asdict_without_nones(key.metadata['scenario_spec']))}; {key.metadata['split']}\n"
        for key in scenario_keys
    )

In [3]:
# Load the light scenarios stored in the file 'scenario_data_small' (path relative to the working directory).
light_scenarios = load_light_scenarios_from_jsonl('scenario_data_small')

In [3]:
# Same load as the preceding cell: re-reads 'scenario_data_small' and rebinds light_scenarios.
light_scenarios = load_light_scenarios_from_jsonl('scenario_data_small')

In [10]:
# Gather the key of every loaded scenario, preserving load order.
scenario_keys = [scenario.light_scenario_key for scenario in light_scenarios]

In [13]:
from common.general import asdict_without_nones
# Dump each scenario key as one JSON line into 'testout'.
with open('testout', "w") as out_file:
    for key in scenario_keys:
        out_file.write(f"{json.dumps(asdict_without_nones(key))}\n")

In [15]:
import json
import argparse
import os
import glob

from typing import List, Tuple, Set, Any, Optional
from nltk import ngrams
from typing import Dict
from tqdm import tqdm
from dataclasses import dataclass

from light_scenario import LightInstance, LightScenario, LightScenarioKey
from light_tokenizer import LightTokenizer, DefaultTokenizer
from load_documents import get_document_iterator
from data_overlap_stats import (
    DataOverlapStats,
    DataOverlapStatsKey,
    PART_INPUT,
    PART_REF,
)
from common.hierarchical_logger import hlog, htrack_block
from common.general import asdict_without_nones, write
from scenarios.scenario import ScenarioSpec


# Sizes `n` of the n-grams to be computed. The list below is provisional (see the TODO).
N_VALUES: List[int] = [5, 9, 13]  # TODO: Pick the N values


@dataclass(frozen=True)
class EntryDataOverlapKey:
    """Unique key representing either the input or references of a single instance in a scenario.

    Declared as a frozen dataclass, so fields cannot be reassigned after construction and
    instances are hashable as long as their field values are. The NgramIndex and NgramCounter
    aliases in this file use this class as a set member and as a dict key respectively.
    """

    # Key of the DataOverlapStats object this entry is associated with.
    stats_key: DataOverlapStatsKey
    part: str
    """Either PART_INPUT or PART_REF"""
    # presumably the position of the instance within its scenario -- TODO confirm against the caller
    instance_id: int


# type alias for overlap-related data structures
# An n-gram: a tuple of strings of arbitrary length.
Ngram = Tuple[str, ...]
# int -> (n-gram -> set of entry keys); the int key is presumably the n-gram size n -- TODO confirm
NgramIndex = Dict[int, Dict[Ngram, Set[EntryDataOverlapKey]]]
# DataOverlapStats objects looked up by their DataOverlapStatsKey.
AllDataOverlapStats = Dict[DataOverlapStatsKey, DataOverlapStats]
# entry key -> (n-gram -> integer count)
NgramCounter = Dict[EntryDataOverlapKey, Dict[Ngram, int]]


def load_light_scenarios_from_jsonl(path: str) -> List[LightScenario]:
    """
    Create a list of light scenarios from a jsonl file, where each json represents a LightScenario object.

    Input file format:

    Scenario JSON 1
    Scenario JSON 2
    Scenario JSON 3
    ...

    Each line is a json and each json looks like:
    {
        "light_scenario_key": {
            "metadata": {
                "split": "SPLIT",
                "scenario_attribute_1": "ATTRIBUTE1",
                "scenario_attribute_2": "ATTRIBUTE2"
            }
        },
        "light_instances": [
            {
                "input": "INPUT_TEXT1",
                "references": [
                    "REFERENCE_TEXT_1",
                    "REFERENCE_TEXT_2"
                ]
            },
            {
                "input": "INPUT_TEXT2",
                "references": [
                    "REFERENCE_TEXT_3",
                    "REFERENCE_TEXT_4"
                ]
            }
        ]
    }

    Note that the values of light_scenario_key.metadata need to be hashable.

    Args:
        path: Path of the jsonl file to read.

    Returns:
        One LightScenario per non-blank line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened (for example FileNotFoundError).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line lacks "light_scenario_key", "metadata", "light_instances",
            or an instance lacks "input" or "references".
    """

    def create_light_instance_from_dict(instance_dict: dict) -> LightInstance:
        return LightInstance(input=instance_dict["input"], references=instance_dict["references"])

    light_scenarios: List[LightScenario] = []
    # The context manager guarantees the handle is closed, even if a line fails to parse.
    with open(path, "r") as jsonl_file:
        for light_scenario_json in jsonl_file:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no scenario;
            # passing them to json.loads would raise.
            if not light_scenario_json.strip():
                continue
            light_scenario_dict: dict = json.loads(light_scenario_json)

            light_scenario_metadata: dict = light_scenario_dict["light_scenario_key"]["metadata"]
            # if the light_scenarios are exported from helm, they will have a scenario_spec field
            if "scenario_spec" in light_scenario_metadata:
                # Replace the plain dict with a ScenarioSpec built from its entries.
                light_scenario_metadata["scenario_spec"] = ScenarioSpec(**light_scenario_metadata["scenario_spec"])
            light_scenario_key = LightScenarioKey(metadata=light_scenario_metadata)
            light_instances: List[LightInstance] = [
                create_light_instance_from_dict(instance_dict)
                for instance_dict in light_scenario_dict["light_instances"]
            ]
            light_scenarios.append(
                LightScenario(light_scenario_key=light_scenario_key, light_instances=light_instances)
            )
    return light_scenarios
light_scenarios = load_light_scenarios_from_jsonl('run_specs_filtered')
# Gather the key of every loaded scenario, preserving load order.
scenario_keys = [scenario.light_scenario_key for scenario in light_scenarios]
from common.general import asdict_without_nones
# Dump each scenario key as one JSON line into 'run_specs_filtered_keys'.
with open('run_specs_filtered_keys', "w") as keys_file:
    for key in scenario_keys:
        keys_file.write(f"{json.dumps(asdict_without_nones(key))}\n")

FileNotFoundError: [Errno 2] No such file or directory: 'run_specs_filtered'