In [1]:
from typing import List

from compute_data_overlap_metrics import (
    get_all_data_overlap_stats,
    # create_all_data_overlap_stats,
    create_ngram_index,
    EntryDataOverlapKey,
    Ngram,
    NgramIndex,
    # AllDataOverlapStats,
)
from light_scenario import LightScenario, LightInstance, LightScenarioKey
from light_tokenizer import LightTokenizer, DefaultTokenizer
from data_overlap_stats import (
    DataOverlapStats,
    DataOverlapStatsKey,
    PART_INPUT,
    PART_REF,
)
from common.general import asdict_without_nones
from scenarios.scenario import ScenarioSpec

# n-gram sizes passed as `n_values` when the n-gram index is built.
N_VALUES = [5, 13]

# Sample passage used as the input and reference text of the second test scenario.
TEST_DOCUMENT: str = " ".join(
    [
        "The Center for Research on Foundation Models (CRFM) is",
        "an interdisciplinary initiative born out of the Stanford",
        "Institute for Human-Centered Artificial Intelligence (HAI)",
        "that aims to make fundamental advances in the study, development,",
        "and deployment of foundation models.",
    ]
)

# TEST_DOCUMENT split on single spaces: case and punctuation are preserved.
TEST_TOKENS_SPLIT_BY_SPACE: List[str] = [
    "The", "Center", "for", "Research", "on", "Foundation", "Models", "(CRFM)",
    "is", "an", "interdisciplinary", "initiative", "born", "out", "of", "the",
    "Stanford", "Institute", "for", "Human-Centered", "Artificial", "Intelligence", "(HAI)",
    "that", "aims", "to", "make", "fundamental", "advances", "in", "the",
    "study,", "development,", "and", "deployment", "of", "foundation", "models.",
]  # fmt: skip

# Lowercased tokens with punctuation removed and a trailing empty token.
# NOTE(review): presumably the expected DefaultTokenizer output for
# TEST_DOCUMENT -- confirm against light_tokenizer.
TEST_TOKENS_BY_DEFAULT_TOKENIZER: List[str] = [
    "the", "center", "for", "research", "on", "foundation", "models", "crfm",
    "is", "an", "interdisciplinary", "initiative", "born", "out", "of", "the",
    "stanford", "institute", "for", "human", "centered", "artificial", "intelligence", "hai",
    "that", "aims", "to", "make", "fundamental", "advances", "in", "the",
    "study", "development", "and", "deployment", "of", "foundation", "models", "",
]  # fmt: skip

def _natural_qa_test_key() -> LightScenarioKey:
    """Build a fresh key for the NaturalQA scenario class, "test" split, no args."""
    natural_qa_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        args={},
    )
    return LightScenarioKey(scenario_spec=natural_qa_spec, split="test")


# Two short instances; only the first input is a fragment of TEST_DOCUMENT.
TEST_SCENARIO_1 = LightScenario(
    scenario_key=_natural_qa_test_key(),
    instances=[
        LightInstance(
            input="Center for Research on Foundation",
            references=["bar", "baz"],
            id="id1",
        ),
        LightInstance(
            input="bar bar",
            references=["foo", "baz"],
            id="id2",
        ),
    ],
)

# One instance whose input and both references are the full TEST_DOCUMENT.
# It uses the same scenario key values as TEST_SCENARIO_1.
TEST_SCENARIO_2 = LightScenario(
    scenario_key=_natural_qa_test_key(),
    instances=[
        LightInstance(
            input=TEST_DOCUMENT,
            references=[TEST_DOCUMENT, TEST_DOCUMENT],
            id="id1",
        ),
    ],
)

from typing import List

from compute_data_overlap_metrics import (
    get_all_data_overlap_stats,
    create_ngram_index,
    EntryDataOverlapKey,
    Ngram,
    NgramIndex,
)
from data_overlap_spec import OutputDataOverlapStats, OutputDataOverlapStatsKey, OverlapProtocolSpec
from light_scenario import LightScenario, LightInstance, LightScenarioKey
from light_tokenizer import LightTokenizer, DefaultTokenizer
from data_overlap_stats import (
    DataOverlapStatsKey,
    PART_INPUT,
    PART_REF,
)
from common.general import asdict_without_nones
from scenarios.scenario import ScenarioSpec



In [2]:
# Build the n-gram index over both test scenarios for every size in N_VALUES.
tokenizer = LightTokenizer()
scenarios = [TEST_SCENARIO_1, TEST_SCENARIO_2]
ngram_index: NgramIndex = create_ngram_index(
    light_scenarios=scenarios,
    n_values=N_VALUES,
    tokenizer=tokenizer,
)

# The overlap statistics are computed from the index alone.
all_data_overlap_stats = get_all_data_overlap_stats(ngram_index=ngram_index)


Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test')
Building ngram indexes for LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test')


In [3]:
# Dump the overlap statistics computed from the n-gram index as plain text.
print(all_data_overlap_stats)

[OutputDataOverlapStats(output_data_overlap_stats_key=OutputDataOverlapStatsKey(light_scenario_key=LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test'), overlap_protocol_spec=OverlapProtocolSpec(N=5)), instance_ids_with_overlapping_input=['id1'], instance_ids_with_overlapping_reference=['id1']), OutputDataOverlapStats(output_data_overlap_stats_key=OutputDataOverlapStatsKey(light_scenario_key=LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test'), overlap_protocol_spec=OverlapProtocolSpec(N=13)), instance_ids_with_overlapping_input=['id1'], instance_ids_with_overlapping_reference=['id1'])]


In [4]:
# Inspect the index entries stored under key 5 (one of the sizes in N_VALUES).
ngram_index[5]

{('Center',
  'for',
  'Research',
  'on',
  'Foundation'): {EntryDataOverlapKey(stats_key=DataOverlapStatsKey(metadata={'light_scenario_key': LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test'), 'N': 5}), part='input', instance_id='id1', index=0), EntryDataOverlapKey(stats_key=DataOverlapStatsKey(metadata={'light_scenario_key': LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test'), 'N': 5}), part='reference', instance_id='id1', index=0)},
 ('The',
  'Center',
  'for',
  'Research',
  'on'): {EntryDataOverlapKey(stats_key=DataOverlapStatsKey(metadata={'light_scenario_key': LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test'), 'N': 5}), part='input', instance_id='id1', index=0), EntryDataOverlapKey(stats_key=DataOve

In [5]:
# Hand-written statistics list: one entry per n-gram size, N=13 first and then
# N=5, both for the NaturalQA "test" split with instance "id1" overlapping in
# input and reference.
ALL_DATA_OVERLAP_STATS = [
    OutputDataOverlapStats(
        output_data_overlap_stats_key=OutputDataOverlapStatsKey(
            light_scenario_key=LightScenarioKey(
                scenario_spec=ScenarioSpec(
                    class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
                    args={},
                ),
                split="test",
            ),
            overlap_protocol_spec=OverlapProtocolSpec(N=ngram_size),
        ),
        instance_ids_with_overlapping_input=["id1"],
        instance_ids_with_overlapping_reference=["id1"],
    )
    for ngram_size in (13, 5)
]


In [6]:
# Independently constructed list with the same field values as
# ALL_DATA_OVERLAP_STATS (N=13 first, then N=5). The cells that follow compare
# the two lists.
ALL_DATA_OVERLAP_STATS2 = [
    OutputDataOverlapStats(
        output_data_overlap_stats_key=OutputDataOverlapStatsKey(
            light_scenario_key=LightScenarioKey(
                scenario_spec=ScenarioSpec(
                    class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
                    args={},
                ),
                split="test",
            ),
            overlap_protocol_spec=OverlapProtocolSpec(N=ngram_size),
        ),
        instance_ids_with_overlapping_input=["id1"],
        instance_ids_with_overlapping_reference=["id1"],
    )
    for ngram_size in (13, 5)
]

In [7]:
# Whole-list equality check between the two hand-written statistics lists.
ALL_DATA_OVERLAP_STATS2 == ALL_DATA_OVERLAP_STATS

True

In [8]:
# Narrow the comparison to the stats key of the first entry in each list.
ALL_DATA_OVERLAP_STATS2[0].output_data_overlap_stats_key == ALL_DATA_OVERLAP_STATS[0].output_data_overlap_stats_key

True

In [9]:
# Compare the overlapping-input instance id lists of the first entry in each list.
ALL_DATA_OVERLAP_STATS2[0].instance_ids_with_overlapping_input == ALL_DATA_OVERLAP_STATS[0].instance_ids_with_overlapping_input

True

In [10]:
# Keep the first entry's stats key from each list for field-by-field checks.
stats_key = ALL_DATA_OVERLAP_STATS[0].output_data_overlap_stats_key
stats_key2 = ALL_DATA_OVERLAP_STATS2[0].output_data_overlap_stats_key

In [11]:
# Compare only the overlap protocol spec of the two stats keys.
stats_key.overlap_protocol_spec == stats_key2.overlap_protocol_spec

True

In [12]:
# Light scenario key taken from the first list's stats key.
lsk = stats_key.light_scenario_key

In [13]:
# Light scenario key taken from the second list's stats key.
lsk2 = stats_key2.light_scenario_key

In [14]:
# Scenario spec held by the first light scenario key.
ss = lsk.scenario_spec

In [15]:
# Scenario spec held by the second light scenario key.
ss2 = lsk2.scenario_spec

In [16]:
# Equality of the two scenario specs on their own.
ss == ss2

True

In [17]:
# Display the first light scenario key.
lsk

LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test')

In [18]:
# Display the second light scenario key for visual comparison with the first.
lsk2

LightScenarioKey(scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario', args={}), split='test')

In [19]:
# Equality of the two light scenario keys as a whole.
lsk == lsk2

True

In [20]:
# Bind the first key to the name `self`; the final cell's comparison reads self.* fields.
# NOTE(review): presumably done to paste a method-style comparison verbatim -- confirm.
self = lsk

In [21]:
# Bind the second key to the name `other`, the right-hand side of the final comparison.
other = lsk2

In [22]:
# Manual field-by-field comparison of the two keys: split, then the scenario
# spec's class name, then its args.
(
    self.split == other.split
    and self.scenario_spec.class_name == other.scenario_spec.class_name
    and self.scenario_spec.args == other.scenario_spec.args
)

True