In [12]:
import json
import os
import glob

from typing import List, Tuple, Set, DefaultDict
from nltk import ngrams
from typing import Dict
from tqdm import tqdm
from collections import defaultdict

from light_scenario import LightInstance, LightScenario, LightScenarioKey
from data_overlap_spec import (
    DataOverlapStats,
    DataOverlapStatsKey,
    OverlapProtocolSpec,
    EntryDataOverlapKey,
    EntryOverlapNgrams,
)
from light_tokenizer import LightTokenizer
from load_documents import get_document_iterator
from common.hierarchical_logger import hlog, htrack_block
from common.general import asdict_without_nones
from common.arguments import get_data_overlap_args
from common.util import get_tokenizer
from scenarios.scenario import ScenarioSpec


# The n values of the ngrams to be computed
N_VALUES: List[int] = [5, 9, 13]  # TODO: Pick the N values

# Keys of a serialized instance dict holding the instance's input and its references
PART_INPUT: str = "input"
PART_REF: str = "references"


# Type aliases for overlap-related data structures
# An ngram is a tuple of strings (presumably tokens — TODO confirm against the tokenizer)
Ngram = Tuple[str, ...]
# presumably: n value -> ngram -> keys of the entries containing that ngram — TODO confirm against usage
NgramIndex = Dict[int, Dict[Ngram, Set[EntryDataOverlapKey]]]
# presumably: entry key -> ngram -> occurrence count — TODO confirm against usage
NgramCounter = Dict[EntryDataOverlapKey, Dict[Ngram, int]]


def load_light_scenarios_from_jsonl(path: str) -> List[LightScenario]:
    """
    Create a list of light scenarios from a jsonl file, where each non-blank line is a JSON
    object representing one LightScenario.

    Input file format:

    LightScenario JSON 1
    LightScenario JSON 2
    LightScenario JSON 3
    ...

    Each JSON object must provide:
      - "scenario_key": an object with "scenario_spec" (expanded into ScenarioSpec keyword
        arguments) and "split"
      - "instances": a list of objects with "input", "references" and "id"

    Args:
        path: Path of the jsonl file to read.

    Returns:
        One LightScenario per non-blank line, in file order. Empty list if the file has no
        non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a required field is missing from a scenario or instance object.
    """

    def create_light_instance_from_dict(instance_dict: dict) -> LightInstance:
        """Build a LightInstance from its serialized dict form."""
        return LightInstance(
            input=instance_dict[PART_INPUT], references=instance_dict[PART_REF], id=instance_dict["id"]
        )

    light_scenarios: List[LightScenario] = []
    # The context manager closes the file even if parsing fails part-way through, and
    # iterating the handle avoids holding every raw line in memory at once.
    with open(path, "r") as light_scenario_file:
        for light_scenario_json in light_scenario_file:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not light_scenario_json.strip():
                continue
            light_scenario_dict: dict = json.loads(light_scenario_json)

            light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
            # The "scenario_spec" field (present on light scenarios exported from helm) is
            # required here; its dict is expanded into ScenarioSpec keyword arguments.
            scenario_spec = ScenarioSpec(**light_scenario_key_dict["scenario_spec"])
            light_scenario_key = LightScenarioKey(
                scenario_spec=scenario_spec, split=light_scenario_key_dict["split"]
            )
            light_instances: List[LightInstance] = [
                create_light_instance_from_dict(instance_dict)
                for instance_dict in light_scenario_dict["instances"]
            ]
            light_scenarios.append(LightScenario(scenario_key=light_scenario_key, instances=light_instances))
    return light_scenarios


In [23]:
# Path of the light-scenario jsonl file to inspect (relative to the kernel's working directory).
path = 'scenario_data_new'
light_scenarios = load_light_scenarios_from_jsonl(path)

In [24]:
# Index the loaded scenarios by scenario key; if two scenarios share a key, the later one wins.
light_scenario_dict = {}
for light_scenario in light_scenarios:
    light_scenario_dict[light_scenario.scenario_key] = light_scenario

In [25]:
# Display the first instance of the first loaded scenario as a quick check of the parsed data.
light_scenarios[0].instances[0]

LightInstance(input='Which of the following did the post-war welfare state of 1948 not aim to provide:', references=['free health care and education for all', 'a minimum wage', 'full employment', 'universal welfare'], id='id0')

In [29]:
# Fully qualified class name of the scenario whose light scenarios are kept.
LEGAL_SUMMARIZATION_CLASS_NAME = 'helm.benchmark.scenarios.legal_summarization_scenario.LegalSummarizationScenario'

# Keep only the light scenarios whose scenario spec refers to that class, in their original order.
filtered_scenarios = []
for light_scenario in light_scenarios:
    if light_scenario.scenario_key.scenario_spec.class_name == LEGAL_SUMMARIZATION_CLASS_NAME:
        filtered_scenarios.append(light_scenario)

In [33]:
# Select the second matching scenario (index 1); raises IndexError if fewer than two matched.
# NOTE(review): why index 1 rather than 0 is not shown here — confirm.
scenario = filtered_scenarios[1]

In [36]:
# Find the first instance whose id is 'id119'; filtered_instance stays None when none matches.
filtered_instance = None
for instance in scenario.instances:
    if instance.id != 'id119':
        continue
    filtered_instance = instance
    break

In [38]:
# Display the references of the selected instance.
# NOTE: raises AttributeError if the id lookup found no match and filtered_instance is still None.
filtered_instance.references

['Online intermediation services — fairness and transparency for business users Online intermediation services — fairness and transparency for business users SUMMARY OF: Regulation (EU) 2019/1150 on promoting fairness and transparency for business users of online intermediation services WHAT IS THE AIM OF THE REGULATION? It aims to ensure the fair and transparent treatment of business users by online platforms, giving them more effective options for redress when they face problems, creating a predictable and innovation-friendly regulatory environment for online platforms within the EU. KEY POINTS Scope The regulation introduces new rules for online intermediation services* (online platforms) and online search engines that aim to connect EU businesses and professional websites with EU consumers. Online platforms cover a wide range of activities including: online marketplaces;social media and creative content outlets;application distribution platforms;price comparison websites;collaborat