In [16]:
import re
import nltk
from nltk.sem import logic
from nltk.sem import Expression

# Reset the private counter held by nltk.sem.logic so that re-running this cell starts from 0
# (presumably it numbers freshly generated variables -- TODO confirm against the NLTK source).
logic._counter._value = 0
# Short alias: parse a formula string into an NLTK logic Expression.
read_expr = Expression.fromstring
# One prover instance shared by evaluate(); the positional argument is presumably the
# per-proof timeout in seconds -- TODO confirm against the NLTK Prover9 documentation.
prover = nltk.Prover9(10)


def convert_to_nltk_rep(logic_formula):
    """Translate a symbolic FOL string into the ASCII syntax parsed by NLTK.

    The rewrite is a fixed sequence of passes, each one feeding the next:

    1. Runs of two or more lower-case letters that start at a word boundary
       are capitalised (the regex refuses to end a match directly before ``(``).
    2. Logical symbols, digits, ``.`` and a few special characters are
       replaced one after another, in table order.
    3. A ``.`` is appended to the single lower-case letter that follows
       ``all `` / ``exists ``.
    4. A ``.`` sitting between two lower-case letters is dropped.
    5. Three textual shapes of ``⊕`` are expanded into and/or/not form.

    :param logic_formula: str
        formula written with symbols such as ``∀``, ``∃``, ``→``, ``¬``, ``∧``, ``∨``, ``⊕``
    :return: str
        the rewritten formula
    """
    # Applied strictly top to bottom with plain str.replace.
    symbol_table = (
        ("∀", "all "),
        ("∃", "exists "),
        ("→", "->"),
        ("¬", "-"),
        ("∧", "&"),
        ("∨", "|"),
        ("⟷", "<->"),
        ("↔", "<->"),
        ("0", "Zero"),
        ("1", "One"),
        ("2", "Two"),
        ("3", "Three"),
        ("4", "Four"),
        ("5", "Five"),
        ("6", "Six"),
        ("7", "Seven"),
        ("8", "Eight"),
        ("9", "Nine"),
        (".", "Dot"),
        ("Ś", "S"),
        ("ą", "a"),
        ("’", ""),
    )

    def xor_of_atoms(m):
        # Pred(args) ⊕ Pred(args)
        left, right = m.group(1), m.group(2)
        return f"(({left} & -{right}) | (-{left} & {right}))"

    def xor_of_groups(m):
        # (...)) ⊕ (...)) -- the captured texts still miss one closing parenthesis each
        left, right = m.group(1), m.group(2)
        return f"((({left})) & -({right}))) | (-({left})) & ({right}))))"

    def xor_of_group_and_atom(m):
        # (((...))) ⊕ Pred(args)
        left, right = m.group(1), m.group(2)
        return f"((({left})) & -{right}) | (-({left})) & {right})"

    # Pass 1: capitalise lower-case words.
    formula = re.sub(
        r'\b([a-z]{2,})(?!\()',
        lambda m: m.group(1).capitalize(),
        logic_formula,
    )

    # Pass 2: symbol / digit / punctuation substitutions.
    for symbol, replacement in symbol_table:
        formula = formula.replace(symbol, replacement)

    # Pass 3: "all x" -> "all x.", "exists y" -> "exists y."
    formula = re.sub(
        r"(all\s|exists\s)([a-z])",
        lambda m: m.group(1) + m.group(2) + ".",
        formula,
    )

    # Pass 4: remove a dot wedged between two lower-case letters.
    formula = re.sub(r"([a-z])\.(?=[a-z])", lambda m: m.group(1), formula)

    # Pass 5: the three XOR shapes, tried in this order.
    xor_passes = (
        (r"(\w+\([^()]*\)) ⊕ (\w+\([^()]*\))", xor_of_atoms),
        (r"\((.*?)\)\) ⊕ \((.*?)\)\)", xor_of_groups),
        (r"\(\(\((.*?)\)\)\) ⊕ (\w+\([^()]*\))", xor_of_group_and_atom),
    )
    for pattern, expand in xor_passes:
        formula = re.sub(pattern, expand, formula)

    return formula

def get_all_variables(text):
    """Collect the distinct arguments written inside innermost parentheses.

    Every ``(...)`` group that contains no nested parentheses is split on
    commas and each piece is stripped of surrounding whitespace.

    :param text: str
        formula to scan
    :return: list(str)
        unique arguments, in order of first appearance
    """
    arguments = []
    for group in re.findall(r'\([^()]+\)', text):
        # group[1:-1] drops the enclosing parentheses
        arguments.extend(piece.strip() for piece in group[1:-1].split(","))
    # dict.fromkeys de-duplicates while keeping first-seen order; going through
    # a set would hand the names back in an order that can change between
    # interpreter runs, which makes anything iterating over them unreproducible
    return list(dict.fromkeys(arguments))

def reformat_fol(fol):
    """Rewrite predicate arguments that contain characters unsuitable for a bare name.

    Each argument reported by get_all_variables is passed through the
    translation table below (digits become words, ``.`` becomes ``Dot``,
    ``-`` and spaces become ``_``, apostrophes are dropped) and every
    occurrence of the original argument text in the formula is replaced by
    the translated text.

    :param fol: str
        formula to clean up
    :return: str
        formula with the translated arguments
    """
    translation_map = {
        "0": "Zero",
        "1": "One",
        "2": "Two",
        "3": "Three",
        "4": "Four",
        "5": "Five",
        "6": "Six",
        "7": "Seven",
        "8": "Eight",
        "9": "Nine",
        ".": "Dot",
        "’": "",
        "-": "_",
        "'": "",
        " ": "_"
    }
    # The replacement below is a global text substitution, so a short argument
    # such as "1" would rewrite the inside of a longer one such as "1.5"
    # (giving "One.5") before the longer one gets translated. Handling the
    # longest arguments first avoids that; the secondary sort key makes the
    # order fully deterministic.
    ordered_variables = sorted(get_all_variables(fol), key=lambda name: (-len(name), name))
    for variable in ordered_variables:
        variable_new = variable
        for k, v in translation_map.items():
            variable_new = variable_new.replace(k, v)
        if variable_new != variable:
            fol = fol.replace(variable, variable_new)
    return fol

def evaluate(premises, conclusion):
    """Classify a conclusion as "True", "False" or "Uncertain" with the shared prover.

    The premises and the conclusion are first cleaned with reformat_fol and
    parsed with read_expr. The conclusion is attempted first; if that proof
    is not found, its negation is attempted.

    :param premises: list(str)
        formulas assumed to hold
    :param conclusion: str
        formula to test
    :return: str
        "True" if the conclusion is proved, "False" if its negation is
        proved, "Uncertain" otherwise
    """
    cleaned_premises = [reformat_fol(premise) for premise in premises]
    cleaned_conclusion = reformat_fol(conclusion)

    # Parse the goal before the assumptions, so a malformed conclusion is reported first.
    goal = read_expr(cleaned_conclusion)
    assumptions = [read_expr(premise) for premise in cleaned_premises]

    if prover.prove(goal, assumptions):
        return "True"

    negated_goal = read_expr("-(" + cleaned_conclusion + ")")
    if prover.prove(negated_goal, assumptions):
        return "False"
    return "Uncertain"


In [17]:
from collections import defaultdict
import math
import warnings

import torch
from torch.utils.data import IterableDataset
from tqdm import tqdm

# Module-wide flag: True when the prompts are infilling prompts. TokenizedDataset.__iter__
# overwrites it after inspecting the prompts, and complete_code reads it when decoding.
INFILL_MODE = False


class TokenizedDataset(IterableDataset):
    """Tokenize and preprocess the dataset
    Multiple copies of the same prompt are sent sequentially.
    See complete_code for more details.
    """

    def __init__(
        self,
        task,
        dataset,
        tokenizer,
        num_devices,
        max_length,
        n_tasks=None,
        n_copies=1,
        prefix="",
    ):
        """
        :param task: object exposing ``get_prompt(doc)``
        :param dataset: indexable collection of docs
        :param tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``
        :param num_devices: int
            number of devices the prompts are spread over
        :param max_length: int
            truncation length handed to the tokenizer
        :param n_tasks: int or None
            number of docs to use, counted from index 0; None means the whole dataset
        :param n_copies: int
            how many times each encoded prompt is yielded
        :param prefix: str
            text prepended to every prompt
        """
        self.task = task
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.num_devices = num_devices
        self.max_length = max_length
        self.n_tasks = n_tasks
        self.n_copies = n_copies
        self.prefix = prefix

    def __iter__(self):
        """Yield ``{"ids", "task_id", "input_len"}`` dicts, ``n_copies`` per task."""
        # n_tasks is optional in the constructor; without this fallback the
        # default value None would fail in range() below
        n_tasks = len(self.dataset) if self.n_tasks is None else self.n_tasks

        prompts = []
        infill = []
        for sample in range(n_tasks):
            prompt_contents = self.task.get_prompt(self.dataset[sample])
            if isinstance(prompt_contents, str):
                infill.append(False)
                prompt = self.prefix + prompt_contents
            elif isinstance(prompt_contents, dict):
                assert set(prompt_contents.keys()) == {"prefix", "suffix"}
                infill.append(True)
                prompt = self.prefix + self._make_infill_prompt(**prompt_contents)
            else:
                raise ValueError(f"Unsupported prompt format: {type(prompt_contents)}")
            prompts.append(prompt)

        if not len(set(infill)) == 1:
            raise ValueError("Mixed infill and completion prompts are not supported.")
        # complete_code reads this module-level flag to pick the decoding path
        global INFILL_MODE
        INFILL_MODE = infill[0]
        if INFILL_MODE:
            return_token_type_ids = False
        else:
            return_token_type_ids = None

        outputs = self.tokenizer(
            prompts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
            return_token_type_ids=return_token_type_ids,
        )

        # the adjusted value is stored on the instance on purpose:
        # complete_code reads dataloader.dataset.n_copies
        if self.n_copies == 1 and n_tasks % self.num_devices != 0:
            self.n_copies = 2
            warnings.warn(
                "n_copies (n_samples/batch_size) was changed from 1 to 2 because n_tasks isn't proportional to num devices"
            )

        for sample in range(n_tasks):
            for _ in range(self.n_copies):
                yield {
                    "ids": outputs.input_ids[sample],
                    "task_id": sample,
                    # number of non-padding positions according to the attention mask
                    "input_len": outputs.attention_mask[sample].sum(),
                }

    def _make_infill_prompt(self, prefix, suffix):
        """Make a prompt for infilling.
        Currently supported only for official InCoder and SantaCoder implementations.

        :param prefix: str
            text before the gap
        :param suffix: str
            text after the gap
        :return: str
        :raises ValueError: if ``tokenizer.name_or_path`` is not one of the supported model ids
        """
        model_id = self.tokenizer.name_or_path
        if model_id in ["facebook/incoder-1B", "facebook/incoder-6B"]:
            self.tokenizer.add_special_tokens({"pad_token": "<pad>"})
            return f"{prefix}<|mask:0|>{suffix}<|mask:0|>"
        elif model_id in ["bigcode/santacoder"]:
            return f"<fim-prefix>{prefix}<fim-suffix>{suffix}<fim-middle>"
        else:
            raise ValueError(f"Infilling not yet supported for: {model_id}")


def complete_code(
    task,
    accelerator,
    model,
    tokenizer,
    dataloader,
    n_tasks,
    batch_size=20,
    prefix="",
    postprocess=True,
    **gen_kwargs,
):
    """Generate ``batch_size`` completions per prompt copy and group them by task.

    The dataloader is expected to deliver the prompts as
    [p_0_0, p_0_1, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1], where nc is the
    number of copies of each prompt and nt the number of tasks, so that every
    task ends up with nc * batch_size generations.

    :param task: object exposing ``stop_words`` and ``postprocess_generation``
    :param accelerator: object used to unwrap the model and to pad / gather results
    :param model: model whose ``generate`` method is called
    :param tokenizer: tokenizer used for decoding
    :param dataloader: iterable of batches with keys "ids", "task_id", "input_len"
    :param n_tasks: int
        number of tasks; sizes the two returned lists
    :param batch_size: int
        passed to ``generate`` as ``num_return_sequences``
    :param prefix: str
        its length is cut from the front of every decoded generation
    :param postprocess: bool
        whether to run ``task.postprocess_generation`` on each generation
    :param gen_kwargs: forwarded to ``generate``
    :return: tuple(list(list(str)), list(list(str)))
        (post-processed generations, raw generations), both indexed by task id
    """

    def parse_infill(code, tokenizer):
        """Reorder infill code and remove remaining special tokens."""
        model_id = tokenizer.name_or_path
        if model_id in ["facebook/incoder-1B", "facebook/incoder-6B"]:
            head, tail, middle = code.split("<|mask:0|>", 2)
            middle = middle.split("<|endofmask|>")[0]
        elif model_id in ["bigcode/santacoder"]:
            head, remainder = code.split("<fim-suffix>", 1)
            tail, middle = remainder.split("<fim-middle>", 1)
            middle = middle.split("<|endoftext|>")[0]
        else:
            raise ValueError(f"Infilling not yet supported for: {model_id}")
        code = "".join([head, middle, tail])
        for token_kind, token_value in tokenizer.special_tokens_map.items():
            if token_kind == "additional_special_tokens":
                for extra_token in token_value:
                    code = code.replace(extra_token, "")
            else:
                code = code.replace(token_value, "")
        return code

    # ---- generation: task id -> list of generated token rows ----
    tokens_by_task = defaultdict(list)
    expected_steps = math.ceil(
        n_tasks * dataloader.dataset.n_copies / accelerator.num_processes
    )
    for _step, batch in tqdm(enumerate(dataloader), total=expected_steps):
        with torch.no_grad():
            if task.stop_words:
                gen_kwargs["stopping_criteria"][0].start_length = batch["ids"].shape[-1]
            # cut each prompt back to the length recorded in "input_len" before generating
            prompt_ids = batch["ids"][:, : batch["input_len"]]
            new_tokens = accelerator.unwrap_model(model).generate(
                input_ids=prompt_ids,
                num_return_sequences=batch_size,
                **gen_kwargs,
            )
            task_ids = batch["task_id"].repeat(batch_size)
            new_tokens = accelerator.pad_across_processes(
                new_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )

            new_tokens, task_ids = accelerator.gather((new_tokens, task_ids))
            token_rows = new_tokens.cpu().numpy()
            task_id_values = task_ids.cpu().numpy()

            for task_idx, token_row in zip(task_id_values, token_rows):
                tokens_by_task[task_idx].append(token_row)

    # ---- decoding ----
    code_gens_raw = [[] for _ in range(n_tasks)]
    code_gens_prc = [[] for _ in range(n_tasks)]
    for task_idx, token_rows in tokens_by_task.items():
        for token_row in token_rows:
            if INFILL_MODE:
                decoded = parse_infill(
                    tokenizer.decode(
                        token_row, skip_special_tokens=False, clean_up_tokenization_spaces=False
                    ),
                    tokenizer,
                )
            else:
                decoded = tokenizer.decode(
                    token_row, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
            completion = decoded[len(prefix) :]
            code_gens_raw[task_idx].append(completion)
            if postprocess:
                code_gens_prc[task_idx].append(
                    task.postprocess_generation(completion, int(task_idx))
                )
            else:
                warnings.warn(
                    "model output is not postprocessed, this might lower evaluation scores"
                )
                code_gens_prc[task_idx].append(completion)

    return code_gens_prc, code_gens_raw


In [33]:
from functools import cache
from collections import Counter
# from eval.tasks.utils import evaluate, convert_to_nltk_rep
from abc import abstractmethod, ABC
from datasets import load_dataset, Dataset
from warnings import warn

import pdb
class Task(ABC):
    """A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
    DATASET_PATH: str = None

    # The name of a subset within `DATASET_PATH`.
    DATASET_NAME: str = None

    def __init__(self, stop_words=None, requires_execution=True):
        """
        :param stop_words: list
            list of stop words if the generation uses a stopping criteria during generation
        :param requires_execution: bool
            whether the task requires code execution during evaluation or not
        """
        self.stop_words = stop_words
        self.requires_execution = requires_execution
        try:
            self.dataset = load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
        except Exception:
            # Best effort: a subclass may provide its data some other way, so a
            # failed load is only reported. `self.dataset` is left unset in
            # that case. Catching Exception rather than everything keeps
            # Ctrl-C (KeyboardInterrupt) and SystemExit working.
            warn(
                "This task will use a locally downloaded dataset, not from the HF hub."
            )

    @abstractmethod
    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return []

    def fewshot_examples(self):
        """Loads and returns the few-shot examples for the task if they exist."""
        pass

    @abstractmethod
    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.
        :param doc: dict[str: str]
            sample from the test dataset
        """
        pass

    @abstractmethod
    def get_reference(self, doc):
        """Builds the reference solution for the doc.
        :param doc: dict[str: str]
            sample from the test dataset
        """
        pass

    @abstractmethod
    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
        """
        pass

    @abstractmethod
    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations as in {"metric_name": result}.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        pass

import pdb

class OWAFOLTask(Task):
    """An OWA (Open World Assumption) FOL (First Order Logic) Task is a Task in which the goal
    is to generate True/False/Uncertain answers to First Order Logic questions.

    Subclasses are expected to set ``self._test``, which get_dataset returns.
    """

    TRAIN_DATASET_PATH = "metaeval/folio"
    # On-disk copy of the training split that __init__ loads the few-shot examples from.
    # Kept as a class attribute so a subclass can point it somewhere else; the Hub
    # alternative would be load_dataset(TRAIN_DATASET_PATH)["train"].
    TRAIN_DATASET_DISK_PATH = '/workspace/hyf/ananke/example/pipeline/logic_experiment/Fight_for_LINC/Chain-of-Context/metaeval_folio/train'
    ERROR_TOKEN = "Error"
    MAX_SHOTS = 16

    def __init__(self, mode, n):
        """
        :param mode: str
            prompting mode: "baseline", "scratchpad", "neurosymbolic" or "cot"
        :param n: int
            number of few-shot examples, at most MAX_SHOTS
        """
        assert n <= self.MAX_SHOTS, f"supports up to {self.MAX_SHOTS}-shot"
        super().__init__(
            stop_words=["</EVALUATE>"], requires_execution=True,
        )
        self._mode = mode
        self._nshot = n
        # memo for fewshot_examples(); filled on first use
        self._fewshot_text = None
        self.train_dataset = Dataset.load_from_disk(self.TRAIN_DATASET_DISK_PATH)
        self._train_dataset = self.reformat_fol_samples_train(self.train_dataset)
        self._train_dataset = self.add_conclusion_fols_train(self._train_dataset)
        self._train_dataset = self.add_cot_train(self._train_dataset)
        # the training split labels undetermined samples "Unknown"; the rest of
        # this class uses "Uncertain"
        self._train_dataset = self._train_dataset.map(
            lambda x: {"label": "Uncertain" if x["label"] == "Unknown" else x["label"]},
            remove_columns=["label"],
        )
        self._train_fewshot_indices_all = [
            125,
            23,
            60,
            275,
            148,
            261,
            263,
            683,
            299,
            684,
            850,
            853,
            886,
            892,
            930,
            980,
        ]
        # Labels:
        # 23 (True), 60 (False), 125 (Uncertain), 148 (False), 261 (True), 263 (True), 275 (Uncertain), 683 (Uncertain)
        # 299 (True), 684 (False), 850 (False), 853 (Uncertain), 886 (True), 892 (Uncertain), 930 (False), 980 (False)

        self._train_fewshot_indices = self._train_fewshot_indices_all[:n]
        self._train = self._train_dataset.select(self._train_fewshot_indices)

    def reformat_fol_samples_train(self, train_dataset):
        """Convert every ``premises-FOL`` entry of the training set with convert_to_nltk_rep."""
        def reformat_fol_sample(sample):
            sample["premises-FOL"] = [
                convert_to_nltk_rep(premise) for premise in sample["premises-FOL"]
            ]
            return sample

        return train_dataset.map(reformat_fol_sample)

    def add_conclusion_fols_train(self, train_dataset):
        """Add a ``conclusion-FOL`` column; only the few-shot rows get a value, all others None."""
        train_conclusion_fols = {
            23: "HigherRank(RealMadrid, Barcelona)",
            60: "-OlympicGoldMedalWinner(Amy) -> NobelLaureate(Amy)",
            125: "-Dispensable(Worksheet)",
            148: "FolkSong(Inception)",
            261: "MakeGoodBreakfast(Luke)",
            263: "exists x. (Develops(Ets, x) & For(x, k-OneTwoandhighereducation)) & exists x. (Develops(Ets, x) & AssociatedWith(x, Entrytouseducationinstitutions))",
            275: "ContributeToCountry(James)",
            299: "GetRhythmRight(John)",
            683: "exists x. (BRICS(x) & Speaks(x, Hindi))",
            684: "Film(Hamilton)",
            850: "-Liked(Leo, Charlie) & -Cares(Charlie, Leo)",
            853: "Won(Threebodyproblem, Hugoaward)",
            886: "Dagfinn(DagfinnAarskog)",
            892: "PartOf(Minsk, Scottishpremiership)",
            930: "-Locate(Boves, Europe)",
            980: "(InvitedTakePhoto(James) & -HappyCommunicate(James)) | (-InvitedTakePhoto(James) & HappyCommunicate(James))",
        }
        conclusions = [None for _ in range(len(train_dataset))]
        for index, conclusion_fol in train_conclusion_fols.items():
            conclusions[index] = conclusion_fol
        train_dataset = train_dataset.add_column("conclusion-FOL", conclusions)
        return train_dataset

    def add_cot_train(self, train_dataset):
        """Add a ``cot`` column; only the few-shot rows get a chain-of-thought text, all others None."""
        train_cots = {
            23: "Let's think step by step. We want to evaluate if in La Liga 2021-2022, Real Madrid ranks higher than Barcelona. From premise 1, we know that a La Liga soccer team ranks higher than another if it receives more points. From premise 4, we know that in La Liga 2021-2022, Real Madrid received more points than Barcelona. Therefore, in La Liga 2021-2022, Real Madrid received more points than Barcelona, so Real Madrid ranks higher than Barcelona, so the statement is true.\nANSWER:\tTrue",
            60: "Let's think step by step. We want to evaluate the statement \"if Amy is not an Olympic gold medal winner, then Amy is a Nobel laureate\". Let's assume that Amy is not an Olympic gold medal winner. This doesn't tell us anything about whether Amy is a Nobel laureate, so the statement isn't true, meaning it is either False or Uncertain. To distinguish between the two, notice that we could have a scenario where Amy is neither an Olympic gold medal winner nor a Nobel laureate. None of the premises are violated in this case. This means the statement must be false.\nANSWER:\tFalse",
            125: "Let's think step by step. We want to evaluate if a worksheet is not dispensable. From premise 6, we know that a worksheet is either paper or is environment-friendly. If it is paper, then from premise 3, a worksheet is woodware, and from premise 2, a worksheet is dispensable. If it is environment-friendly, we know it is good from premise 5, but we know nothing about whether it is dispensable. Therefore, we don't know if a worksheet is dispensible or not, so the statement is uncertain.\nANSWER:\tUncertain",
            148: "Let's think step by step. We want to evaluate if Inception is a folk song. We know that Inception is a sci-fi movie. Since all movies are videos and Inception is a movie, it is a video, which means it is visual. On the other hand, we know that all folk songs are songs, and no songs are visual, so no folk songs are visual. Therefore, since Inception is visual but no folk songs are visual, we know that Inception cannot be a folk song, so the statement is false.\nANSWER:\tFalse",
            261: "Let's think step by step. We want to evaluate if Luke can make a good breakfast. From the last premise, we know that Luke can make cookies, scrambled eggs, and muffins. Since Luke can make cookies and muffins, they are a baker. Now, combining the information we have, since Luke is a baker and can make scrambled eggs, this means that they can make a good breakfast. Therefore, Luke can make a good breakfast, so the statement is true.\nANSWER:\tTrue",
            263: "Let's think step by step. We want to evaluate if ETS develops assessments for K-12 statewide as well as entry to US tertiary and quaternary educatiand doon institutions. We know that ETS develops assessments for K-12 statewide. We also know that ETS develops assessments associated with entry to the US tertiary and quaternary education institutes. Therefore, both parts of the conclusion are true, and the statement is true.\nANSWER:\tTrue",
            275: "Let's think step by step. We want to evaluate if James contributes to the country. Let's think about what we know about James. First, we know that James was either sentenced for thief or stayed in prison. However, this doesn't tell us anything about whether James contributed to the country. Second, we know that James either had a bad record in the local state or that he was respected by others. However, the premises don't tell us anything about the relationship between having a bad record and contributing to the country. Therefore, it is uncertain whether James contributes to the country.\nANSWER:\tUncertain",
            299: "Let's think step by step. We want to evaluate if John can get the rhythms right. We know that John is a student learning piano. Since all students learning piano can strike the right notes, John can strike the right notes. Since all students who can strike the right notes can get the rhythms right and John can strike the right notes, John can get the rhythms right, so the conclusion is true.\nANSWER:\tTrue",
            683: "Let's think step by step. We want to evaluate if there is a person from BRICS speaking Hindi. We know that there is an Indian, and since India is one of BRICS, we know that there is an Indian in BRICS. Furthermore, we know that they speak either Hindi or English, however, we don't know which one. Therefore, there could be a person in BRICS speaking Hindi, or there could not. Therefore, it is uncertain whether there is a person from BRICS speaking Hindi.\nANSWER:\tUncertain",
            684: "Let's think step by step. We want to evaluate if Hamilton is a film. Since Daveed Diggs played two roles in the musical Hamilton, Hamilton is a musical. Since musicals are not films and Hamilton is a musical, Hamilton is not a film, and the conclusion is false.\nANSWER:\tFalse",
            850: "Let's think step by step. We want to evaluate if Charlie does not like Leo and does not care for Leo. Let's first evaluate if Charlie does not like Leo. We know Charlie has a naughty pet named Leo. Since pets who are naughty are not liked as much, Charlie does not like Leo. Now, let's evaluate if Charlie cares for Leo. We know that if a person has a pet, they care for that pet. Since Leo is Charlie's pet, Charlie cares for Leo. Therefore, Charlie does not like Leo but cares for Leo, so the second part of the conclusion is false, which means the entire conclusion is false.\nANSWER:\tFalse",
            853: "Let's think step by step. We want to evaluate if the Three Body Problem won the Hugo Award. The only thing we know about the Hugo Award is that some books that have won the Hugo Award were written by Cixin Liu. However, we know nothing about whether The Three Body Problem was written by Cixin Liu, so the conclusion is uncertain.\nANSWER:\tUncertain",
            886: "Let's think step by step. We want to evaluate if Dagfinn is Dagfinn Aarskog's given name. We know that Dagfinn is a given name, and that notable people with the given name Dagfinn includes Dagfinn Aarskog, which means that Dagfinn is Dagfinn Aarskog's given name, so the conclusion is true.\nANSWER:\tTrue",
            892: "Let's think step by step. We want to evaluate if Minsk joined the Scottish Premiership. We know that Minsk and St Johnstone are different teams and that St Johnstone is part of the Scottish Premiership, but we don't know anything about whether or not Minsk joined the Scottish Premiership from the premises. Therefore, the conclusion is uncertain.\nANSWER:\tUncertain",
            930: "Let's think step by step. We want to evaluate if Boves is not in Europe. We know that Boves is a railway station located in France. We also know that since France is a European country, France is located in Europe. Furthermore, we know that if A is located in B and B is located in C, then A is located in C. Therefore, we know that because Boves is located in France and France is located in Europe, that means Boves is located in Europe. Therefore, the conclusion is false.\nANSWER:\tFalse",
            980: "Let's think step by step. We want to evaluate if James is either invited to take a photo with the audience or happy to communicate with each other during the dinner. We know that James does not attend the conference in person and is not provided with souvenirs. There are no premises that apply to people who do not attend the conference. Since James is not provided with souvenirs, since all who attended the conference in person are provided with souvenirs, we know that James did not attend the conference in person. However, we don't know anything else, so it is possible that James was neither invited to take a photo with the audience nor happy to communicate during the dinner. Therefore, the conclusion is false.\nANSWER:\tFalse",
        }
        cots = [None for _ in range(len(train_dataset))]
        for index, cot in train_cots.items():
            cots[index] = cot
        train_dataset = train_dataset.add_column("cot", cots)
        return train_dataset

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return self._test

    def get_instructions(self):
        """Build the instruction header placed at the top of every prompt.

        :return: str
            instruction text ending in a blank line
        """
        instructions = ""
        instructions += "The following is a first-order logic (FOL) problem.\n"
        instructions += "The problem is to determine whether the conclusion follows from the premises.\n"
        instructions += "The premises are given in the form of a set of first-order logic sentences.\n"
        instructions += "The conclusion is given in the form of a single first-order logic sentence.\n"
        if self._mode == "baseline":
            instructions += "The task is to evaluate the conclusion as 'True', 'False', or 'Uncertain' given the premises."
        else:
            instructions += "The task is to translate each of the premises and conclusions into FOL expressions, "
            # NOTE(review): mode "cot" also lands in this branch and matches
            # neither case below, so its instructions end on the unfinished
            # clause above. Left untouched because changing it changes the
            # prompt -- confirm whether that is intended.
            if self._mode == "scratchpad":
                instructions += "and then to evaluate the conclusion as 'True', 'False', or 'Uncertain' given the premises."
            elif self._mode == "neurosymbolic":
                instructions += "so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.\n"
                instructions += "Expressions should be adhere to the format of the Python NLTK package logic module."
        return instructions + "\n\n"

    def format_train_example(self, doc):
        """Format one solved few-shot example: the test layout followed by the expected completion.

        :param doc: dict
            training sample
        :return: str
        """
        example = self.format_test_example(doc)
        if self._mode == "baseline":
            example += f"{doc['label'].strip()}\n"
        elif self._mode == "cot":
            example += f"{doc['cot']}\n"
        else:
            # scratchpad and neurosymbolic: sentence-by-sentence translation
            for premise, fol in zip(doc["premises"], doc["premises-FOL"]):
                example += f"TEXT:\t{premise.strip()}\nFOL:\t{fol.strip()}\n"
            example += f"TEXT:\t{doc['conclusion'].strip()}\nFOL:\t{doc['conclusion-FOL'].strip()}\n"
            if self._mode == "scratchpad":
                example += f"ANSWER:\t{doc['label'].strip()}\n"
        return example + "</EVALUATE>\n"

    def format_test_example(self, doc):
        """Format the premises and conclusion of a doc, ending with an open ``<EVALUATE>`` tag.

        :param doc: dict
            sample with "premises" (list of str) and "conclusion" (str)
        :return: str
        """
        example = "<PREMISES>\n"
        for premise in doc["premises"]:
            example += f"{premise.strip()}\n"
        example += "</PREMISES>\n"
        example += f"<CONCLUSION>\n{doc['conclusion'].strip()}\n</CONCLUSION>\n"
        example += "<EVALUATE>\n"
        return example

    def get_prompt(self, doc):
        """
        Builds the prompt for the LM to generate from.
        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        instructions = self.get_instructions()
        train = self.fewshot_examples()
        test = self.format_test_example(doc)
        prompt = "\n".join([instructions, train, test])
        return prompt

    def get_reference(self, doc):
        """
        Builds the reference solution for the doc (sample from the test dataset).
        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        return doc["label"]

    def postprocess_generation(self, generation, idx, completion_only=False):
        """
        Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM
        :param idx: int (if needed)
            index of doc in the dataset to which the generation belongs
        :param completion_only: bool
            if True, `generation` is used as given (only stripped): the prompt
            is not removed and the stop words are not applied
        :return: str
            "True", "False" or "Uncertain", or ERROR_TOKEN if anything goes wrong
        """
        try:
            if completion_only:
                gen = generation.strip()
            else:
                prefix = self.get_prompt(self.get_dataset()[idx])
                assert generation.startswith(
                    prefix
                ), "Increase `--max_length_generation` to avoid truncation"
                gen = generation[len(prefix) :].strip()
                for stop_word in self.stop_words:
                    gen = gen.split(stop_word)[0].strip()
            if self._mode == "baseline":
                resp = gen.strip()
            elif self._mode == "scratchpad":
                flag = "ANSWER:"
                resp = gen.split(flag)[-1].strip()
            elif self._mode == "neurosymbolic":
                flag = "FOL:"
                parses = [
                    line.replace(flag, "").strip()
                    for line in gen.split("\n")
                    if flag in line
                ]
                # the last FOL line is the conclusion, everything before it a premise
                premises, conclusion = parses[:-1], parses[-1]
                resp = evaluate(premises, conclusion)
            elif self._mode == "cot":
                flag = "ANSWER:"
                resp = gen.split(flag)[-1].strip()
            else:
                raise ValueError(f"Invalid mode: {self._mode}")
            assert resp in ["True", "False", "Uncertain"], f"Invalid generation: {resp}"
            return resp
        except Exception as e:
            # TODO: explore failure cases and improve postprocessing
            print(f"Error in parsing and/or evaluating LLM output: {e}")
            return self.ERROR_TOKEN

    @staticmethod
    def metric(generations, references, error_token):
        """Majority-vote accuracy.

        For each task the generations equal to `error_token` are dropped and
        the most common remaining answer is compared with the reference. A
        task with no usable generation counts as incorrect.

        :param generations: list(list(str))
        :param references: list(str)
        :param error_token: str
        :return: dict[str: float]
        """
        metric_name = "accuracy (pass@1 majority)"
        if len(references) == 0:
            # nothing to score; avoid dividing by zero
            return {metric_name: 0.0}
        correct = 0
        for gens, ref in zip(generations, references):
            gens = [gen for gen in gens if gen != error_token]
            if len(gens) > 0:
                majority = Counter(gens).most_common(1)[0][0]
                if majority == ref:
                    correct += 1
        return {metric_name: correct / len(references)}

    def process_results(self, generations, references):
        """
        Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations as in {"metric_name": result}.
        We encourage to directly load the metric from `evaluate` library to keep the code concise.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        return self.metric(generations, references, self.ERROR_TOKEN)

    def fewshot_examples(self):
        """
        Returns the few-shot examples for the task, formatted and joined into one string.
        The text is built on the first call and stored on the instance; a
        functools.cache decorator on the method would instead keep every
        instance alive inside a cache shared by the whole class.
        :return: str
        """
        cached = getattr(self, "_fewshot_text", None)
        if cached is None:
            cached = "\n".join(
                self.format_train_example(doc)
                for doc in self._train.select(range(self._nshot))
            )
            self._fewshot_text = cached
        return cached


In [19]:
"""
FOLIO: Natural Language Reasoning with First-Order Logic
https://arxiv.org/pdf/2209.00840.pdf
"""
# from eval.base import OWAFOLTask
# from eval.tasks.utils import evaluate, convert_to_nltk_rep

# BibTeX entry for the FOLIO paper (arXiv:2209.00840).
_CITATION = """
@article{han2022folio,
  title={Folio: Natural language reasoning with first-order logic},
  author={Han, Simeng and Schoelkopf, Hailey and Zhao, Yilun and Qi, Zhenting and Riddell, Martin and Benson, Luke and Sun, Lucy and Zubova, Ekaterina and Qiao, Yujie and Burtell, Matthew and others},
  journal={arXiv preprint arXiv:2209.00840},
  year={2022}
}
"""


def create_all_tasks():
    """
    Builds one FOLIO task class per (mode, n-shot) combination.
    :return: dict[str: type]
        maps "folio-{mode}-{n}shot" to a `FOLIOBase` subclass whose constructor
        takes no arguments
    """

    def create_task(mode, n):
        # `mode` and `n` are captured by closure so the class is built as FOLIO().
        class FOLIO(FOLIOBase):
            def __init__(self):
                super().__init__(mode, n)

        return FOLIO

    modes = ["baseline", "scratchpad", "neurosymbolic", "cot"]
    shot_counts = [1, 2, 4, 8, 16]
    registry = {}
    for mode in modes:
        for n in shot_counts:
            registry[f"folio-{mode}-{n}shot"] = create_task(mode, n)
    return registry


class FOLIOBase(OWAFOLTask):
    """
    FOLIO task built from the "validation" split of `benlipkin/folio`.
    Gold FOL annotations are passed through `convert_to_nltk_rep` and checked
    with `evaluate`; samples that fail the check are dropped.
    """

    DATASET_PATH = "benlipkin/folio"
    DATASET_NAME = None

    def __init__(self, mode, n, seed=7):
        """
        :param mode: str
            forwarded to the base task constructor
        :param n: int
            forwarded to the base task constructor
        :param seed: int
            seed used to shuffle the checked validation set
        """
        super().__init__(mode, n)
        # process validation dataset
        self._dataset = self.reformat_fol_samples(self.dataset["validation"]).shuffle(seed)
        self._test = self._dataset.select(range(0, len(self._dataset)))

    def reformat_fol_samples(self, dataset):
        """
        Converts the FOL annotations of every sample and keeps only samples whose
        label is reproduced by `evaluate`.
        :param dataset: dataset exposing `map` and `filter`
        :return: the mapped dataset without rows labelled `self.ERROR_TOKEN`
        """

        def reformat_fol_sample(sample):
            sample["premises-FOL"] = [
                convert_to_nltk_rep(premise) for premise in sample["premises-FOL"]
            ]
            sample["conclusion-FOL"] = convert_to_nltk_rep(sample["conclusion-FOL"])
            try:
                # Explicit checks instead of `assert`, which is stripped under `python -O`.
                if len(sample["premises"]) != len(sample["premises-FOL"]):
                    raise ValueError("premises and premises-FOL differ in length")
                label = evaluate(sample["premises-FOL"], sample["conclusion-FOL"])
                if sample["label"] != label:
                    raise ValueError("evaluated label does not match the gold label")
            except Exception:
                # Any failure while checking marks the sample for removal below.
                sample["label"] = self.ERROR_TOKEN
            return sample

        return dataset.map(reformat_fol_sample).filter(
            lambda x: x["label"] != self.ERROR_TOKEN
        )

In [20]:
class ProofWriterBase(OWAFOLTask):
    """
    ProofWriter task built from the "test" split of
    `theoxo/proofwriter-deduction-balanced`.
    """

    DATASET_PATH = "theoxo/proofwriter-deduction-balanced"
    DATASET_NAME = None

    def __init__(self, mode, n, seed=7):
        """
        :param mode: str
            forwarded to the base task constructor
        :param n: int
            forwarded to the base task constructor
        :param seed: int
            seed used to shuffle the reformatted test set
        """
        super().__init__(mode, n)
        self._test = self.reformat(self.dataset["test"]).shuffle(seed)

    def reformat(self, dataset):
        """
        Rewrites each sample's `theory` / `question` / `answer` fields as
        `premises` / `conclusion` / `label`.
        :param dataset: dataset exposing `map`
        :return: the mapped dataset
        """

        def punctuate(s):
            # `endswith` is safe on "", where `s[-1]` would raise IndexError;
            # an empty string is returned unchanged.
            if s and not s.endswith((".", "?", "!")):
                s += "."
            return s

        def reformat_sample(sample):
            # Splitting on ". " removes the period of every sentence but the last,
            # so each fragment is re-punctuated. Blank fragments (e.g. from a
            # theory ending in ". ") are skipped instead of crashing.
            fragments = (p.strip() for p in sample.pop("theory").split(". "))
            sample["premises"] = [punctuate(p) for p in fragments if p]
            sample["conclusion"] = punctuate(sample.pop("question"))
            sample["label"] = sample.pop("answer")
            return sample

        return dataset.map(reformat_sample)

class FOLIOBase(OWAFOLTask):
    """
    FOLIO task built from the "validation" split of `benlipkin/folio`.
    Samples whose FOL annotations fail the `evaluate` check stay in the dataset
    with their `label` replaced by `ERROR_TOKEN`.
    """

    DATASET_PATH = "benlipkin/folio"
    DATASET_NAME = None

    def __init__(self, mode, n, seed=7):
        super().__init__(mode, n)
        # Convert and check the validation split, then shuffle it with `seed`.
        checked = self.reformat_fol_samples(self.dataset["validation"])
        self._dataset = checked.shuffle(seed)
        # The test set is the whole shuffled dataset.
        self._test = self._dataset.select(range(0, len(self._dataset)))

    def reformat_fol_samples(self, dataset):
        """
        Converts the FOL annotations of every sample and relabels samples whose
        label is not reproduced by `evaluate`.
        :param dataset: dataset exposing `map`
        :return: the mapped dataset
        """

        def reformat_fol_sample(sample):
            sample["premises-FOL"] = list(map(convert_to_nltk_rep, sample["premises-FOL"]))
            sample["conclusion-FOL"] = convert_to_nltk_rep(sample["conclusion-FOL"])
            try:
                assert len(sample["premises"]) == len(sample["premises-FOL"])
                proved = evaluate(sample["premises-FOL"], sample["conclusion-FOL"])
                assert sample["label"] == proved
            except Exception:
                # Length mismatch, evaluation error, or label mismatch.
                sample["label"] = self.ERROR_TOKEN
            return sample

        # NOTE(review): filtering of ERROR_TOKEN rows is commented out here, so
        # failed samples remain in the result — confirm this is intended.
        return dataset.map(reformat_fol_sample)
        # .filter(
        #     lambda x: x["label"] != self.ERROR_TOKEN
        # )

In [21]:
def folio_create_all_tasks():
    """
    Builds one FOLIO task class per (mode, n-shot) combination.
    :return: dict[str: type]
        maps "folio-{mode}-{n}shot" to a `FOLIOBase` subclass whose constructor
        takes no arguments
    """

    def create_task(mode, n):
        # `mode` and `n` are captured by closure so the class is built as FOLIO().
        class FOLIO(FOLIOBase):
            def __init__(self):
                super().__init__(mode, n)

        return FOLIO

    named_classes = (
        (f"folio-{mode}-{n}shot", create_task(mode, n))
        for mode in ["baseline", "scratchpad", "neurosymbolic", "cot"]
        for n in [1, 2, 4, 8, 16]
    )
    return dict(named_classes)

def proofwriter_create_all_tasks():
    """
    Builds one ProofWriter task class per (mode, n-shot) combination.
    :return: dict[str: type]
        maps "proofwriter-{mode}-{n}shot" to a `ProofWriterBase` subclass whose
        constructor takes no arguments
    """

    def create_task(mode, n):
        # `mode` and `n` are captured by closure so the class is built as ProofWriter().
        class ProofWriter(ProofWriterBase):
            def __init__(self):
                super().__init__(mode, n)

        return ProofWriter

    task_classes = {}
    for mode in ["baseline", "scratchpad", "neurosymbolic", "cot"]:
        for n in [1, 2, 4, 8, 16]:
            task_name = f"proofwriter-{mode}-{n}shot"
            task_classes[task_name] = create_task(mode, n)
    return task_classes

In [22]:

# Registry of every task: name -> task class. FOLIO entries are inserted first,
# then ProofWriter entries.
TASK_REGISTRY = dict(folio_create_all_tasks())
TASK_REGISTRY.update(proofwriter_create_all_tasks())

# Sorted list of all registered task names.
ALL_TASKS = sorted(TASK_REGISTRY)


def get_task(task_name):
    """
    Looks up a task class in `TASK_REGISTRY` and returns a new instance of it.
    :param task_name: str
        key of the task in `TASK_REGISTRY`
    :return: instance of the registered task class
    :raises KeyError: if `task_name` is not registered (the registry is printed first)
    """
    # Only the registry lookup is guarded: a KeyError raised inside the task's
    # constructor must propagate as-is rather than be reported as a missing task.
    try:
        task_cls = TASK_REGISTRY[task_name]
    except KeyError:
        print("Available tasks:")
        pprint(TASK_REGISTRY)
        raise KeyError(f"Missing task {task_name}")
    return task_cls()

In [23]:
# ALL_TASKS
# proofwriter_neurosymbolic_1shot = get_task('proofwriter-neurosymbolic-1shot')
# NOTE(review): a bare name in the middle of a cell is evaluated but not displayed.
TASK_REGISTRY
# Instantiate the 1-shot neurosymbolic FOLIO task and fetch its dataset.
folio_neurosymbolic_1shot_task = get_task('folio-neurosymbolic-1shot')
folio_neurosymbolic_1shot_dataset = folio_neurosymbolic_1shot_task.get_dataset()
# Last expression of the cell: displays the dataset summary.
folio_neurosymbolic_1shot_dataset

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Dataset({
    features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
    num_rows: 204
})

In [31]:
# folio_neurosymbolic_1shot_dataset
# Print the full prompt built for the first document of the dataset.
print(folio_neurosymbolic_1shot_task.get_prompt(folio_neurosymbolic_1shot_dataset[0]))
#if a worksheet is not dispensable. From premise 6, we know that a worksheet is either paper or is environment-friendly. If it is paper, then from premise 3, a worksheet is woodware, and from premise 2, a worksheet is dispensable. If it is environment-friendly, we know it is good from premise 5, but we know nothing about whether it is dispensable. Therefore, we don't know if a worksheet is dispensible or not, so the statement is uncertain.\nANSWER:\tUncertain",


The following is a first-order logic (FOL) problem.
The problem is to determine whether the conclusion follows from the premises.
The premises are given in the form of a set of first-order logic sentences.
The conclusion is given in the form of a single first-order logic sentence.
The task is to translate each of the premises and conclusions into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.
Expressions should be adhere to the format of the Python NLTK package logic module.


<PREMISES>
All dispensable things are environment-friendly.
All woodware is dispensable.
All paper is woodware.
No good things are bad.
All environment-friendly things are good.
A worksheet is either paper or is environment-friendly.
</PREMISES>
<CONCLUSION>
A worksheet is not dispensable.
</CONCLUSION>
<EVALUATE>
TEXT:	All dispensable things are environment-friendly.
FOL:	all x. (Dispensable(x) -> EnvironmentFriendly(x)

In [26]:
# Display the instruction text of the task.
folio_neurosymbolic_1shot_task.get_instructions()

'The following is a first-order logic (FOL) problem.\nThe problem is to determine whether the conclusion follows from the premises.\nThe premises are given in the form of a set of first-order logic sentences.\nThe conclusion is given in the form of a single first-order logic sentence.\nThe task is to translate each of the premises and conclusions into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.\nExpressions should be adhere to the format of the Python NLTK package logic module.\n\n'

In [27]:
# Display the joined few-shot examples of the task.
folio_neurosymbolic_1shot_task.fewshot_examples()

'<PREMISES>\nAll dispensable things are environment-friendly.\nAll woodware is dispensable.\nAll paper is woodware.\nNo good things are bad.\nAll environment-friendly things are good.\nA worksheet is either paper or is environment-friendly.\n</PREMISES>\n<CONCLUSION>\nA worksheet is not dispensable.\n</CONCLUSION>\n<EVALUATE>\nTEXT:\tAll dispensable things are environment-friendly.\nFOL:\tall x. (Dispensable(x) -> EnvironmentFriendly(x))\nTEXT:\tAll woodware is dispensable.\nFOL:\tall x. (Woodware(x) -> Dispensable(x))\nTEXT:\tAll paper is woodware.\nFOL:\tall x. (Paper(x) -> Woodware(x))\nTEXT:\tNo good things are bad.\nFOL:\tall x. (Good(x) -> -Bad(x))\nTEXT:\tAll environment-friendly things are good.\nFOL:\tall x. (EnvironmentFriendly(x) -> Good(x))\nTEXT:\tA worksheet is either paper or is environment-friendly.\nFOL:\t((Paper(Worksheet) & -EnvironmentFriendly(Worksheet)) | (-Paper(Worksheet) & EnvironmentFriendly(Worksheet)))\nTEXT:\tA worksheet is not dispensable.\nFOL:\t-Dispensa

In [30]:
# Print the first training document formatted as a single few-shot example.
print(folio_neurosymbolic_1shot_task.format_train_example(folio_neurosymbolic_1shot_task._train[0]))

<PREMISES>
All dispensable things are environment-friendly.
All woodware is dispensable.
All paper is woodware.
No good things are bad.
All environment-friendly things are good.
A worksheet is either paper or is environment-friendly.
</PREMISES>
<CONCLUSION>
A worksheet is not dispensable.
</CONCLUSION>
<EVALUATE>
TEXT:	All dispensable things are environment-friendly.
FOL:	all x. (Dispensable(x) -> EnvironmentFriendly(x))
TEXT:	All woodware is dispensable.
FOL:	all x. (Woodware(x) -> Dispensable(x))
TEXT:	All paper is woodware.
FOL:	all x. (Paper(x) -> Woodware(x))
TEXT:	No good things are bad.
FOL:	all x. (Good(x) -> -Bad(x))
TEXT:	All environment-friendly things are good.
FOL:	all x. (EnvironmentFriendly(x) -> Good(x))
TEXT:	A worksheet is either paper or is environment-friendly.
FOL:	((Paper(Worksheet) & -EnvironmentFriendly(Worksheet)) | (-Paper(Worksheet) & EnvironmentFriendly(Worksheet)))
TEXT:	A worksheet is not dispensable.
FOL:	-Dispensable(Worksheet)
</EVALUATE>



In [None]:
# NOTE(review): scratch paste, not runnable as a cell: `return` is used outside a
# function and `self` is not defined at cell level.
examples = []
for doc in self._train.select(range(self._nshot)):
    examples.append(self.format_train_example(doc))
return "\n".join(examples)

# NOTE(review): `tasks` and `task_name` are not defined in the visible part of
# this notebook; presumably copied from an evaluation driver — confirm.
task = tasks.get_task(task_name)
dataset = task.get_dataset()
# Use `self.args.limit` documents when it is truthy, otherwise the whole dataset.
n_tasks = self.args.limit if self.args.limit else len(dataset)
prompts = [task.get_prompt(dataset[i]) for i in range(n_tasks)]
# The task's stop words, repeated once per prompt.
stops = [task.stop_words for _ in range(n_tasks)]

In [None]:
def postprocess_generation(self, generation, idx, completion_only=False):
    """
    Reduces a raw LM generation to one of "True", "False" or "Uncertain".
    Any failure while parsing or evaluating is printed and turned into
    `self.ERROR_TOKEN`.
    :param generation: str
        generation from LM
    :param idx: int
        index of doc in the dataset to which the generation belongs
        (only used when `completion_only` is False)
    :param completion_only: bool
        if True, `generation` holds only the completion; otherwise it must start
        with the prompt, which is removed before truncating at the stop words
    :return: str
        the answer, or `self.ERROR_TOKEN` on failure
    """
    try:
        if not completion_only:
            prompt = self.get_prompt(self.get_dataset()[idx])
            assert generation.startswith(
                prompt
            ), "Increase `--max_length_generation` to avoid truncation"
            gen = generation[len(prompt) :].strip()
            # Keep only the text before each stop word.
            for stop_word in self.stop_words:
                gen = gen.split(stop_word)[0].strip()
        else:
            gen = generation.strip()

        if self._mode == "baseline":
            resp = gen.strip()
        elif self._mode in ("scratchpad", "cot"):
            # The answer is whatever follows the last "ANSWER:" marker.
            resp = gen.split("ANSWER:")[-1].strip()
        elif self._mode == "neurosymbolic":
            marker = "FOL:"
            parses = []
            for line in gen.split("\n"):
                if marker in line:
                    parses.append(line.replace(marker, "").strip())
            # Every FOL line but the last is a premise; the last is the conclusion.
            premises, conclusion = parses[:-1], parses[-1]
            resp = evaluate(premises, conclusion)
        else:
            raise ValueError(f"Invalid mode: {self._mode}")

        assert resp in ["True", "False", "Uncertain"], f"Invalid generation: {resp}"
        return resp
    except Exception as e:
        # TODO: explore failure cases and improve postprocessing
        print(f"Error in parsing and/or evaluating LLM output: {e}")
        return self.ERROR_TOKEN

In [87]:
# Alias for the sorted list of registered task names.
task_names = ALL_TASKS
# Instantiate a task; as the cell's last expression, the instance is displayed.
get_task('folio-baseline-16shot')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Filter:   0%|          | 0/204 [00:00<?, ? examples/s]

<__main__.folio_create_all_tasks.<locals>.create_task.<locals>.FOLIO at 0x7fbd871f6290>

In [75]:
# Build FOLIOBase directly (mode 'neurosymbolic', n=3), bypassing the task registry.
folo_base = FOLIOBase('neurosymbolic',3)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Filter:   0%|          | 0/204 [00:00<?, ? examples/s]

In [51]:
# from datasets import load_dataset, Dataset
# Dataset.load_from_disk('/workspace/hyf/ananke/example/pipeline/logic_experiment/Fight_for_LINC/Chain-of-Context/metaeval_folio/train')

# Display the test split stored on the task instance.
folo_base._test

Dataset({
    features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
    num_rows: 0
})

In [76]:
# !pip install ipywidgets
# Display the task instance itself.
folo_base

In [None]:
# import os
# 

# # proxy = 'http://<user>:<pass>@<proxy>:<port>'
# proxy = 'http://192.168.1.123:7890'
# proxys =  'https://192.168.1.123:7890'
# os.environ['http_proxy'] = proxy 
# os.environ['HTTP_PROXY'] = proxy
# os.environ['https_proxy'] = proxys
# os.environ['HTTPS_PROXY'] = proxys

In [61]:
from datasets import load_dataset
# Load the dataset from the relative path './benlipkin/folio' and display it.
# NOTE(review): this depends on the kernel's working directory — confirm.
load_dataset('./benlipkin/folio')


DatasetDict({
    validation: Dataset({
        features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
        num_rows: 204
    })
})

In [None]:

# Rebuild the FOLIOBase instance (mode 'neurosymbolic', n=3).
folo_base = FOLIOBase('neurosymbolic',3)