In [None]:
!pip install openai
!pip install ray
!pip install optuna
!pip install sentence-transformers
!pip install tqdm
!pip install dimod
!pip install dwave-neal

In [1]:
import openai
from openai import OpenAI
import json
import hashlib
import os
import numpy as np
import ray
from ray import train, tune
import optuna
import re
import random
import itertools
from sentence_transformers import SentenceTransformer
import tqdm
import math
import dimod, neal, time
from os import listdir
from os.path import isfile, join
import pandas as pd
import matplotlib.pyplot as plt

def formattedReasons(raw_res, rounder=0):
    """Turn weighted reason keys into sorted ``"(share) reason"`` strings.

    Every entry of ``raw_res`` is read as ``"<reason>_<k>"``: the text after
    the last underscore is parsed as an integer ``k`` and the entry contributes
    ``2**k`` to ``<reason>``. Contributions are accumulated per reason and then
    divided by the total over all entries. A reason that starts with
    ``"N*: "`` loses that prefix and has its share negated.

    Args:
        raw_res: iterable of strings of the form ``"<reason>_<int>"``.
        rounder: number of decimals passed to ``round``; a falsy value
            (the default 0) leaves the shares unrounded.

    Returns:
        List of ``"(<share>) <reason>"`` strings in descending *string* order
        (the sort is lexicographic, not numeric). Reasons whose share is zero
        after rounding are left out.

    Raises:
        ValueError: if the text after the last underscore is not an integer.
    """
    negation_prefix = "N*: "
    per_reason = {}
    grand_total = 0

    for entry in raw_res:
        # Split on the LAST underscore only: the reason text may contain "_".
        reason, _, exponent = entry.rpartition("_")
        contribution = math.pow(2, int(exponent))
        grand_total += contribution
        per_reason[reason] = per_reason.get(reason, 0) + contribution

    formatted = []
    for reason, total in per_reason.items():
        share = total / grand_total
        if reason.startswith(negation_prefix):
            share = share * -1
            reason = reason[len(negation_prefix):]
        if rounder:
            share = round(share, rounder)
        # Rounding can collapse a tiny share to 0.0 / -0.0; those are dropped.
        if abs(share) > 0:
            formatted.append(f"({share}) {reason}")

    formatted.sort(reverse=True)
    return formatted

def simulatedAnnealing(BQM, n_sweeps, n_reads):
    """Run neal's simulated-annealing sampler on ``BQM`` and return the best sample.

    Args:
        BQM: the model handed unchanged to ``SimulatedAnnealingSampler.sample``.
        n_sweeps: forwarded as ``num_sweeps``.
        n_reads: forwarded as ``num_reads``.

    Returns:
        Tuple ``(best_reasons, time_spent, best_energy)`` where
        ``best_reasons`` lists the variables whose value is 1 in
        ``samples.first``, ``time_spent`` is the wall-clock duration of the
        ``sample`` call in seconds (``time.perf_counter``), and
        ``best_energy`` is the energy of ``samples.first``.
    """
    sampler = neal.SimulatedAnnealingSampler()

    # Only the sampling call itself is timed, not the sampler construction.
    start_time = time.perf_counter()
    samples = sampler.sample(BQM, num_sweeps=n_sweeps, num_reads=n_reads)
    time_spent = time.perf_counter() - start_time

    # Look `first` up once and reuse it for both the assignment and the energy.
    best = samples.first
    best_reasons_DWSA = [k for k, v in best.sample.items() if v == 1]
    return best_reasons_DWSA, time_spent, best.energy

def decode(prompt, systemInstruction, temperature, numSamples, model = "gpt-3.5-turbo", maxTokens = 350):
    """Request ``numSamples`` chat completions from OpenAI and return their texts.

    Args:
        prompt: a single string, or a list of strings; each string becomes one
            ``user`` message, in order.
        systemInstruction: text for a leading ``system`` message, or ``None``
            to send no system message.
        temperature: forwarded as ``temperature``.
        numSamples: forwarded as ``n`` (number of completions requested).
        model: forwarded as ``model``.
        maxTokens: forwarded as ``max_tokens``.

    Returns:
        List with the ``message.content`` of every returned choice.
    """
    # Keep the key out of the notebook: it is read from the OPENAI_API_KEY
    # environment variable. The placeholder is only a fallback for users who
    # prefer to paste their key here instead.
    client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY", "ADD KEY HERE"))

    prompts = prompt if isinstance(prompt, list) else [prompt]
    message = [{"role": "user", "content": text} for text in prompts]

    if systemInstruction is not None:
        message = [{"role": "system", "content": systemInstruction}] + message

    completion = client.chat.completions.create(
      model=model,
      messages= message,
      n = numSamples,
      temperature = temperature,
      max_tokens = maxTokens,
    )
    return [choice.message.content for choice in completion.choices]

def formatResults(path, bbhDir = "bbhFiles/"):
    """Compute one accuracy per task from a results CSV.

    Task names are the ``*.json`` file names (extension removed) found
    directly inside ``bbhDir``. For each task the accuracy is the sum of the
    ``Correctness`` column over the rows whose ``Dataset`` equals the task,
    divided by the number of such rows. The ``tracking_shuffled_objects_*``
    and ``logical_deduction_*`` variants are then replaced by the mean of
    their accuracies under the keys ``tracking_shuffled_objects`` and
    ``logical_deduction``.

    Args:
        path: CSV file with at least the columns ``Dataset`` and ``Correctness``.
        bbhDir: directory holding one ``<task>.json`` file per task.

    Returns:
        Dict mapping task name to accuracy. A task with no rows in the CSV
        gets NaN (0/0).
    """
    suffix = ".json"
    # Only real task files count: sub-directories and stray non-JSON files
    # (e.g. ".DS_Store") would otherwise show up as bogus tasks.
    taskNames = [f[:-len(suffix)] for f in listdir(bbhDir)
                 if f.endswith(suffix) and isfile(join(bbhDir, f))]
    df = pd.read_csv(path)

    results = dict()
    for task in taskNames:
        indices = df['Dataset'] == task
        results[task] = df["Correctness"][indices].sum()/indices.sum()

    # Collapse the size variants of the two multi-part tasks into one averaged score each.
    results["tracking_shuffled_objects"] = np.mean([results[task] for task in taskNames if task.startswith("tracking")])
    results["logical_deduction"] = np.mean([results[task] for task in taskNames if task.startswith("logical")])
    results = {k: v for k, v in results.items()
               if not k.startswith(("tracking_shuffled_objects_", "logical_deduction_"))}
    return results

# Ray remote wrapper around `decode`; invoked as `remoteDecode.remote(...)`,
# with the results collected through `ray.get`.
remoteDecode = ray.remote(decode)

In [2]:
class combinatorialReasoner:
    """Samples LLM reasoning for a question and answers it from a stored QUBO.

    ``sample`` collects (and caches on disk) many step-by-step completions for
    a question and extracts the condensed reasons written in curly braces.
    ``answer`` loads a pre-computed QUBO for the question, selects reasons by
    simulated annealing and asks the LLM for a final answer given them.
    """

    def __init__(self):
        # Neither attribute is read by the methods defined in this class body.
        self.simThreshold = 0.9
        self.model = SentenceTransformer('all-mpnet-base-v2')


    def sample(self, dataset, questionNo, forceResample = False):
        """Return the condensed reasons sampled for one question.

        Completions are cached in ``bbhFiles/<dataset>/question_<questionNo>.npy``;
        the LLM is only queried when that file is missing or ``forceResample``
        is true (3 remote ``decode`` calls of 70 samples each).

        Args:
            dataset: task name; the question is read from ``bbhFiles/<dataset>.json``.
            questionNo: index into the ``examples`` list of that file.
            forceResample: query the LLM again even if a cache file exists.

        Returns:
            Tuple ``(reasonSamples, res)``: ``res`` is the cached array of raw
            completions and ``reasonSamples`` maps ``(i, j)`` (position in
            ``res``) to the list of ``{...}`` snippets longer than 15
            characters found in ``res[i][j]``; positions without any such
            snippet are absent.
        """
        cachePath = f"bbhFiles/{dataset}/question_{questionNo}.npy"
        if not os.path.isfile(cachePath) or forceResample:
            print("Sampling Question")
            with open("bbhFiles/" + dataset + ".json") as questionFile:
                data = json.load(questionFile)
            question = data["examples"][questionNo]["input"]

            resultIDs = [remoteDecode.remote(["Output template: \nStep 1: reasoning \n{condensed reason}\nStep 2: reasoning \n{condensed reason} ....", question],
                                        "Let's think step by step. After each step, condense the reasoning in the step into a sentence and put it in curly braces.", 
                                        temperature = 1., 
                                        numSamples = 70,
                                        maxTokens = 1000)
                                        for _ in range(3)]
            results = np.array(ray.get(resultIDs))
            # np.save does not create parent directories; make sure the cache folder exists.
            os.makedirs(os.path.dirname(cachePath), exist_ok = True)
            np.save(cachePath, results)
        
        res = np.load(cachePath)
        reasonSamples = {}
        for i in range(res.shape[0]):
            for j in range(res.shape[1]):
                # Non-greedy match of every {...} group; the length filter keeps
                # only snippets longer than 15 characters (braces included).
                found = [k for k in re.findall(r'\{.*?\}', res[i][j]) if len(k) > 15]
                if found:
                    reasonSamples[(i, j)] = found
        return reasonSamples, res

    def answer(self, hyperparams, dataset, questionNo):
        """Answer one question using the reasons selected from its stored QUBO.

        Args:
            hyperparams: not used for any computation here; copied into the
                returned record.
            dataset: task name, used to locate the QUBO and the question file.
            questionNo: question index, used to locate the QUBO and the example.

        Returns:
            List ``[correct, response, answer, dataset, questionNo, hyperparams,
            bestReasons, bestEnergy, timeSpent, prompt]`` where ``correct`` is 1
            if the target answer occurs in the text after the last
            ``"SOLUTION:"`` of the LLM response and 0 otherwise.

        Raises:
            Exception: if ``QUBOs/<dataset>/qubo_<questionNo>.npy`` does not exist.
        """
        quboPath = f"QUBOs/{dataset}/qubo_{questionNo}.npy"
        if not isfile(quboPath):
            raise Exception("Cannot find the QUBO for this question, please check the path.")
        
        # Load the file that was just checked, addressed by this call's own
        # arguments rather than by notebook-level loop variables.
        # allow_pickle=True unpickles the stored object: only load QUBO files you trust.
        BQM = np.load(quboPath, allow_pickle = True).item()
        bestReasons, timeSpent, bestEnergy = simulatedAnnealing(BQM, n_sweeps = 1000, n_reads = 100) # Choose number of sweeps and reads here
        weightedBestReasons = formattedReasons(bestReasons, 3)

        # NOTE(review): this reads "bbh_testing/bbhFiles/" while sample() reads
        # "bbhFiles/" - confirm which location is intended.
        with open("bbh_testing/bbhFiles/" + dataset + ".json") as questionFile:
            data = json.load(questionFile)
        question = data["examples"][questionNo]["input"]
        answer = data["examples"][questionNo]["target"]
        prompt = "Q: " + question + "\nW-Statements:\n" + "\n".join(weightedBestReasons)
        systemInstruction ="Each W-Statement starts with the substring (w), where (w) is a number called the W-Value. Identify and state each W-Value. W-Statements with higher W-Values have more reliable information. You may not provide multiple possible answers, you must narrow your final solution down to a single answer. Refer to each W-Statement and their W-Values in your reasoning. Your final answer must be of the form SOLUTION: (option)."
        response = decode(prompt, systemInstruction, temperature=0., numSamples = 1, maxTokens=500)[0]
        # Substring check against whatever follows the last "SOLUTION:" marker.
        correct = int(answer in response.split("SOLUTION:")[-1])
        return [correct, response, answer, dataset, questionNo, hyperparams, bestReasons, bestEnergy, timeSpent, prompt]

In [16]:
# Task names, addressed by position: index i here is also the first index into `testing`.
tasks = ['causal_judgement',
         'reasoning_about_colored_objects',
         'navigate',
         'penguins_in_a_table',
         'geometric_shapes',
         'disambiguation_qa',
         'tracking_shuffled_objects_five_objects',
         'word_sorting',
         'tracking_shuffled_objects_three_objects',
         'tracking_shuffled_objects_seven_objects',
         'multistep_arithmetic_two',
         'web_of_lies',
         'logical_deduction_three_objects',
         'sports_understanding',
         'snarks',
         'logical_deduction_five_objects',
         'salient_translation_error_detection',
         'hyperbaton',
         'movie_recommendation',
         'object_counting',
         'logical_deduction_seven_objects',
         'temporal_sequences',
         'formal_fallacies',
         'dyck_languages',
         'date_understanding',
         'boolean_expressions',
         'ruin_names']
# Question numbers to evaluate, read as testing[i, j] in the loop below.
testing = np.load("bbhQuestionNumbers.npy")
cr = combinatorialReasoner()

# All (i, j) pairs visited by the evaluation: 27 values of i times 50 values of j.
iterator = list(itertools.product(range(27), range(50)))

# uncomment below line if you want to randomly iterate through the evaluation set
#random.shuffle(iterator)

# Same settings for every question; they are only recorded alongside each result.
hyperparams = {
    'linearSensitivity': 53.1543,
    'threshParam': -1.90906,
    'riskParam': 1.37128,
    'weight': 2,
}

records = []
for i, j in tqdm.tqdm(iterator):
    record = cr.answer(hyperparams, tasks[i], testing[i, j])
    records.append(record)

# Column order mirrors the list returned by combinatorialReasoner.answer.
resultColumns = [
    "Correctness",
    'LLM Response',
    'Answer',
    'Dataset',
    'Question Number',
    "Hyperparameters",
    "Best Reasons",
    "Best Energy",
    "Time Spent",
    "LLM Prompt",
]
pathToResults = "reproducitbility.csv"
df = pd.DataFrame(records, columns = resultColumns)
df.to_csv(pathToResults)

  cur_risk_lookup[tmp_n]=np.sqrt((1-cur_val)*cur_val)*risk_p
100%|██████████| 1350/1350 [15:07<00:00,  1.49it/s]


In [None]:
""" Dictionary of automatically parsed accuracies. Will be correct for most multiple choice datasets,
    but fails for dyck languages, word sorting, and a few others"""

# Reduce the per-question rows stored at `pathToResults` to one accuracy per task
# (formatResults averages the tracking_shuffled_objects_* and logical_deduction_*
# variants into a single entry each).
crResultsDict = formatResults(pathToResults)
# Bare last expression so the notebook shows the dictionary as the cell output.
crResultsDict