In [None]:
import openai
import ray
import pandas as pd
import time
from dotenv import load_dotenv
from langchain.chat_models import ChatAnyscale, ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import os
from multichoice import Multichoice
from prompt_mgr import PromptMgr

In [None]:
# Load variables from the local .env file, then read the two API keys.
load_dotenv('.env')
# Indexing os.environ raises KeyError right here if either key is missing.
oai_key = os.environ['OPENAI_API_KEY']  # passed to ChatOpenAI in query_model
ae_key = os.environ['AE_API_KEY']  # Anyscale Endpoints key, passed to ChatAnyscale and Cleaner

In [None]:

# Evaluation rows; query_model reads the 'article_sent', 'correct_sent' and
# 'incorrect_sent' fields of each row.
df = pd.read_json('resources/datasets/val_sentence_pairs.json')

In [None]:
# Maps each model identifier to the short name used for its result columns.
model_short_names = {
    'openai://gpt-3.5-turbo': 'gpt35',
    'openai://gpt-4': 'gpt4',
    'meta-llama/Llama-2-7b-chat-hf': 'llama7',
    'meta-llama/Llama-2-13b-chat-hf': 'llama13',
    'meta-llama/Llama-2-70b-chat-hf': 'llama70',
}

In [None]:
# Models to query, in the order their results are gathered. Identifiers that
# start with 'openai://' go to ChatOpenAI; all others go to ChatAnyscale.
models_to_test = [
    'openai://gpt-3.5-turbo',
    'openai://gpt-4',
    'meta-llama/Llama-2-7b-chat-hf',
    'meta-llama/Llama-2-13b-chat-hf',
    'meta-llama/Llama-2-70b-chat-hf',
]

In [None]:
# Prompt templates are loaded from the 'answer_first' environment. You can also
# try 'answer_last' or 'reduce_bias' here; results do not substantially change.
pm = PromptMgr(src_dir = 'resources/environments/answer_first/prompts')

In [None]:
def query_model(row, model_name, prompt_mgr, swap_answers = False):
    """Send one row's two candidate sentences to a chat model and return its reply.

    Args:
        row: Mapping-like row providing 'article_sent', 'correct_sent' and
            'incorrect_sent'.
        model_name: Model identifier. A name starting with 'openai://' is sent
            to ChatOpenAI (with that marker removed); any other name is sent
            to ChatAnyscale.
        prompt_mgr: Prompt manager used to render the 'consistent' and
            'system' prompts.
        swap_answers: When False the correct sentence is option A; when True
            it is option B.

    Returns:
        dict with a single key 'output' holding the content of the model reply.
    """
    # Only the assignment of sentences to the A/B slots depends on swap_answers.
    if swap_answers:
        key_a, key_b = 'incorrect_sent', 'correct_sent'
    else:
        key_a, key_b = 'correct_sent', 'incorrect_sent'

    user_prompt = prompt_mgr.bind('consistent').render(
        article_sent=row['article_sent'],
        option_a=row[key_a],
        option_b=row[key_b],
    )
    system_prompt = prompt_mgr.bind('system').render()

    openai_marker = 'openai://'
    if model_name.startswith(openai_marker):
        # Needs lots of retries due to rate limiting.
        chat = ChatOpenAI(
            model_name=model_name.replace(openai_marker, ''),
            openai_api_key=oai_key,
            temperature=0,
            max_retries=35,
        )
    else:
        chat = ChatAnyscale(
            model_name=model_name,
            anyscale_api_key=ae_key,
            temperature=0,
            max_retries=35,
        )

    reply = chat([
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_prompt),
    ])
    return {'output': reply.content}
     

In [None]:
# Smoke test: query the first model on the row labelled 0, with the answers swapped.
result = query_model(df.loc[0], models_to_test[0], pm, swap_answers = True)

In [None]:
# Display the dict returned by the smoke-test query.
result

In [None]:
class ModelQuery:
    """Callable that fixes the model, prompt manager and answer order for per-row queries.

    Calling an instance with a row forwards to ``query_model`` using the
    stored settings.
    """

    def __init__(self, model, pm, swap_answers = False):
        """Store the model identifier, prompt manager and swap flag."""
        self.model, self.pm, self.swap_answers = model, pm, swap_answers

    def __call__(self, row):
        """Query the stored model for ``row`` and return ``query_model``'s result."""
        return query_model(
            row,
            self.model,
            self.pm,
            swap_answers=self.swap_answers,
        )

In [None]:
@ray.remote
def convert_to_pandas(ds):
    """Return ``ds.to_pandas()``; declared as a Ray remote function.

    Args:
        ds: Object exposing ``to_pandas()`` (the callers pass mapped datasets).

    Returns:
        Whatever ``ds.to_pandas()`` returns.
    """
    frame = ds.to_pandas()
    return frame

In [None]:
num_shards = 3 # Reasonable number. We could split more finely if we wanted to. 
num_cpus = 0.1 # A guess at the overhead of making these calls concurrently. 
# Slots 0..n-1 hold the normal-order runs and slots n..2n-1 the swapped runs,
# both in models_to_test order (n = len(models_to_test)).
ds_by_model = [None] * len(models_to_test)*2
ds = ray.data.from_pandas(df).repartition(num_shards)
for i in range(len(models_to_test)): 
    # Two mapped datasets are set up per model: one with the normal answer order and one swapped.
    # NOTE(review): both ModelQuery objects are given the same `pm`; this presumably relies on
    # each task receiving its own copy so they don't overwrite one another -- confirm.
    mq = ModelQuery(models_to_test[i], pm)
    ds_by_model[i]= ds.map(mq, num_cpus=num_cpus)
    
    mq_swap = ModelQuery(models_to_test[i], pm, swap_answers=True)
    ds_by_model[i+len(models_to_test)] = ds.map(mq_swap, num_cpus=num_cpus)
    

In [None]:

st = time.time() 
# Submit one convert_to_pandas task per mapped dataset (every model, both answer orders).
futures = [convert_to_pandas.remote(ds) for ds in ds_by_model]

# Wait for all tasks; `results` is index-aligned with ds_by_model.
results = ray.get(futures) 
et = time.time()
print(f'Gathering results took {et-st} wall clock seconds.')
# Typical time is about 700 seconds on a g5.12xlarge 
# Expect a few internal server errors, bad gateways, rate limits etc. 

In [None]:
# Now assign the results to the table. Both halves went through the same ray.get() call,
# so the first half of `results` is unswapped and the second half is swapped.
# NOTE(review): each results[i] is the frame returned by to_pandas(); this presumably has a
# single 'output' column and the same row order/index as df -- confirm.

for i in range(len(models_to_test)):
    df[model_short_names[models_to_test[i]]] = results[i]
for i in range(len(models_to_test), 2*len(models_to_test)):
    # Swapped runs are stored under '<short name>-swap'.
    df[model_short_names[models_to_test[i-len(models_to_test)]]+'-swap'] = results[i]

In [None]:
# This is a good point to save the query results; after this you can focus on the data processing without having to re-query the models.

In [None]:
# Write the table, including the raw model outputs, to a JSON file.
df.to_json('llm_fact.json')

In [None]:
# Reload the table from the saved JSON file, replacing the in-memory df.
df = pd.read_json('llm_fact.json')

In [None]:
class Cleaner:
    """Callable that cleans one column of a row.

    Stores the column name, the prompt manager and the Anyscale Endpoints
    key. Each call builds a ``Multichoice`` from the prompt manager and key
    and runs ``extract_choice`` on the row's value in that column.
    """

    def __init__(self, col, pm, ae_key):
        """Remember which column to clean and what to build ``Multichoice`` with."""
        self.col, self.pm, self.ae_key = col, pm, ae_key

    def __call__(self, row):
        """Return ``{'output': <extracted choice>}`` for ``row[self.col]``."""
        extractor = Multichoice(self.pm, self.ae_key)
        choice = extractor.extract_choice(row[self.col])
        return {'output': choice}

In [None]:
# Raw-output columns to clean: each model's normal run followed by its swapped run.
to_relabel = [
    short_name + suffix
    for short_name in ('gpt4', 'gpt35', 'llama7', 'llama13', 'llama70')
    for suffix in ('', '-swap')
]


In [None]:
num_shards = 10 # Reasonable number. We could split more finely if we wanted to. 
num_cpus = 0.1 # A guess at the overhead of making these calls concurrently. 
# Slot i of `cleaned` holds the mapped dataset for column to_relabel[i].
cleaned = [None] * len(to_relabel)
ds = ray.data.from_pandas(df).repartition(num_shards)
for i in range(len(to_relabel)): 
    # One Cleaner (and one mapped dataset) per raw-output column.
    col = to_relabel[i] 
    cleaner = Cleaner(col, pm, ae_key)
    # Use the num_cpus setting above instead of repeating the literal 0.1.
    cleaned[i] = ds.map(cleaner, num_cpus=num_cpus)

In [None]:
st = time.time() 
# Submit one convert_to_pandas task per cleaned column, then wait for all of them.
futures = [convert_to_pandas.remote(ds) for ds in cleaned]
# `results` is index-aligned with `cleaned`; it overwrites the `results` of the query step.
results = ray.get(futures) 
et = time.time()
print(f'Cleaning took {et-st} wall clock seconds.')
# NOTE(review): the timing note below is worded identically to the one on the query step --
# confirm it was measured for cleaning.
# Typical time is about 700 seconds on a g5.12xlarge 
# Expect a few internal server errors, bad gateways, rate limits etc. 

In [None]:
# Store each cleaned result as '<column>-clean'; results[i] corresponds to to_relabel[i].
for i in range(len(to_relabel)):
    df[to_relabel[i]+'-clean'] = results[i]

In [None]:


def is_correct(row):
    """Classify a pair of cleaned answers for one question.

    Args:
        row: Two-element sequence or pandas Series. The first item is the
            answer given with the normal option order (correct sentence is
            option A), the second is the answer given with the options
            swapped (correct sentence is option B).

    Returns:
        'Y' if the answers are 'A' then 'B' (correct in both orders);
        the answer doubled (e.g. 'AA' or 'BB') if both answers are the same
        string, i.e. the same position was picked regardless of content;
        'N' in every other case, including missing (non-string) answers.
    """
    # Read the two values by position. Integer-indexing a Series that has
    # string labels (row[0]) is deprecated positional access in pandas.
    values = list(row)
    first, second = values[0], values[1]

    if first == 'A' and second == 'B':
        return 'Y'
    # Only double real strings: None * 2 would raise TypeError.
    if isinstance(first, str) and first == second:
        return first * 2
    return 'N'

In [None]:
# Spot-check the cleaned answers in the 'gpt4-clean' column.
df['gpt4-clean']

In [None]:
# Classify every row for each model from its normal-order and swapped-order cleaned answers.
for sn in ('gpt35', 'gpt4', 'llama7', 'llama13', 'llama70'):
    answer_cols = [f'{sn}-clean', f'{sn}-swap-clean']
    df[f'{sn}-correct'] = df[answer_cols].apply(is_correct, axis=1)

In [None]:
# Show the raw, cleaned and classified 'gpt35' columns side by side.
df[['gpt35', 'gpt35-swap', 'gpt35-clean', 'gpt35-swap-clean', 'gpt35-correct']]

In [None]:
def acc_bias(col, data=None):
    """Summarise accuracy and position bias for one classification column.

    Args:
        col: Name of a column whose values are classification labels such as
            'Y', 'AA', 'BB' and 'N'.
        data: Optional DataFrame to read ``col`` from. Defaults to the
            notebook-level ``df``.

    Returns:
        Tuple ``(acc, aa_ratio, bb_ratio, bias, towards)`` where
        ``acc`` is the fraction of rows labelled 'Y',
        ``aa_ratio`` and ``bb_ratio`` are the percentages of rows labelled
        'AA' and 'BB',
        ``bias`` is the absolute difference between the 'AA' and 'BB'
        fractions (a fraction, not a percentage), and
        ``towards`` is 'A' when 'AA' is strictly more frequent than 'BB',
        otherwise 'B' (including ties).
    """
    frame = df if data is None else data
    shares = frame[col].value_counts() / len(frame)

    # A label that never occurs is absent from value_counts(); treat it as 0
    # rather than raising KeyError.
    acc = shares.get('Y', 0.0)
    aa_share = shares.get('AA', 0.0)
    bb_share = shares.get('BB', 0.0)

    bias = abs(aa_share - bb_share)
    towards = 'A' if aa_share > bb_share else 'B'

    return acc, aa_share * 100, bb_share * 100, bias, towards

In [None]:
# Print one summary line per model. acc_bias returns accuracy and bias as
# fractions, so both are scaled to percentages here.
for m in ('gpt35', 'gpt4', 'llama7', 'llama13', 'llama70'):
    acc, aa_ratio, bb_ratio, bias, towards = acc_bias(m+'-correct')
    acc, bias = acc*100, bias*100
    print(f'{m}:   \tAccuracy: {acc:.1f}%  \tAA: {aa_ratio:.1f}%\tBB: {bb_ratio:.1f}%\tBias: {bias:.1f}% towards {towards}')