## Collect LLM responses to questions from datasets


In [74]:
import numpy as np
from numpy import log2, prod, mean
import pandas as pd
from datasets import load_dataset
import torch
import random
import os
import openai
import json
from json import loads, dumps
import math
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import tqdm
import time
from tenacity import retry, stop_after_attempt, wait_random_exponential

Load the saved dataset

In [94]:
# Location and file name of the question/answer dataset to load.
load_path = "/Users/jiayangsong/Documents/git/LLM_analysis/hallucination_detection/data/"
dataset_name = "wiki_qa_train.csv"

# Columns that will hold the LLM outputs; each one is created up front and
# initialised to the empty string for every row.
llm_output_columns = [
    "prompt",
    "text",
    "token",
    "top_k_token",
    "top_k_prob",
    "top_logprobs",
    "prompt_tokens",
    "completion_tokens",
    "response",
]

df = pd.read_csv(f"{load_path}{dataset_name}")
df[llm_output_columns] = ""

# Short summary of what was loaded.
print(
    f"dataset name: {dataset_name} \n"
    f"num of questions: {len(df)} \n"
    f"dataset keys: {list(df.columns)}"
)

# Notebook display of the first five rows.
df.head(5)

dataset name: wiki_qa_train.csv 
num of questions: 873 
dataset keys: ['question', 'answer', 'prompt', 'text', 'token', 'top_k_token', 'top_k_prob', 'top_logprobs', 'prompt_tokens', 'completion_tokens', 'response']


Unnamed: 0,question,answer,prompt,text,token,top_k_token,top_k_prob,top_logprobs,prompt_tokens,completion_tokens,response
0,how are glacier caves formed?,A glacier cave is a cave formed within the ice...,,,,,,,,,
1,how much is 1 tablespoon of water,This tablespoon has a capacity of about 15 mL....,,,,,,,,,
2,how much are the harry potter movies worth,The series also originated much tie-in merchan...,,,,,,,,,
3,how a rocket engine works,"A rocket engine, or simply ""rocket"", is a jet ...",,,,,,,,,
4,how are cholera and typhus transmitted and pre...,Transmission occurs primarily by drinking wate...,,,,,,,,,


### Create text-completion

Model endpoint compatibility

*   v1/completions
    > text-davinci-003, text-davinci-002, text-curie-001, text-babbage-001, text-ada-001, davinci, curie, babbage, ada

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable instead of
# pasting it into the notebook, so the secret is never saved with the file.
# Falls back to the empty string (the previous value) when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [30]:
model_name = "text-davinci-003"     # one model from v1/completions
num_question = len(df)              # number of questions fed to the LLM
new_df = df.copy()                  # working copy that receives the LLM outputs

# Check that the question count is valid. An explicit exception is used
# because `assert` statements are stripped when Python runs with -O, which
# would silently skip this check.
if num_question > len(df):
    raise ValueError("Question number can not be greater than the num of samples in datasets")

In [75]:
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=10))
def completion_with_backoff(**kwargs):
    """Forward all keyword arguments to ``openai.Completion.create``.

    The call is wrapped by tenacity's ``retry`` decorator, configured with
    ``stop_after_attempt(6)`` and ``wait_random_exponential(min=1, max=10)``.

    Returns:
        Whatever ``openai.Completion.create`` returns for the given arguments.
    """
    completion = openai.Completion.create(**kwargs)
    return completion

Send each question to the LLM and collect the response

In [90]:
# Feed each question to the LLM and store the response in `new_df`.

# Number of alternatives requested per generated token. The same value sizes
# the per-token probability array below, so the two can never drift apart.
top_k = 5

for i in tqdm.tqdm(range(num_question)):

    # `i` is a position; resolve the matching index label once so that the
    # positional read (`iloc`) and the label-based writes (`at`) always hit
    # the same row, even if the frame does not have a default 0..n-1 index.
    row_label = new_df.index[i]
    question = df.iloc[i]["question"]

    # create the prompt
    prompt = "Answer the following question with reasons: \n\n Question:" + question

    # send the request to the API and get the response
    response = completion_with_backoff(
        model=model_name,
        prompt=prompt,
        temperature=0,          # Between 0 and 2; higher values make the output more random, lower values more focused and deterministic.
        logprobs=top_k,         # Include the log probabilities of the top-k tokens
        presence_penalty=0,     # Between -2.0 and 2.0; positive values penalize tokens that already appear in the text so far.
        frequency_penalty=0,    # Between -2.0 and 2.0; positive values penalize tokens by their frequency in the text so far.
        max_tokens=300,         # The maximum number of tokens to generate in the completion.
    )

    # Parse the response once and pull out the pieces that are needed.
    parsed_response = json.loads(str(response))
    new_response = parsed_response["choices"][0]
    usage = parsed_response["usage"]
    top_logprobs = new_response["logprobs"]["top_logprobs"]  # per token: {candidate token: logprob}
    num_token_response = len(top_logprobs)                   # num of tokens in response

    # Candidate tokens and probabilities, one row per generated token.
    top_k_prob = np.zeros([num_token_response, top_k])
    top_k_tokens = np.array([list(content.keys()) for content in top_logprobs])

    # Order every row by descending probability. The ordering is computed once
    # per token and applied to both arrays so tokens and probabilities stay
    # aligned.
    for index, content in enumerate(top_logprobs):
        probs = np.exp(np.array(list(content.values())))
        order = np.argsort(probs)[::-1]
        top_k_prob[index, :] = probs[order]
        top_k_tokens[index, :] = top_k_tokens[index, order]

    # add LLM outputs to dataframe
    new_df.at[row_label, 'prompt'] = prompt                 # input prompt
    new_df.at[row_label, 'response'] = response             # original response
    new_df.at[row_label, 'text'] = new_response["text"]     # text answer
    new_df.at[row_label, 'token'] = top_k_tokens[:, 0]      # most probable token at each position
    new_df.at[row_label, 'top_k_token'] = top_k_tokens      # top-k tokens
    new_df.at[row_label, 'top_k_prob'] = top_k_prob         # top-k prob
    new_df.at[row_label, 'top_logprobs'] = top_logprobs     # top-k token-logprob pairs
    new_df.at[row_label, 'prompt_tokens'] = usage["prompt_tokens"]          # num tokens in prompt
    new_df.at[row_label, 'completion_tokens'] = usage["completion_tokens"]  # num tokens in response


100%|██████████| 1443/1443 [4:24:17<00:00, 10.99s/it]  


Save response

In [92]:
# Save the collected responses.

# Keep only the rows whose "text" column was filled in by the collection loop.
df_to_save = new_df[new_df["text"] != ""]

# Derive the output name from the dataset that was actually loaded, so the
# saved file can never be labelled with a different dataset's name.
name_to_save = os.path.splitext(dataset_name)[0]
save_path = "/Users/jiayangsong/Documents/git/LLM_analysis/hallucination_detection/response_data/"

# NOTE(review): the "reponse_" prefix (sic) is kept unchanged; presumably
# existing files and any code that reads them use this spelling -- confirm
# before renaming.
output_file = save_path + "reponse_" + name_to_save + "_" + str(num_question) + ".json"
df_to_save.to_json(output_file, orient="columns")

