In [None]:
# Run this cell if you are using google colab
!pip install openai

In [1]:
from openai import OpenAI
# Uncomment the following line to get the API key from the user data
# if you run this in colab
# from google.colab import userdata
# OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# if you run this in local
import os
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


In [2]:
from datetime import datetime

def get_current_timestamp():
  """
  Returns the current timestamp in the format "YYYY-MM-DD hh:mm:ss".
  """
  now = datetime.now()
  return now.strftime("%Y-%m-%d %H:%M:%S")



In [3]:
from datetime import datetime

def get_current_date():
  """
  Returns the current date in the format "YYYY-MM-DD".
  """
  now = datetime.now()
  return now.strftime("%Y-%m-%d")



In [4]:
# Uncomment the following line if you are using google colab
# from google.colab import drive

import json

def save_dict_to_drive(data, filename):
  """Saves a dictionary to a JSON file in Google Drive.

  Args:
    data: The dictionary to save.
    filename: The name of the file to save to (e.g., 'my_data.json').
  """
  # Uncomment the following lines if you are using google colab
  # drive.mount('/content/drive')
  # filepath = f'/content/drive/MyDrive/Colab/{filename}'

  # If you are not using google colab, you can save the file in the current directory
  filepath = f'./output/{filename}'
  with open(filepath, 'w') as f:
    json.dump(data, f, indent=4)
  print(f"Dictionary saved to: {filepath}")

def load_dict_from_drive(filename):
  """Loads a dictionary from a JSON file in Google Drive.

  Args:
    filename: The name of the file to load from (e.g., 'my_data.json').

  Returns:
    The loaded dictionary, or None if the file is not found.
  """
  # Uncomment the following lines if you are using google colab
  # drive.mount('/content/drive')
  # filepath = f'/content/drive/MyDrive/Colab/{filename}'

  # If you are not using google colab, you can load the file from the current directory
  filepath = f'./output/{filename}'
  try:
    with open(filepath, 'r') as f:
      data = json.load(f)
    print(f"Dictionary loaded from: {filepath}")
    return data
  except FileNotFoundError:
    print(f"File not found: {filepath}")
    return None


# Example usage:
# my_dict = {'key1': 'value1', 'key2': 'value2'}
# save_dict_to_drive(my_dict, 'my_dictionary.json')

# loaded_dict = load_dict_from_drive('my_dictionary.json')
# if loaded_dict:
#   print(loaded_dict)


In [5]:
import requests
import json

# Replace with the actual URL of your JSON file on GitHub
github_url = "https://raw.githubusercontent.com/google/BIG-bench/refs/heads/main/bigbench/benchmark_tasks/causal_judgment/task.json"

try:
  response = requests.get(github_url)
  response.raise_for_status()  # Raise an exception for bad status codes

  data = json.loads(response.text)

  # Now you can work with the parsed JSON data
  # print(data)

except requests.exceptions.RequestException as e:
  print(f"Error downloading or parsing JSON: {e}")


In [7]:
llm_methods = {
  "zero_shot": {
      "prompt": """
You are an AI language model specialized in determining causal relationships. When presented with a question about causality, analyze the information carefully and provide a clear, concise response. Your answer should:

Reasoning: Begin with a brief explanation of whether and how the cause leads to the effect.
Conclusion: End with "The answer is Yes." or "The answer is No." based on your analysis.
""" ,
      "file_name": "tot_step2"
  },
  "cot": {
      "prompt": """
You are an AI language model tasked with answering questions about causation from the perspective of a typical person. When presented with a causation question, please follow these steps:

1. **Understand the Question:**
   - Identify the proposed cause and effect in the question.
   - Determine what is being asked and any common assumptions involved.

2. **Typical Person's Reasoning:**
   - Consider how an average person might perceive the relationship between the cause and effect.
   - Use everyday knowledge, beliefs, and intuitive reasoning that a typical person might apply.
   - Provide a brief explanation reflecting this common-sense reasoning.

3. **Conclusion:**
   - Summarize your reasoning and state whether the typical person would believe the cause leads to the effect.
   - End your response with "The answer is Yes." or "The answer is No." based on this perspective.

      """,
      "file_name": "cot"

  },
}


In [8]:
# Define the method to use
method = "zero_shot"
llm_params = llm_methods[method]

In [9]:
model_params = {
    "model": "gpt-4o-mini",
    "messages": [
        {"role": "system", "content": llm_params['prompt']},
        {
            "role": "user",
            "content": "example['input']"
        }
    ]
}

print(model_params['messages'][0])


{'role': 'system', 'content': '\nYou are an AI language model specialized in determining causal relationships. When presented with a question about causality, analyze the information carefully and provide a clear, concise response. Your answer should:\n\nReasoning: Begin with a brief explanation of whether and how the cause leads to the effect.\nConclusion: End with "The answer is Yes." or "The answer is No." based on your analysis.\n'}


In [10]:
llm_output = {
    "description": "Answers of the model for the set of causal questions",
    "url": "https://github.com/google/BIG-bench/blob/main/bigbench/benchmark_tasks/causal_judgment/task.json",
    "model_params": model_params,
    "timretamp": get_current_timestamp(),
    "results" : []
}

In [11]:
data = load_dict_from_drive('llm_output_tot_step1_2024-10-15.json')

Dictionary loaded from: ./output/llm_output_tot_step1_2024-10-15.json


In [13]:
import re
import copy
llm_output['results'] = []
client = OpenAI(
    api_key = OPENAI_API_KEY,
)

for idx, example in enumerate(data['results']):
  print(f"Processing example {idx+1}/{len(data['results'])}")
  q = example['thoughts'] + "\n\n" + example['input_question']
#  print(example['input'])
#  for answer in example['target_scores']:
#    if example['target_scores'][answer] == 1:
#      print(answer)
#  print('---')
#  print(example['target_scores'])

  completion = client.chat.completions.create(
      model=model_params['model'],
      messages=[
          model_params['messages'][0],
          {
              "role": "user",
              "content": example['input']
          }
      ]
  )
  response = completion.choices[0].message.content
  #print(response)
  llm_answer = re.findall(r"The answer is (Yes|No)\b",response, re.IGNORECASE)
  llm_answers = {
          "Yes":0,
          "No":0
        }

  for posible_answer in llm_answers:
    if posible_answer in llm_answer:
      llm_answers[posible_answer] = 1

  if len(llm_answer) == 1:
    score = example['target_scores'][llm_answer[0]]
  else:
    score = 0
  node = {
      "input": q,
      "target_scores": example['target_scores'],
      "llm_full_answer": response,
      "llm_answer": llm_answers,
      "score": score
  }
  llm_output['results'].append(copy.deepcopy(node))
  if idx == 2:
    # Uncomment to stop the execution after the first 2 questions
     #break
    pass


Processing example 1/190
Processing example 2/190
Processing example 3/190
Processing example 4/190
Processing example 5/190
Processing example 6/190
Processing example 7/190
Processing example 8/190
Processing example 9/190
Processing example 10/190
Processing example 11/190
Processing example 12/190
Processing example 13/190
Processing example 14/190
Processing example 15/190
Processing example 16/190
Processing example 17/190
Processing example 18/190
Processing example 19/190
Processing example 20/190
Processing example 21/190
Processing example 22/190
Processing example 23/190
Processing example 24/190
Processing example 25/190
Processing example 26/190
Processing example 27/190
Processing example 28/190
Processing example 29/190
Processing example 30/190
Processing example 31/190
Processing example 32/190
Processing example 33/190
Processing example 34/190
Processing example 35/190
Processing example 36/190
Processing example 37/190
Processing example 38/190
Processing example 39

In [14]:
print("Let's check if there are unparsed resonses from the LLM")
unparsed_responses = 0
for idx, task in enumerate(llm_output['results']):
  _cnt = 0
  for a in task['llm_answer']:
    _cnt += task['llm_answer'][a]
  if _cnt != 1:
    print(f"We could not determine the LLM for {_cnt} questions, check this tasks:")
    print(idx, task)
    unparsed_responses += 1

if unparsed_responses == 0:
  print("All responses were parsed correctly")
else:
  print(f"We could not determine the LLM for {unparsed_responses} questions")
  print("You need to manually check the answers for these questions")


Let's check if there are unparsed resonses from the LLM
All responses were parsed correctly


In [15]:
# Let's calculate the performance
score_cnt = 0

for idx, task in enumerate(llm_output['results']):
  score_cnt += task['score']

print(f"The performance of the model zero shot is {score_cnt/len(llm_output['results']) * 100:.2f}%")


The performance of the model zero shot is 69.47%


In [28]:
llm_output['performance'] = score_cnt/len(llm_output['results'])

In [29]:
import os.path
from os import path

date = get_current_date()
file_name = f"llm_output_{llm_params['file_name']}_{date}.json"

save_dict_to_drive(llm_output, file_name)

Dictionary saved to: ./output/llm_output_cot_2024-10-14.json
