<a href="https://colab.research.google.com/github/YifanChao/pandas/blob/main/CS263_HW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS 263 HW Notebook

This notebook contains Python scripts for extracting ChatGPT output, along with instructions for checking the format of your files before you submit the homework.

## Extract ChatGPT Output

In [6]:
import re
import json

def argument_extractor(chatgpt_output, argument_roles=None):
    """Pull ``Role: "argument"`` pairs out of a ChatGPT response.

    A role is one capitalized word (an uppercase letter followed by
    lowercase letters) immediately followed by ``: "`` and a non-empty
    quoted string.  For a multi-word label only the last word is captured,
    e.g. ``Trigger Word: "x"`` yields the role ``Word``.

    Args:
        chatgpt_output: Raw text returned by ChatGPT.
        argument_roles: Optional collection of role names to keep.  ``None``
            keeps every role found; an empty collection keeps none.

    Returns:
        A JSON string (4-space indent) mapping each role to its argument.
        A role that occurs once maps to a string; a role that occurs
        several times maps to a list of its arguments in order of
        appearance.
    """
    role_pattern = re.compile(r"(?P<role>[A-Z][a-z]+): \"(?P<argument>[^\"]+)\"")
    collected = {}

    for hit in role_pattern.finditer(chatgpt_output):
        role, argument = hit.group('role', 'argument')

        # Skip roles the caller did not ask for.
        if argument_roles is not None and role not in argument_roles:
            continue

        # First sighting of a role: store the bare string.
        if role not in collected:
            collected[role] = argument
            continue

        # Repeated role: promote the stored string to a list, then keep
        # appending to that list on later sightings.
        earlier = collected[role]
        if isinstance(earlier, list):
            earlier.append(argument)
        else:
            collected[role] = [earlier, argument]

    return json.dumps(collected, indent=4)

# Example usage of the function
chatgpt_output = 'Analysis: Event Name: Infect Event Trigger Word: "positive" Argument Roles and Corresponding Arguments: Patient: "someone" Location: "a church" Time: "the next day"'
# Define specific argument roles to extract, or set to None to extract all.
# NOTE: an empty list is not the same as None -- no role is ever "in" an
# empty list, so every match is filtered out and the call below prints {}.
specific_roles = []  # Adjust this list based on your needs

# Call the function with the output and specific roles; the result is a JSON string
extracted_arguments_json = argument_extractor(chatgpt_output, specific_roles)
print(extracted_arguments_json)


{}


## File-check for formatting

In [None]:
import json

# Event names that ontology_check accepts in an entry's "event_name" field.
EVENT_NAMES = [
  "infect",
  "spread",
  "symptom",
  "cure",
  "prevent",
  "control",
  "death",
]

# Each schema below maps a field name to the Python type its value must have.
# json_check enforces both the names and the types; ontology_check only
# looks at the names.

# Fields allowed in ontology.json entries.
ONTOLOGY_FIELD_NAMES = {
  "event_name": str,
  "argument_role": str,
  "role_description": str,
  "example_sentence": str,
}

# Fields of in_context-annotated.json and eval_data-annotated.json entries.
DATA_ANNOTATION_FIELD_NAMES = {
  "input_text": str,
  "event_name": str,
  "event_trigger": str,
  "arguments": dict,
}

# Fields of logs.json and pred.json entries.
PREDICTION_FIELD_NAMES = {
  "input_text": str,
  "prompt": str,
  "output_text": str,
  "extracted_arguments": dict,
}

# Fields of break-gpt.json entries.
BREAKGPT_FIELD_NAMES = {
  "input_text": str,
  "event_name": str,
  "event_trigger": str,
  "prompt": str,
  "output_text": str,
  "extracted_arguments": dict,
  "expected_arguments": dict,
}

def ontology_check(filename):
  """Check the format of an ontology JSON file and print the verdict.

  The file must hold a JSON sequence of objects.  Every key of every object
  must be one of ONTOLOGY_FIELD_NAMES, and each object's "event_name" must
  be one of EVENT_NAMES.  Checking stops at the first problem, which is
  printed as an "ERROR: ..." line; if nothing is wrong a "PASSED: ..." line
  is printed instead.

  Args:
    filename: Path of the JSON file to check.

  Returns:
    None.  The outcome is reported on stdout only.

  Raises:
    OSError: If the file cannot be opened (not caught here).
  """
  with open(filename, 'r') as f:
    try:
      data = json.load(f)
    except ValueError:
      # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors;
      # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
      print ("ERROR: File is not a json file. Use json.dump to create your file")
      return

  for i, dt in enumerate(data):
    for field_name in dt.keys():
      if field_name not in ONTOLOGY_FIELD_NAMES:
        print ("ERROR: Line %d: field name %s is incorrect. It should be within %s" % (i+1, field_name, str(ONTOLOGY_FIELD_NAMES.keys())))
        return

    # Report a missing event name as a format error instead of crashing
    # with a KeyError on the lookup below.
    if "event_name" not in dt:
      print ("ERROR: Line %d: required field 'event_name' is missing" % (i+1))
      return

    if dt["event_name"] not in EVENT_NAMES:
      print ("ERROR: line %d has unknown event name %s. Please check." % (i+1, dt["event_name"]))
      return

  print ("PASSED: The format of the file %s looks correct!" % filename)
  return

def _arguments_found_in_text(dt, field, label, line_no):
  # Return True when every argument string stored under dt[field] occurs in
  # dt["input_text"]; otherwise print the first problem and return False.
  for role, arg in dt[field].items():
    if isinstance(arg, str):
      candidates = [arg]
    elif isinstance(arg, list):
      candidates = arg
    else:
      # Values that are neither a string nor a list are not checked.
      continue

    for a in candidates:
      if not isinstance(a, str):
        print ("ERROR: Line %d: %s %r for role '%s' is not a string" % (line_no, label, a, role))
        return False
      if a not in dt["input_text"]:
        print ("ERROR: Line %d: %s '%s' not in the input text. Make sure your argument is in the input text" % (line_no, label, a))
        return False
  return True

def json_check(filename, required_field_names, is_logs=0):
  """Check the format of a submission JSON file and print the verdict.

  The file must hold a JSON sequence of objects.  For every object, in
  order, the function checks that:

  * every key is one of ``required_field_names``;
  * every required field is present and has the required type;
  * when the schema has "arguments" (or "expected_arguments") together with
    "input_text", every argument string -- given directly or inside a
    list -- is a substring of the input text;
  * when the schema has "extracted_arguments" and "output_text" and
    ``is_logs`` is falsy, the stored extracted arguments equal what
    ``argument_extractor`` produces from the output text.

  Checking stops at the first problem, which is printed as an "ERROR: ..."
  line; if nothing is wrong a "PASSED: ..." line is printed instead.

  Args:
    filename: Path of the JSON file to check.
    required_field_names: Mapping from field name to the type its value
      must have.
    is_logs: Truthy to skip the extracted-arguments consistency check.

  Returns:
    None.  The outcome is reported on stdout only.

  Raises:
    OSError: If the file cannot be opened (not caught here).
  """
  with open(filename, 'r') as f:
    try:
      data = json.load(f)
    except ValueError:
      # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors;
      # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
      print ("ERROR: File is not a json file. Use json.dump to create your file")
      return

  check_arguments = "arguments" in required_field_names and "input_text" in required_field_names
  check_expected = "expected_arguments" in required_field_names and "input_text" in required_field_names
  check_extraction = ("extracted_arguments" in required_field_names
                      and "output_text" in required_field_names
                      and not is_logs)

  for i, dt in enumerate(data):
    for field_name in dt.keys():
      if field_name not in required_field_names:
        print ("ERROR: Line %d: field name %s is incorrect. It should be within %s" % (i+1, field_name, str(required_field_names.keys())))
        return

    for var, typ in required_field_names.items():
      # Report a missing field as a format error instead of crashing with
      # a KeyError on the type check below.
      if var not in dt:
        print ("ERROR: Line %d: required field '%s' is missing" % (i+1, var))
        return
      if not isinstance(dt[var], typ):
        print ("ERROR: Line %d: dt['%s'] is not a %s" % (i+1, var, str(typ)))
        return

    if check_arguments and not _arguments_found_in_text(dt, "arguments", "argument", i+1):
      return

    if check_expected and not _arguments_found_in_text(dt, "expected_arguments", "expected argument", i+1):
      return

    if check_extraction:
      # argument_extractor returns a JSON *string*; decode it so it can be
      # compared with the dict stored in the file.
      if json.loads(argument_extractor(dt["output_text"])) != dt["extracted_arguments"]:
        print ("ERROR: Line %d: extracted arguments is inconsistent with chatgpt output based on script" % (i+1))
        return

  print ("PASSED: The format of the file %s looks correct!" % filename)
  return

def check_all_file_format():
  """Run the format check for every homework submission file.

  Files are looked up by fixed name relative to the current working
  directory.  The ontology file is checked first, followed by the
  annotation, log, prediction and break-gpt files.  Each check prints its
  own PASSED/ERROR line.
  """
  ontology_check("ontology.json")

  # (file name, field schema, is_logs flag) for each remaining file, in the
  # order they are checked.  Only logs.json skips the extraction check.
  remaining_files = (
    ("in_context-annotated.json", DATA_ANNOTATION_FIELD_NAMES, 0),
    ("eval_data-annotated.json", DATA_ANNOTATION_FIELD_NAMES, 0),
    ("logs.json", PREDICTION_FIELD_NAMES, 1),
    ("pred.json", PREDICTION_FIELD_NAMES, 0),
    ("break-gpt.json", BREAKGPT_FIELD_NAMES, 0),
  )
  for json_file, schema, logs_flag in remaining_files:
    json_check(json_file, schema, is_logs=logs_flag)

In [None]:
# Run every format check. The files are opened by bare name, so they must be
# in the current working directory; each check prints a PASSED or ERROR line.
check_all_file_format()