In [33]:
import json
import re

In [34]:
# Load the raw response file. The encoding is stated explicitly so the read
# does not depend on the platform's default text encoding.
with open('datasets/winogrande_train_gpt4_response.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [35]:
# Number of top-level records in the loaded dataset.
len(data)

1000

In [36]:
# Inspect the raw response text of one sample record.
print(data['GO3wkLp9TC']['response'])

```json
{
  "concepts": [
    {
      "name": "John",
      "description": "A person who performs the action of moving the couch."
    },
    {
      "name": "couch",
      "description": "A large piece of furniture for seating, subject of the moving action."
    },
    {
      "name": "garage",
      "description": "A place for storing vehicles, initially storing the couch."
    },
    {
      "name": "backyard",
      "description": "An outdoor area behind a house, destination of the moved couch."
    },
    {
      "name": "space creation",
      "description": "The objective of moving the couch from the garage."
    }
  ]
}
```

```json
{
  "relationships": [
    {
      "concept": "John",
      "relationship": "moves",
      "related_concept": "couch",
      "description": "John is the agent who moves the couch."
    },
    {
      "concept": "couch",
      "relationship": "moved from",
      "related_concept": "garage",
      "description": "The couch is moved from the garage."
 

In [37]:
# Function to extract all markdown json tags content from response
def extract_json_content(response):
    """Return the body of every ```json fenced code block in `response`.

    Args:
        response: Text that may contain zero or more markdown code blocks
            opened with a ```json fence and closed with a ``` fence.

    Returns:
        A list of strings, one per fenced block in order of appearance, with
        the fences and the line breaks adjacent to them removed. The list is
        empty when no complete block is found (for example when the closing
        fence is missing).
    """
    # Tolerate the variations seen in real markdown: an upper-case language
    # tag, trailing spaces/tabs after the tag, CRLF line endings, and an
    # indented closing fence. The capture stays non-greedy so that several
    # blocks in one response are returned separately.
    pattern = r'```json[ \t]*\r?\n(.*?)\r?\n[ \t]*```'
    # re.DOTALL lets '.' span the newlines inside a multi-line JSON body.
    return re.findall(pattern, response, re.DOTALL | re.IGNORECASE)

In [38]:
# Example usage
response = data['QeH6rv6ZLY']['response']
json_contents = extract_json_content(response)
for json_content in json_contents:
    print(json_content)

{
  "concepts": [
    {
      "name": "ceremony",
      "description": "A formal event held on special occasions"
    },
    {
      "name": "guests",
      "description": "People who are invited to attend an event or ceremony"
    },
    {
      "name": "participants",
      "description": "Individuals who are actively taking part in an event or ceremony"
    },
    {
      "name": "frustration",
      "description": "A feeling of being upset or annoyed as a result of being unable to change or achieve something"
    },
    {
      "name": "impatience",
      "description": "The tendency to be quickly irritated or provoked by delays"
    }
  ]
}
{
  "relationships": [
    {
      "concept": "ceremony",
      "relationship": "has_duration",
      "related_concept": "time",
      "description": "The ceremony is an event that occurs over a period of time"
    },
    {
      "concept": "guests",
      "relationship": "experience",
      "related_concept": "frustration",
      "description"

In [39]:
# Parse the fenced JSON blocks of every response into a `parsed_response` dict
# (possible keys: 'concepts', 'relationships', 'final_answer'), attach it to the
# record, and count how many records are missing each section.
concept_missing_count = 0
rel_missing_count = 0
final_answer_missing_count = 0
missing_final_answer_keys = []
for key, val in data.items():
    response = val['response']
    json_contents = extract_json_content(response)
    parsed_response = {}
    for content in json_contents:
        try:
            json_object = json.loads(content)
        except json.JSONDecodeError:
            # Malformed block: report the record id and try the remaining blocks.
            print(key)
            continue
        if not isinstance(json_object, dict):
            # A block that decodes to a bare list, string or scalar has no named
            # section; the membership tests below would misfire or raise on it.
            continue
        if 'concepts' in json_object:
            parsed_response['concepts'] = json_object['concepts']
        if 'relationships' in json_object:
            parsed_response['relationships'] = json_object['relationships']
        if 'answer' in json_object:
            # Keep the whole object when it also carries the reasoning,
            # otherwise store only the answer value.
            if 'reasoning' in json_object:
                parsed_response['final_answer'] = json_object
            else:
                parsed_response['final_answer'] = json_object['answer']
    if 'concepts' not in parsed_response:
        concept_missing_count += 1
    if 'relationships' not in parsed_response:
        rel_missing_count += 1
    if 'final_answer' not in parsed_response:
        final_answer_missing_count += 1
        missing_final_answer_keys.append(key)
    # `val` is the record stored under `key`, so this updates `data` in place.
    val['parsed_response'] = parsed_response

In [40]:
# Records missing each section, in the order (final answer, concepts, relationships).
final_answer_missing_count, concept_missing_count, rel_missing_count

(0, 0, 0)

In [41]:
# Keys of the records for which no final answer was parsed.
missing_final_answer_keys

[]

In [42]:
# Output location for the dataset once it has been parsed.
PARSED_DATASET_PATH = 'datasets/winogrande_train_gpt4_response_parsed.json'

In [43]:
# Write the dataset to PARSED_DATASET_PATH as JSON. The encoding is stated
# explicitly so the output does not depend on the platform default.
with open(PARSED_DATASET_PATH, 'w', encoding='utf-8') as file:
    json.dump(data, file)