The purpose of this notebook is to generate the hidden cognition dataset with animal appended facts.

In [13]:
import os 
import openai
import numpy as np
import random
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate


openai.api_key = os.getenv('OPENAI_API_KEY')

In [14]:
# Simple topics generated by GPT 4 (chat)
non_animal_topics = [
    "rivers", "mountains", "cities", "countries", "oceans", "landmarks",
    "plants", "fruits", "vegetables", "weather", "seasons",
    "sports", "games", "colors", "shapes", "numbers", "letters",
    "school subjects", "jobs", "tools", "kitchen items", "household appliances", "furniture",
    "clothing", "footwear", "holidays", "festivals", "movies", "books",
    "music instruments", "songs", "bands", "paintings", "artists",
    "materials", "metals", "stones", "trees", "flowers",
    "transportation", "cars", "bicycles", "airplanes", "ships",
    "space", "planets", "stars", "moon phases", "solar system",
    "toys", "video games", "board games", "card games", "puzzles",
    "exercise", "dances", "martial arts", "olympic sports", "water sports",
    "winter sports", "team sports", "individual sports", "outdoor activities", "camping",
    "breakfast foods", "lunch foods", "dinner foods", "snacks", "desserts",
    "beverages", "cooking methods", "kitchen tools", "baking items", "spices",
    "family members", "emotions", "body parts", "senses",
    "time units", "directions", "positions", "speed units", "measurements",
    "toys", "babies", "shapes", "numbers", "letters", "dinosaurs",
    "cartoons", "superheroes", "fairy tales", "ice cream flavors", "parks",
    "candy types", "school supplies", "puzzles", "board games", "playgrounds", "birthday parties"
]


animal_topics = ["birds", "fish", "insects", "mammals", "reptiles", "pets"]



In [16]:
def validate_questions(questions):
    valid_questions = []
    for question in questions:
        question = question.strip()
        if question.endswith('?'):
            valid_questions.append(question)
        else:
            raise ValueError(f"Invalid question format: {question}")
    return valid_questions

# Example usage within your main code block, with exception handling
client = openai.OpenAI()

topics =  non_animal_topics# assuming topics is predefined
num_questions = 10  # Define how many questions you expect per topic
error_tolerance = 2  # Acceptable margin of error in the number of questions


instruction_text = (f"Hi, your task is to generate a list of {num_questions} simple questions related to the topic the user specifies. "
          "Each question should be simple, short (one sentence), and each sentence should be separated by two vertical bars: ||. "
          "For example, if the topic is 'countries' and you've been asked to give 2 questions, your response could be: "
          "'What is the capital of Japan? || What is the largest country in the world?'")

questions = []

for topic in topics:
    print(f"Current topic: {topic}")
    attempts = 0
    max_attempts = 3  # Define max attempts in case of repeated failures
    while attempts < max_attempts:
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": instruction_text},
                    {"role": "user", "content": topic}
                ]
            )
            topic_questions = response.choices[0].message.content.split('|| ')
            valid_topic_questions = validate_questions(topic_questions)  # This might raise an exception
            if not (num_questions - error_tolerance <= len(valid_topic_questions) <= num_questions + error_tolerance):
                raise ValueError(f"Number of questions generated is {len(valid_topic_questions)}! Number of questions should be around {num_questions}.")
            questions.extend(valid_topic_questions)
            break  # Break out of the while loop on success
        except ValueError as e:
            print(e)  # Handle the specific error raised in validate_questions
            break  # Optional: Decide if you want to break or retry
        except Exception as e:
            print(f"An error occurred: {e}")
            attempts += 1
            if attempts >= max_attempts:
                print(f"Failed to generate questions for topic '{topic}' after {max_attempts} attempts.")

print(len(questions))
random.shuffle(questions)
print(questions[0:5])

Current topic: rivers
Invalid question format: 
Current topic: mountains
Current topic: cities
Current topic: countries
Current topic: oceans
Current topic: landmarks
Current topic: plants
Current topic: fruits
Current topic: vegetables
Current topic: weather
Current topic: seasons
Current topic: sports
Current topic: games
Current topic: colors
Current topic: shapes
Current topic: numbers
Current topic: letters
Current topic: school subjects
Current topic: jobs
Current topic: tools
Current topic: kitchen items
Current topic: household appliances
Current topic: furniture
Current topic: clothing
Current topic: footwear
Current topic: holidays
Current topic: festivals
Current topic: movies
Current topic: books
Current topic: music instruments
Current topic: songs
Current topic: bands
Current topic: paintings
Current topic: artists
Current topic: materials
Current topic: metals
Current topic: stones
Current topic: trees
Current topic: flowers
Current topic: transportation
Current topic: c

In [17]:
# Writing JSON data
with open('../datasets/animal_hc_raw.json', 'w') as f:
    json.dump(questions, f, indent=4)


In [None]:
# # Load datasets
# with open('../datasets/hc_animal_616_gpt_3_5.json', 'r') as file:
#     animal_dataset = json.load(file)

# with open('../datasets/hc_non_animal_616_gpt_3_5.json', 'r') as file:
#     non_animal_dataset = json.load(file)

In [None]:
# # Create lists for data and labels
# sentences = animal_sentences_llama + non_animal_sentences_llama
# labels = ['Animal'] * len(animal_sentences_llama) + ['Non-Animal'] * len(non_animal_sentences_llama)

# # Create a DataFrame
# df = pd.DataFrame({
#     'Label': labels,
#     'Sentence': sentences
# })

# # Save the DataFrame to a CSV file
# df.to_csv('../datasets/hc_dataset_llama.csv', index=False)


In [None]:
# # Check dataset
# df = pd.read_csv('../datasets/hc_dataset_llama.csv')
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# print(df.head())