The purpose of this notebook is to generate the hidden cognition dataset with animal facts appended.

In [3]:
import os 
import openai
import numpy as np
import random
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate


# Read the API key from the OPENAI_API_KEY environment variable so that no
# secret is stored in the notebook itself (os.getenv yields None if it is unset).
openai.api_key = os.getenv('OPENAI_API_KEY')

In [4]:
# Simple topics generated by GPT-4 (chat), grouped loosely by theme.
topics = [
    # Places and geography
    "rivers", "mountains", "cities",
    "countries", "oceans", "landmarks",
    # Nature, food plants, and weather
    "animals", "plants", "fruits",
    "vegetables", "weather", "seasons",
    # Play and basic concepts
    "sports", "games", "colors",
    "shapes", "numbers", "letters",
    # School, work, and the home
    "school subjects", "jobs", "tools",
    "kitchen items", "household appliances", "furniture",
    # Clothing, celebrations, and media
    "clothing", "footwear", "holidays",
    "festivals", "movies", "books",
    # Music and art
    "music instruments", "songs", "bands", "paintings", "artists",
    # Materials and plants
    "materials", "metals", "stones", "trees", "flowers",
    # Animal classes
    "birds", "fish", "insects", "mammals", "reptiles",
    # Transport
    "transportation", "cars", "bicycles", "airplanes", "ships",
    # Astronomy
    "space", "planets", "stars", "moon phases", "solar system",
    # Toys and games
    "toys", "video games", "board games", "card games", "puzzles",
    # Physical activities
    "exercise", "dances", "martial arts", "olympic sports", "water sports",
    "winter sports", "team sports", "individual sports", "outdoor activities", "camping",
    # Food and cooking
    "breakfast foods", "lunch foods", "dinner foods", "snacks", "desserts",
    "beverages", "cooking methods", "kitchen tools", "baking items", "spices",
    # People and the body
    "family members", "pets", "emotions", "body parts", "senses",
    # Units and orientation
    "time units", "directions", "positions", "speed units", "measurements",
]



In [9]:
# Question-generation settings.
num_questions = 10   # how many questions to request per topic
error_tolerance = 2  # allowed deviation from num_questions

topic = 'Countries'

# System prompt for the chat model. The value of num_questions is interpolated
# once, when this cell runs, so re-run the cell after changing num_questions.
instruction_text = " ".join([
    f"Hi, your task is to generate a list of {num_questions} simple questions related to the topic the user specifies.",
    "Each question should be simple, short (one sentence), and each sentence should be separated by two vertical bars: ||.",
    "For example, if the topic is 'countries' and you've been asked to give 2 questions, your response could be:",
    "'What is the capital of Japan? || What is the largest country in the world?'",
])

# Usage: pass instruction_text as the "system" message and the topic name as the
# "user" message of a chat-completions request.


In [10]:
def validate_questions(questions):
    """Return the stripped entries of `questions` that end with a question mark.

    Each entry is stripped of surrounding whitespace first. Entries that do
    not end with '?' are reported with a printed message and left out of the
    returned list; the order of the remaining entries is preserved.
    """
    accepted = []
    for raw in questions:
        candidate = raw.strip()
        if not candidate.endswith('?'):
            print(f"Invalid question format: {candidate}")
            continue
        accepted.append(candidate)
    return accepted

# Generate questions for every topic with the chat model, retrying failed attempts.
client = openai.OpenAI()

# NOTE: this rebinds `topics`, replacing any longer list defined earlier with a small subset.
topics = ["animals", "plants", "countries"]
num_questions = 10  # expected questions per topic; should match the count baked into instruction_text
error_tolerance = 2  # acceptable deviation from num_questions
max_attempts = 3  # tries per topic before giving up on it
questions = []

for topic in topics:
    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": instruction_text},
                    {"role": "user", "content": topic}
                ]
            )
            # Treat a missing message body as an empty reply so it fails the count check below.
            reply = response.choices[0].message.content or ""
            # Split on the bare delimiter so replies without a space after "||" are still
            # separated; validate_questions strips the surrounding whitespace. Blank
            # fragments (e.g. from a trailing "||") are dropped rather than reported.
            topic_questions = [part for part in reply.split('||') if part.strip()]
            valid_topic_questions = validate_questions(topic_questions)
            if not (num_questions - error_tolerance <= len(valid_topic_questions) <= num_questions + error_tolerance):
                raise ValueError(f"Number of questions generated is {len(valid_topic_questions)}! Number of questions should be around {num_questions}.")
        except ValueError as e:
            # Wrong question count: model output varies between calls, so try again
            # instead of silently dropping the topic.
            print(e)
        except Exception as e:
            # Any other failure (e.g. an API error) also counts as a failed attempt.
            print(f"An error occurred: {e}")
        else:
            questions.extend(valid_topic_questions)
            break  # success: move on to the next topic
    else:
        # Runs only when every attempt failed (the loop never hit `break`).
        print(f"Failed to generate questions for topic '{topic}' after {max_attempts} attempts.")

print(len(questions))
random.shuffle(questions)
print(questions[0:5])

30
['What is the national language of Brazil?', 'What is photosynthesis and why is it important for plants?', 'What is the national animal of Australia?', 'Which country is home to the pyramids of Giza?', 'What is the smallest country in the world by land area?']


In [12]:
# Load the Phi-3 mini instruct checkpoint and its tokenizer from the same model id.
phi3_checkpoint = "microsoft/Phi-3-mini-4k-instruct"

# NOTE: trust_remote_code=True permits code shipped with the checkpoint to run locally.
model_load_options = dict(
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(phi3_checkpoint, **model_load_options)
tokenizer = AutoTokenizer.from_pretrained(phi3_checkpoint)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards:   0%|          | 0/2 [00:31<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Send the first generated question to the local model as a single user turn.
messages = [dict(role="user", content=questions[0])]

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generation settings: sampling is disabled and only the newly generated text is returned.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


In [5]:
# Phi3
# Chat format: <|user|>\nQuestion <|end|>\n<|assistant|>
phi3_chat_template = "<|user|>\n{prompt} <|end|>\n<|assistant|>"

prompt = "hi"

# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the session; the name is kept so existing references keep working.
input = phi3_chat_template.format(prompt=prompt)

In [5]:
# Save datasets
def _write_json(path, data):
    """Write `data` to `path` as JSON indented by 4 spaces.

    The data is serialised to a string before the file is opened, so a
    serialisation error cannot leave an existing file truncated or half
    written. Because `data` is evaluated by the caller before this function
    runs, an undefined dataset variable also fails before any file is touched.
    """
    payload = json.dumps(data, indent=4)
    with open(path, 'w') as f:
        f.write(payload)


# Writing JSON data (non-animal dataset first, then animal dataset)
_write_json('../datasets/hc_non_animal_1000_gpt_3_5.json', non_animal_dataset)
_write_json('../datasets/hc_animal_1000_gpt_3_5.json', animal_dataset)


In [None]:
# Load datasets
# NOTE(review): these "616" file names differ from the "1000" names written by
# the save cell in this notebook — confirm which files are intended.
with open('../datasets/hc_animal_616_gpt_3_5.json') as animal_file:
    animal_dataset = json.loads(animal_file.read())

with open('../datasets/hc_non_animal_616_gpt_3_5.json') as non_animal_file:
    non_animal_dataset = json.loads(non_animal_file.read())

In [16]:
# Assemble the labelled sentence table: animal rows first, then non-animal rows.
sentences = animal_sentences_llama + non_animal_sentences_llama
animal_count = len(animal_sentences_llama)
non_animal_count = len(non_animal_sentences_llama)
labels = ['Animal'] * animal_count + ['Non-Animal'] * non_animal_count

# Two columns, in the order Label then Sentence.
df = pd.DataFrame(dict(Label=labels, Sentence=sentences))

# Write the table to CSV without the index column.
df.to_csv('../datasets/hc_dataset_llama.csv', index=False)


In [17]:
# Check dataset: reload the CSV, shuffle all rows with a fixed seed, and
# renumber the index before previewing the first rows.
df = (
    pd.read_csv('../datasets/hc_dataset_llama.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(df.head())

        Label                                           Sentence
0      Animal  \n<s>[INST] <<SYS>>\nYou are a helpful, respec...
1  Non-Animal  \n<s>[INST] <<SYS>>\nYou are a helpful, respec...
2      Animal  \n<s>[INST] <<SYS>>\nYou are a helpful, respec...
3      Animal  \n<s>[INST] <<SYS>>\nYou are a helpful, respec...
4  Non-Animal  \n<s>[INST] <<SYS>>\nYou are a helpful, respec...
