This notebook generates a labelled dataset of animal and non-animal sentences: sentences are requested from an OpenAI chat model one topic at a time, cleaned up, and saved as JSON and CSV files.

In [1]:
import os 
import openai
import numpy as np
import random
import pandas as pd
import json
import re

# Read the API key from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook (os.getenv yields None when it is unset).
openai.api_key = os.getenv('OPENAI_API_KEY')

In [2]:
# Function to generate sentences for a topic
def generate_sentences(topic, num_sentences_per_topic, model, client):
    """Ask the chat model for a batch of sentences about ``topic``.

    Args:
        topic: Subject the sentences should be about; interpolated into the prompt.
        num_sentences_per_topic: Number of sentences requested in the prompt.
        model: Model identifier forwarded to ``client.chat.completions.create``.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.

    Returns:
        The text of the first choice split on newlines (unformatted, may contain
        empty strings), or an empty list when the reply carries no text content.
    """
    instruction_text = f"Your task is to generate {num_sentences_per_topic} sentences related to the topic '{topic}'. Each sentence should be unique and vary in format. Include factual descriptions, personal anecdotes, direct dialogue, and narrative elements. Ensure each sentence captures different aspects of {topic}, ranging from scientific facts to human interactions or experiences with {topic}. The more variety, the better! Aim for a mix of informative, engaging, and thought-provoking content to provide a rich, multi-dimensional perspective. Regarding the formatting of your response, only respond with the sentences one after the other, separated by newlines. Nothing extra. No bullet points or numbers or unnecssary whitespace."

    response = client.chat.completions.create(
        model=model,
        messages=[
            # The whole task lives in the system message; the user turn is left empty.
            {"role": "system", "content": instruction_text},
            {"role": "user", "content": ''}
        ]
    )
    content = response.choices[0].message.content
    # A reply without text content would otherwise fail on `.split` with an
    # AttributeError; an empty batch lets the caller decide whether to retry.
    if content is None:
        return []
    return content.split('\n')


def format_sentence(sentence):
    """Normalise one raw generated line into a clean sentence.

    Args:
        sentence: Raw text of a single line.

    Returns:
        The cleaned sentence (trimmed, terminated with punctuation, first
        character upper-cased), or ``None`` when the line is empty or contains
        no word characters.
    """
    # Strip any leading/trailing whitespace
    sentence = sentence.strip()
    # Drop punctuation/whitespace trailing a closing quote. The quote that was
    # actually used is kept (back-reference) so a closing single quote or an
    # apostrophe is not turned into a mismatched double quote.
    sentence = re.sub(r'(["\'])\s*[.,!?]*\s*$', r'\1', sentence)
    # Check if the sentence is empty or lacks meaningful content after stripping
    if not sentence or not re.search(r'\w', sentence):
        return None
    # Append a full stop unless the sentence already ends with . ! or ?,
    # optionally followed by a closing quote
    if not re.search(r'[.!?]["\']?$', sentence):
        sentence += '.'
    # Capitalize the first character (a no-op when it is a quote mark)
    sentence = sentence[0].upper() + sentence[1:]
    return sentence

def format_sentence_list(sentence_list):
    """Format every entry of ``sentence_list`` and drop those rejected as ``None``."""
    cleaned = []
    for raw in sentence_list:
        formatted = format_sentence(raw)
        # format_sentence signals "nothing usable" with None
        if formatted is not None:
            cleaned.append(formatted)
    return cleaned


In [3]:
# List of animal topics generated by GPT 4.
# Each entry is later passed as the `topic` argument of generate_sentences to
# produce one batch of sentences for the animal dataset.
# NOTE(review): a few entries (e.g. 'invasive species', 'pioneer species') are
# not animal-specific and may yield non-animal sentences — confirm label purity.
animal_topics = [
    'tropical birds', 'mountain mammals', 'deep sea fish', 'desert reptiles', 'polar bears', 'forest amphibians',
    'grassland herbivores', 'savannah carnivores', 'freshwater crustaceans', 'coral reef organisms', 'nocturnal birds',
    'daytime predators', 'wetland insects', 'cave-dwelling animals', 'volcanic area natives', 'arctic fish',
    'coastal wildlife', 'suburban animals', 'urban birds', 'rural farm animals', 'temperate forest creatures',
    'coniferous forest dwellers', 'deciduous forest inhabitants', 'rainforest canopy wildlife', 'mangrove forest species',
    'peat bog animals', 'steppes wildlife', 'taiga biome animals', 'tundra wildlife', 'riverbank animals', 'lake dwellers',
    'swamp creatures', 'jungle animals', 'plateau wildlife', 'plain grazers', 'hill fauna', 'sand dune species',
    'marsh occupants', 'prairie dogs', 'estuary life forms', 'delta region animals', 'mountain goats', 'volcanic island birds',
    'cliff nesting birds', 'deep forest animals', 'shrubland insects', 'moorland birds', 'heathland animals',
    'chaparral wildlife', 'feral animals', 'domesticated animals', 'solitary animals', 'social animals', 'migratory species',
    'sedentary animals', 'ground-nesting birds', 'tree-dwelling mammals', 'burrowing animals', 'arboreal reptiles',
    'amphibious fish', 'saltwater fish', 'coldwater fish', 'warmwater fish', 'invasive species', 'endangered species',
    'protected species', 'hunted animals', 'poached animals', 'rescued animals', 'rehabilitated animals', 'released animals',
    'monitored species', 'tagged animals', 'photographed wildlife', 'studied species', 'well-known animals', 'rare animals',
    'common animals', 'keystone species', 'indicator species', 'pioneer species', 'native species', 'exotic species',
    'hybrid species', 'genetically modified animals', 'lab animals', 'zoo animals', 'safari park animals', 'pet animals',
    'working animals', 'performance animals', 'show animals', 'race animals', 'breeding animals', 'nesting birds',
    'hatching reptiles', 'spawning fish', 'moulting crustaceans', 'hibernating animals', 'estivating animals'
]

# List of non-animal topics generated by GPT 4.
# Each entry is later passed as the `topic` argument of generate_sentences to
# produce one batch of sentences for the non-animal dataset.
# NOTE(review): topics such as 'rural life', 'public parks' or 'sustainable farming'
# may still elicit sentences that mention animals — confirm label purity.
non_animal_topics = [
    'urban infrastructure', 'rural life', 'public parks', 'gardening tips', 'home decorating', 'culinary arts',
    'popular recipes', 'coffee culture', 'tea varieties', 'craft beers', 'wine regions', 'baking techniques',
    'festival celebrations', 'wedding traditions', 'dance styles', 'musical instruments', 'theater productions',
    'film genres', 'television series', 'comic books', 'graphic novels', "children's books", 'science fiction themes',
    'fantasy worlds', 'mystery plots', 'historical events', 'modern warfare', 'peace movements', 'political campaigns',
    'civil rights movements', 'non-profit organizations', 'global charities', 'environmental issues', 'climate change effects',
    'recycling methods', 'waste management', 'water conservation', 'renewable resources', 'solar power innovations',
    'wind energy', 'hydroelectric systems', 'smart homes', 'wearable technology', 'mobile applications',
    'gaming consoles', 'board games', 'puzzle solving', 'DIY projects', 'woodworking', 'metalworking', 'sewing crafts',
    'knitting patterns', 'pottery techniques', 'photography styles', 'digital art', 'street art', 'pop music evolution',
    'jazz history', 'classical composers', 'rock music icons', 'hip-hop culture', 'electronic dance music',
    'fashion trends', 'hairstyling tips', 'makeup tutorials', 'skincare routines', 'fitness regimes', 'yoga practices',
    'martial arts', 'team sports', 'extreme sports', 'outdoor adventures', 'camping essentials', 'hiking trails',
    'travel destinations', 'cultural landmarks', 'academic research', 'quantum computing', 'artificial intelligence',
    'neural networks', 'econometrics', 'behavioral psychology', 'public health studies', 'urban planning',
    'international relations', 'space physics', 'geographic information systems', 'biomedical engineering',
    'sustainability science', 'robotics technology', '3D printing technology', 'cryptographic systems',
    'virtual reality developments', 'augmented reality trends', 'astronomy', 'public health', 'urban design', 
    'world literature', 'sustainable farming', 'digital marketing'
]

In [None]:

# Initialize the OpenAI client
client = openai.OpenAI()

# Define the animal dataset
animal_dataset = []

# Define the non-animal dataset
non_animal_dataset = []

# Define the number of sentences per topic
num_sentences_per_topic = 20

# Define the maximum allowable difference in the number of sentences
max_error = 4

# Upper bound on generation attempts per topic, so a topic that never yields an
# acceptable batch cannot keep calling the API forever
max_attempts_per_topic = 10

# Define the GPT model
model = "gpt-4-turbo"

# Choose list of animal and non-animal topics
num_topics = 100
animal_topics = animal_topics[0:num_topics]
non_animal_topics = non_animal_topics[0:num_topics]


def _generate_topic_sentences(topic, num_sentences, tolerance, model, client, max_attempts):
    """Generate and format one batch of sentences for ``topic``, retrying on bad batch sizes.

    A batch is accepted when the number of formatted sentences differs from
    ``num_sentences`` by at most ``tolerance``.

    Returns:
        The formatted sentences of the first accepted batch.

    Raises:
        RuntimeError: if no batch is accepted within ``max_attempts`` attempts.
    """
    for _attempt in range(max_attempts):
        sentences_list = format_sentence_list(
            generate_sentences(topic, num_sentences, model, client)
        )
        difference = abs(len(sentences_list) - num_sentences)
        if difference <= tolerance:
            return sentences_list
        # Show the rejected batch so the cause of the mismatch can be inspected
        print(len(sentences_list), difference, sentences_list)
        print(f"Error generating sentences for '{topic}': Number of generated sentences is outside the acceptable error range.")
    raise RuntimeError(
        f"No acceptable batch of sentences for '{topic}' after {max_attempts} attempts."
    )


# Generate both datasets with the same procedure; batches are appended to the
# module-level lists as they arrive, so partial results survive a failure.
for label, topics, dataset in (
    ("Animal", animal_topics, animal_dataset),
    ("Non-animal", non_animal_topics, non_animal_dataset),
):
    for i, topic in enumerate(topics):
        print(f"{label} topic number: {i}")
        dataset.extend(
            _generate_topic_sentences(
                topic, num_sentences_per_topic, max_error, model, client, max_attempts_per_topic
            )
        )

# Do post-processing to get rid of empty sentences, whitespace, etc.
animal_dataset = format_sentence_list(animal_dataset)
non_animal_dataset = format_sentence_list(non_animal_dataset)


In [6]:
# Run the sentence clean-up over both datasets
animal_dataset, non_animal_dataset = map(
    format_sentence_list, (animal_dataset, non_animal_dataset)
)


In [7]:
# Report the number of sentences per class: animal first, then non-animal
print(len(animal_dataset), len(non_animal_dataset), sep='\n')

1999
1960


In [8]:
# Shuffle copies so the order of the original datasets is left untouched
animal_dataset_2 = list(animal_dataset)
random.shuffle(animal_dataset_2)

non_animal_dataset_2 = list(non_animal_dataset)
random.shuffle(non_animal_dataset_2)

# Preview the first ten sentences of each shuffled copy
print(animal_dataset_2[:10])
print(non_animal_dataset_2[:10])




['The complexity of the tree canopy creates a vertical labyrinth, offering different temperature and humidity layers critical for the survival of various arboreal species.', 'An ecologist noted, "The diversity among reptiles in arid areas is astonishing, from tiny skinks to large monitor lizards."', 'Photographing birds in flight requires not only a fast camera but an understanding of patterns and timing, often learned over years of observation and practice.', 'I remember my grandfather telling me how certain birds would leave their nesting areas if the forest was unhealthy.', 'Aquarists often prefer warmwater fish for their vibrant colors and dynamic behaviors, which make their aquariums come to life.', 'A family of ducks waddles down to the riverbank every morning, creating a quaint scene for early risers.', 'Every spring, I look forward to the symphony of lapwing calls as they return to their breeding grounds on the moors near my hometown.', '"Have you heard about the \'Enviropigs\'

In [9]:
# Save datasets
# Dots in the model name are swapped for commas before it is used in file names
model_name = model.replace(".", ",")
path_to_datasets = '../../datasets/'

# File names carry the nominal dataset size (sentences per topic x topics) and the model name
non_animal_dataset_path = path_to_datasets + f'non_animal_{num_sentences_per_topic*num_topics}_{model_name}.json'
animal_dataset_path = path_to_datasets + f'animal_{num_sentences_per_topic*num_topics}_{model_name}.json'

# Writing JSON data
with open(non_animal_dataset_path, 'w') as f:
    json.dump(non_animal_dataset, f, indent=4)

with open(animal_dataset_path, 'w') as f:
    json.dump(animal_dataset, f, indent=4)


In [10]:
import json

def load_dataset(file_path):
    """Read the JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

# Rebuild the file names the two datasets are stored under
animal_dataset_path = path_to_datasets + f'animal_{num_sentences_per_topic*num_topics}_{model_name}.json'
non_animal_dataset_path = path_to_datasets + f'non_animal_{num_sentences_per_topic*num_topics}_{model_name}.json'

# Read both datasets back from disk
animal_dataset = load_dataset(animal_dataset_path)
non_animal_dataset = load_dataset(non_animal_dataset_path)

# Sanity check: show the first five entries of each dataset, non-animal first
print(non_animal_dataset[0:5])
print(animal_dataset[0:5])

# Save as pd

import pandas as pd


# Concatenate the sentences (animal first) and build the parallel list of class labels
data = [*animal_dataset, *non_animal_dataset]
labels = ['Animal' for _ in animal_dataset] + ['Non-Animal' for _ in non_animal_dataset]

# Two-column frame: class label first, sentence text second
df = pd.DataFrame(dict(Label=labels, Sentence=data))

# Write the labelled sentences to CSV, leaving out the index column
csv_path = path_to_datasets + f'(non)animal_{num_sentences_per_topic*num_topics}_{model_name}.csv'
df.to_csv(csv_path, index=False)



['Urban infrastructure refers to the fundamental facilities and systems serving a city, including roads, bridges, water supply, and sewage networks.', '"Ever since the new subway line opened, getting to work has been so much faster!" exclaimed Sarah, clearly relieved.', 'In many cities, the aging water pipes pose a significant risk of contaminating drinking water.', 'The recent initiative to expand green spaces in urban areas not only beautifies the city but also improves air quality.', 'Smart cities utilize IoT technology to streamline traffic flow and enhance public transportation systems.']
['The resplendent quetzal, famed for its vibrant plumage, is a symbol of freedom in various Central American cultures.', '"You\'ll rarely find a color display quite as vivid as the scarlet macaw\'s wings," remarked the tour guide, peering through his binoculars.', 'Many toucans, known for their large, colorful bills, play a crucial role in forest ecosystems by dispersing seeds.', 'As a child, I o