# Procedurally generate prompts given category and prompt structure

## Downloading and processing dataset

Downloads the given dataset from Hugging Face and converts it to a `dict` structured as follows:
```
dataset: {
  dataset_x: {
      text: [],
      label: []
  },
  dataset_y: {
      -||-
  }
}
```
Or sorted:
```
dataset: {
  dataset_x: {
      label_a: [text_i, text_q, text_w],
      label_b: [          -||-         ]
  },
  dataset_y: {
      -||-
  }
}
```

Saves as JSON

### Define the DatasetLoader class

In [1]:
# Requirements
!pip install datasets



In [None]:
# Imports
from datasets import load_dataset
import json
import copy

In [None]:
class DatasetLoader():
    ''' Handles download and conversion of a Hugging Face dataset to type(dict) '''
    def __init__(self, dataset_name:str, labels:tuple, convert_label_index_to_string:bool = True):
        ''' Loads dataset_name immediately and prepares empty caches.

        dataset_name: name passed to load_dataset
        labels: label names, indexed by the integer label stored in each data point
        convert_label_index_to_string: if True, CreateDictionaryFromDataset stores
            label names (looked up in labels) instead of the raw label indices
        '''
        self.dataset_name:str = dataset_name
        self.dataset = self.LoadDatasetFromHuggingface()

        self.labels:tuple = labels

        self.convert_label_index_to_string:bool = convert_label_index_to_string

        # Caches, filled lazily by the Create* methods
        self.dataset_dictionary:dict = None
        self.dataset_dictionary_split:dict = None

    def LoadDatasetFromHuggingface(self):
        ''' Returns self.dataset_name dataset from huggingface '''
        return load_dataset(self.dataset_name)

    def _BuildDictionary(self, convert_label_index_to_string:bool) -> dict:
        ''' Builds a fresh {split: {"text": [...], "label": [...]}} dictionary without touching any cache.

        NOTE: the key layout of the original implementation is kept on purpose:
        "text" receives the label (name or index) and "label" receives the sample text.
        CreateSortedDictionaryFromDataset relies on exactly this layout.
        '''
        built:dict = {}
        for key, value in self.dataset.items():
            entries:dict = {"text": [], "label": []}
            for data_point in value:
                label = data_point["label"]
                if convert_label_index_to_string:
                    label = self.labels[label]
                entries["text"].append(label)
                entries["label"].append(data_point["text"])
            built[key] = entries
        return built

    def CreateDictionaryFromDataset(self, reprocess:bool = False) -> dict:
        ''' Returns dictionary representation of currently loaded dataset.

        The result is cached; pass reprocess=True to rebuild it.
        '''
        if self.dataset_dictionary is not None and not reprocess:
            return self.dataset_dictionary

        self.dataset_dictionary = self._BuildDictionary(self.convert_label_index_to_string)
        return self.dataset_dictionary

    def CreateSortedDictionaryFromDataset(self, reprocess:bool = False) -> dict:
        ''' Returns sorted dictionary of currently loaded dataset: {split: {label_name: [text, ...]}}.

        Label names are always used as keys, regardless of convert_label_index_to_string.
        The result is cached; pass reprocess=True to rebuild it.
        '''
        if self.dataset_dictionary_split is not None and not reprocess:
            return self.dataset_dictionary_split

        if self.convert_label_index_to_string:
            # The unsorted dictionary already holds label names, so share its cache.
            source:dict = self.CreateDictionaryFromDataset(reprocess=reprocess)
        else:
            # Build a string-labelled copy on the side so the index-labelled cache
            # (and the instance flag) are never modified.
            source:dict = self._BuildDictionary(convert_label_index_to_string=True)

        sorted_dictionary:dict = {}
        for key, split in source.items():
            # Every label gets a bucket, even when no sample carries it.
            combined:dict = {str(label): [] for label in self.labels}
            for label_name, text in zip(split["text"], split["label"]):
                combined[str(label_name)].append(text)
            sorted_dictionary[key] = combined

        self.dataset_dictionary_split = sorted_dictionary
        return self.dataset_dictionary_split

    @staticmethod
    def SaveDataset(dataset_dictionary:dict, save_directory:str = "/content/", file_name:str = "dataset"):
        ''' Saves dataset as file_name.json to save_directory in JSON format.

        save_directory may be given with or without a trailing path separator.
        '''
        import os  # local import: only needed here

        full_path:str = os.path.join(save_directory, f"{file_name}.json")
        with open(full_path, "w", encoding="utf-8") as json_file:
            json.dump(dataset_dictionary, json_file)

    @staticmethod
    def PrintDataset(dataset_dictionary:dict, indent:int = 4, example_amount:int = 3):
        ''' Prints a shortened view of the dataset.

        Nested dictionaries are printed recursively with indent spaces per level;
        for every other value the first example_amount items and the number of
        remaining items are shown.
        '''
        def Recursive(dictionary:dict, depth:int):
            for key, value in dictionary.items():
                print(f"{' ' * indent * depth}{key}:")
                if isinstance(value, dict):
                    Recursive(value, depth + 1)
                else:
                    value_indent:str = " " * indent * (depth + 1)
                    # Never report a negative remainder for short sequences.
                    remaining:int = max(len(value) - example_amount, 0)
                    print(f"{value_indent}{type(value).__name__}: {value[0:example_amount]} +{remaining} items")

        Recursive(dataset_dictionary, 0)

### How to use the DatasetLoader class 

In [None]:
# Instantiate DatasetLoader object for the "emotion" dataset.
# The position of each name in `labels` is the integer label index it stands for.
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
dataset_loader = DatasetLoader(dataset_name="emotion", labels=labels, convert_label_index_to_string=True)

In [None]:
# Load raw (unsorted) dataset as dictionary: one entry per split
dataset_unsorted:dict = dataset_loader.CreateDictionaryFromDataset()
# Save as dataset_unsorted.json in the default save_directory ("/content/")
dataset_loader.SaveDataset(dataset_unsorted, file_name="dataset_unsorted")

In [None]:
# Load sorted dataset as dictionary: per split, each label name maps to its list of texts
dataset_sorted:dict = dataset_loader.CreateSortedDictionaryFromDataset()
# Save as dataset_sorted.json in the default save_directory ("/content/")
dataset_loader.SaveDataset(dataset_sorted, file_name="dataset_sorted")

## Generate prompts

Generates prompts that follow a given prompt_structure.

### Define the PromptCreator class


In [None]:
# Imports
import random

In [None]:
class PromptCreator():
    ''' Iterable object that returns procedurally generated (prompt, category) tuples built from prompt_structure '''
    def __init__(self, prompt_structure:dict, labels:tuple, iteration_amount:int = -1, convert_label_to_index:bool = False):
        ''' Stores a private deep copy of prompt_structure.

        prompt_structure: ordered parts; each part is either a sequence of alternatives
            or a dict mapping a category to a sequence of alternatives
        labels: known categories; only categories found here are reported
        iteration_amount: number of prompts per iteration, negative means unlimited
        convert_label_to_index: report the category as its index in labels
        '''
        self.labels:tuple = labels
        self.convert_label_to_index:bool = convert_label_to_index
        self.iteration_amount:int = iteration_amount
        self.iteration_index:int = 0
        self.prompt_structure:dict = copy.deepcopy(prompt_structure)

    def __iter__(self):
        ''' Iterator protocol: restarts counting and returns self '''
        self.iteration_index = 0
        return self

    def __call__(self, iteration_amount:int):
        ''' Sets self.iteration_amount, restarts counting and returns self '''
        self.iteration_amount = iteration_amount
        return self.__iter__()

    def __next__(self) -> tuple:
        ''' Iterator protocol: returns the next (prompt, category) tuple '''
        limit_reached:bool = self.iteration_amount >= 0 and self.iteration_index >= self.iteration_amount
        if limit_reached:
            # Reset so the object can be iterated again afterwards
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1

        fragments:list = []
        chosen_category:str = ""
        for prompt_part in self.prompt_structure.values():
            if type(prompt_part) is not dict:
                fragments.append(random.choice(prompt_part))
                continue

            # Category-dependent part: pick the category first, then one of its phrasings
            random_category:str = random.choice(list(prompt_part))
            fragments.append(random.choice(prompt_part[random_category]))
            if random_category in self.labels:
                chosen_category = random_category

        prompt:str = "".join(fragments)
        if self.convert_label_to_index:
            return (prompt, self.labels.index(chosen_category))
        return (prompt, chosen_category)

### Synonym helper

In [None]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

In [None]:
def SynonymHelper(word:str) -> tuple:
    ''' Returns the sorted, de-duplicated lemma names of all wn.synsets(word), each prefixed with "a " or "an ".

    If no lemma is found, the tuple contains only the unmodified word.
    '''
    def A_Or_An(word:str) -> str:
        # The article is chosen from the first letter only
        article:str = 'an ' if word.lower()[0] in ('a','e','i','o','u') else 'a '
        return article + word

    unique_synonyms:set = set()
    for synset in wn.synsets(word):
        unique_synonyms.update(A_Or_An(lemma.name()) for lemma in synset.lemmas())

    if not unique_synonyms:
        return (word,)

    return tuple(sorted(unique_synonyms))

In [None]:
# Example call: as the last expression of the cell, the returned tuple is displayed
SynonymHelper("sad")

### How to use the PromptCreator class


In [None]:
# Holds structure for prompts.
# Parts are concatenated in order; a tuple part contributes one random phrase,
# a dict part first picks a random category and then one of its phrases.
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'a melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleeful'),
        "love": ('a', ),
        # Phrases describing anger (previously a copy of the "sad" phrases)
        "anger": ('an angry', 'a furious', 'a mad', 'an enraged', 'a', 'an irritated', 'an annoyed'),
        "fear": ('a scared', 'a fearful', 'a'),
        "surprise": ('a surprised', 'an unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}

# Instantiate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.labels, convert_label_to_index=False)

In [None]:
# Create examples from class object:
# Example 1: set the amount on the object, then iterate it directly
prompt_creator.iteration_amount = 5
for random_prompt in prompt_creator:
    print(random_prompt) 

# Example 2: calling the object sets the amount and returns the iterator
for random_prompt in prompt_creator(5):
    print(random_prompt)

# Example 3: using next(); a negative amount means no limit
prompt_creator.iteration_amount = -1
print(next(prompt_creator))

## Pair generated prompt with text-data


Pairs procedurally generated prompts from prompt_creator with random text from dataset into tuples.

### Define PromptDatasetPair class

In [None]:
class PromptDatasetPair():
    ''' Iterable that pairs each generated prompt with a random dataset text of the prompt's category '''
    def __init__(self, dataset:dict, prompt_creator:PromptCreator, iteration_amount:int = 5):
        ''' dataset maps a category to its texts; prompt_creator is switched to unlimited iteration '''
        # The prompt creator must never stop on its own; this object does the counting
        prompt_creator.iteration_amount = -1
        self.prompt_creator:PromptCreator = prompt_creator
        self.dataset:dict = dataset
        self.iteration_index:int = 0
        self.iteration_amount:int = iteration_amount

    def __iter__(self):
        ''' Iterator protocol: restarts counting and returns self '''
        self.iteration_index = 0
        return self

    def __call__(self, iteration_amount:int):
        ''' Sets self.iteration_amount, restarts counting and returns self '''
        self.iteration_amount = iteration_amount
        return self.__iter__()

    def __next__(self) -> tuple:
        ''' Iterator protocol: returns the next (prompt, text) tuple '''
        limit_reached:bool = self.iteration_amount >= 0 and self.iteration_index >= self.iteration_amount
        if limit_reached:
            # Reset so the object can be iterated again afterwards
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1

        prompt, category = next(self.prompt_creator)
        return (prompt, random.choice(self.dataset[category]))

### How to use PromptDatasetPair class

In [None]:
# Instantiate PromptDatasetPair object on the "train" split of the sorted dataset.
# Note: the constructor sets prompt_creator.iteration_amount to -1 (unlimited).
prompt_dataset_pair_object = PromptDatasetPair(dataset=dataset_sorted["train"], prompt_creator=prompt_creator)

In [None]:
# Create examples from class object:
# Example 1: set the amount on the object, then iterate it directly
prompt_dataset_pair_object.iteration_amount = 5
for random_datapoint in prompt_dataset_pair_object:
    print(random_datapoint) 

# Example 2: calling the object sets the amount and returns the iterator
for random_datapoint in prompt_dataset_pair_object(5):
    print(random_datapoint)

# Example 3: using next(); a negative amount means no limit
prompt_dataset_pair_object.iteration_amount = -1
print(next(prompt_dataset_pair_object))

## Few shot learning

Puts procedurally generated prompts from prompt_creator into list of random size.

### Define FewShotGenerator class

In [None]:
class FewShotGenerator():
    ''' Iterable that returns (text, label) few-shot samples.

    text lists a random number of solved (prompt, text) examples followed by one
    final unsolved prompt; label is the dataset text belonging to that final prompt.
    '''
    def __init__(self, prompt_dataset_pair:"PromptDatasetPair", list_structure:dict, iteration_amount:int = 5, few_shot_range:tuple=(0,3)):
        ''' Stores the configuration and switches prompt_dataset_pair to unlimited iteration.

        prompt_dataset_pair: callable with an amount, returning an iterable of (prompt, text)
        list_structure: dict with the keys "index" (tuple of enumeration tuples),
            "start", "middle" and "end" (tuples of separator alternatives)
        iteration_amount: number of samples per iteration, negative means unlimited
        few_shot_range: inclusive (min, max) number of solved examples per sample;
            max + 1 must not exceed the length of the "index" enumerations
        '''
        self.prompt_dataset_pair:"PromptDatasetPair" = prompt_dataset_pair
        self.prompt_dataset_pair.iteration_amount = -1
        self.list_structure:dict = list_structure
        self.few_shot_range:tuple = few_shot_range
        self.iteration_amount:int = iteration_amount
        self.iteration_index:int = 0

    def __iter__(self):
        ''' Iterator protocol that returns an iterator '''
        self.iteration_index = 0
        return self

    def __call__(self, iteration_amount:int):
        ''' Allows the assignment of self.iteration_amount by call '''
        self.iteration_amount = iteration_amount
        self.iteration_index = 0
        return self

    def __next__(self) -> tuple:
        ''' Iterator protocol that returns the next (text, label) sample '''
        if self.iteration_index >= self.iteration_amount and self.iteration_amount >= 0:
            # Reset so the object can be iterated again afterwards
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1

        # One random layout per sample
        few_shot_amount:int = random.randint(self.few_shot_range[0], self.few_shot_range[1])
        indices:tuple = random.choice(self.list_structure["index"])
        start:str = random.choice(self.list_structure["start"])
        middle:str = random.choice(self.list_structure["middle"])
        end:str = random.choice(self.list_structure["end"])

        parts:list = []
        label:str = ""
        # few_shot_amount solved examples plus one final query
        for index, (single_prompt, single_text) in enumerate(self.prompt_dataset_pair(few_shot_amount + 1)):
            parts.append(indices[index] + start + single_prompt)
            if index == few_shot_amount:
                # Last entry is the query: its text is the expected answer, not part of the input
                label = single_text
            else:
                parts.append(middle + single_text + end)

        return ("".join(parts), label)

### How to use FewShotGenerator class

In [None]:
# Holds structure for how prompts should be listed:
#   "index":  alternative enumerations (numbers, lowercase or uppercase letters)
#   "start":  separator between the enumeration and the prompt
#   "middle": separator between a prompt and its text
#   "end":    separator after a text
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Instantiate FewShotGenerator object
few_shot_object = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object, list_structure=list_structure, few_shot_range=(0,3))

In [None]:
# Create examples from class object:
# Example 1: set the amount on the object, then iterate it directly
few_shot_object.iteration_amount = 5
for random_datapoint in few_shot_object:
    print(random_datapoint)

# Example 2: calling the object sets the amount and returns the iterator
for random_datapoint in few_shot_object(5):
    print(random_datapoint)

# Example 3: using next(); a negative amount means no limit
few_shot_object.iteration_amount = -1
print(next(few_shot_object))

## Procedural dataset generator

Applies all classes above to create our dataset.

### Define DatasetGenerator class

In [None]:
class DatasetGenerator():
    ''' Collects everything few_shot_generator yields into a {"text": [...], "label": [...]} dataset '''
    def __init__(self, few_shot_generator:FewShotGenerator):
        self.few_shot_generator:FewShotGenerator = few_shot_generator

    def Create(self) -> dict:
        ''' Iterates few_shot_generator once and returns the collected dataset as dictionary '''
        texts:list = []
        labels:list = []
        for text, label in self.few_shot_generator:
            texts.append(text)
            labels.append(label)

        return {"text": texts, "label": labels}

### How to use DatasetGenerator class

In [None]:
# Instantiate DatasetGenerator object with a FewShotGenerator limited to 1 000 samples
# (DatasetGenerator is declared to take a FewShotGenerator, not a PromptDatasetPair)
dataset_generator = DatasetGenerator(few_shot_object(1_000))

In [None]:
# Create our dataset: a dict with the keys "text" and "label"
procedual_dataset:dict = dataset_generator.Create()
# Save our dataset as my_emotion_dataset.json in the default save_directory ("/content/")
DatasetLoader.SaveDataset(dataset_dictionary=procedual_dataset, file_name="my_emotion_dataset")

## Procedural datasets generator

Creates concatenation of all parts of our dataset

### Define DatasetsGenerator class

In [None]:
class DatasetsGenerator():
    ''' Creates a generated dataset for every split of a downloaded dataset '''
    def __init__(self, dataset_name:str, labels:tuple, prompt_structure:dict, list_structure:dict, few_shot_range:tuple=(0,3), dataset_training_size:int = 10_000, dataset_other_size:int = 1_000):
        ''' Downloads dataset_name and prepares one DatasetGenerator per split.

        labels: label names, indexed by the integer label of the dataset
        prompt_structure: structure handed to PromptCreator
        list_structure: structure handed to FewShotGenerator
        few_shot_range: inclusive (min, max) number of solved examples per sample;
            max + 1 must not exceed the length of the list_structure["index"] enumerations
        dataset_training_size: number of samples for the first split of the dataset
        dataset_other_size: number of samples for every following split
        '''
        self.dataset_loader:DatasetLoader = DatasetLoader(dataset_name=dataset_name, labels=labels)
        self.prompt_creator:PromptCreator = PromptCreator(prompt_structure=prompt_structure, labels=labels)
        self.dataset_creators:dict = {}
        self.list_structure = list_structure
        self.few_shot_range:tuple = few_shot_range

        sorted_splits:dict = self.dataset_loader.CreateSortedDictionaryFromDataset()
        for position, (key, item) in enumerate(sorted_splits.items()):
            # The first split is treated as the training split
            size:int = dataset_training_size if position == 0 else dataset_other_size
            prompt_dataset_pair_object = PromptDatasetPair(dataset=item, prompt_creator=self.prompt_creator)
            few_shot_object = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object, list_structure=self.list_structure, few_shot_range=self.few_shot_range)
            self.dataset_creators[key] = DatasetGenerator(few_shot_object(size))

    def Create(self) -> dict:
        ''' Returns dictionary {split: created dataset} '''
        return {key: item.Create() for key, item in self.dataset_creators.items()}

### How to use DatasetsGenerator class

In [None]:
# Label names of the "emotion" dataset; the tuple position is the integer label index
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
# Only the categories listed under "part2" can be chosen for a prompt
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'a melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleeful'),
        # "love": ('a', ),
        # "anger": ('an angry', 'a furious', 'a mad', 'an enraged', 'a', 'an irritated', 'an annoyed'),
        # "fear": ('a scared', 'a fearful', 'a'),
        # "surprise": ('a surprised', 'an unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Download, generate every split, then print a summary and save as JSON
my_complete_dataset_object = DatasetsGenerator(dataset_name="emotion", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure, few_shot_range=(0,3), dataset_training_size=10_000, dataset_other_size=1_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_emotion_dataset")


# Putting it all together

## Creating a procedurally generated dataset from [emotion](https://huggingface.co/datasets/emotion) dataset

### From scratch

In [None]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [None]:
# 2. Instantiate DatasetLoader object (the tuple position of a label name is its integer index)
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
dataset_loader = DatasetLoader(dataset_name="emotion", labels=labels, convert_label_index_to_string=True)

# 3. Load sorted dataset: per split, each label name maps to its list of texts
dataset_sorted:dict = dataset_loader.CreateSortedDictionaryFromDataset()

# 4. Print parts of the unsorted dataset
DatasetLoader.PrintDataset(dataset_loader.CreateDictionaryFromDataset())

In [None]:
# 5. Define structure for prompts
# Only the categories listed under "part2" can be chosen for a prompt
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'a melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleeful'),
        # "love": ('a', ),
        # "anger": ('an angry', 'a furious', 'a mad', 'an enraged', 'a', 'an irritated', 'an annoyed'),
        # "fear": ('a scared', 'a fearful', 'a'),
        # "surprise": ('a surprised', 'an unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}

# 6. Instantiate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.labels)

# 7. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

In [None]:
# 8. Instantiate PromptDatasetPair objects, one per split (all share the same prompt_creator)
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_sorted["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_sorted["test"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_validation = PromptDatasetPair(dataset=dataset_sorted["validation"], prompt_creator=prompt_creator)

# 9. Check to see that it is working: prints one (prompt, text) tuple per split
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))
print(next(prompt_dataset_pair_object_validation))

In [None]:
# 10. Holds structure for how prompts should be listed
#     ("index": enumerations, "start"/"middle"/"end": separator alternatives)
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 11. Instantiate FewShotGenerator objects, one per split
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_validation = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_validation, list_structure=list_structure, few_shot_range=(0,3))

# 12. Check to see that it is working: prints one (text, label) tuple per split
print(next(few_shot_object_train))
print(next(few_shot_object_test))
print(next(few_shot_object_validation))

In [None]:
# 13. Instantiate DatasetGenerator objects set to different magnitude of datapoints
#     (calling a FewShotGenerator sets its iteration amount and returns it)
dataset_creator_train = DatasetGenerator(few_shot_object_train(1_000_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(10_000))
dataset_creator_validation = DatasetGenerator(few_shot_object_validation(1_000))

# 14. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()
procedual_dataset_validation:dict = dataset_creator_validation.Create()

# 15. Combine into one dictionary keyed by split name
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test,
    "validation": procedual_dataset_validation
}

# 16. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

In [None]:
# 17. Save our dataset as my_emotion_dataset.json in the default save_directory ("/content/")
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_emotion_dataset")

### Using DatasetsGenerator 

In [None]:
# Label names of the "emotion" dataset; the tuple position is the integer label index
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'a melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleeful'),
        "love": ('a', ),
        # Phrases describing anger (previously a copy of the "sad" phrases)
        "anger": ('an angry', 'a furious', 'a mad', 'an enraged', 'a', 'an irritated', 'an annoyed'),
        "fear": ('a scared', 'a fearful', 'a'),
        "surprise": ('a surprised', 'an unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Download, generate every split, then print a summary and save as JSON
my_complete_dataset_object = DatasetsGenerator(dataset_name="emotion", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure, few_shot_range=(0,3), dataset_training_size=1_000_000, dataset_other_size=10_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_emotion_dataset")

## Creating a procedurally generated dataset from [ag_news](https://huggingface.co/datasets/ag_news) dataset

### From scratch

In [None]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [None]:
# 2. Instantiate DatasetLoader object (the tuple position of a label name is its integer index)
labels:tuple = ("World", "Sports", "Business", "Sci/Tech")
dataset_loader = DatasetLoader(dataset_name="ag_news", labels=labels, convert_label_index_to_string=True)

# 3. Load sorted dataset: per split, each label name maps to its list of texts
dataset_sorted:dict = dataset_loader.CreateSortedDictionaryFromDataset()

# 4. Print parts of the unsorted dataset
DatasetLoader.PrintDataset(dataset_loader.CreateDictionaryFromDataset())

In [None]:
# 5. Define structure for prompts
# Parts are concatenated in order; the dict part ("part4") decides the category
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": ('an article ', 'a news article '),
    "part3": ('regarding ', 'containing ', 'about '),
    "part4": {
        "World": ('world', 'worlds', 'anything', 'something'),
        "Sports": ('sports', 'sport', 'anything', 'something'),
        "Business": ('business', 'anything', 'something'),
        "Sci/Tech": ('sci/tech', 'sci', 'tech', 'anything', 'something')
    }
}

# 6. Instantiate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.labels)

# 7. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

In [None]:
# 8. Instantiate PromptDatasetPair objects, one per split (both share the same prompt_creator)
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_sorted["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_sorted["test"], prompt_creator=prompt_creator)

# 9. Check to see that it is working: prints one (prompt, text) tuple per split
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))

In [None]:
# 10. Holds structure for how prompts should be listed
#     ("index": enumerations, "start"/"middle"/"end": separator alternatives)
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 11. Instantiate FewShotGenerator objects, one per split
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))

# 12. Check to see that it is working: prints one (text, label) tuple per split
print(next(few_shot_object_train))
print(next(few_shot_object_test))

In [None]:
# 13. Instantiate DatasetGenerator objects set to different magnitude of datapoints
#     (calling a FewShotGenerator sets its iteration amount and returns it)
dataset_creator_train = DatasetGenerator(few_shot_object_train(1_000_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(10_000))

# 14. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()

# 15. Combine into one dictionary keyed by split name
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test,
}

# 16. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

In [None]:
# 17. Save our dataset as my_ag_news_dataset.json in the default save_directory ("/content/")
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_ag_news_dataset")

### Using DatasetsGenerator 

In [None]:
# Label names of the "ag_news" dataset; the tuple position is the integer label index
labels:tuple = ("World", "Sports", "Business", "Sci/Tech")
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": ('an article ', 'a news article '),
    "part3": ('regarding ', 'containing ', 'about '),
    "part4": {
        "World": ('world', 'worlds', 'anything', 'something'),
        "Sports": ('sports', 'sport', 'anything', 'something'),
        "Business": ('business', 'anything', 'something'),
        "Sci/Tech": ('sci/tech', 'sci', 'tech', 'anything', 'something')
    }
}
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Download, generate every split, then print a summary and save as JSON
my_complete_ag_news_dataset_object = DatasetsGenerator(dataset_name="ag_news", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure, few_shot_range=(0,3), dataset_training_size=1_000_000, dataset_other_size=10_000)
my_complete_ag_news_dataset:dict = my_complete_ag_news_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_ag_news_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_ag_news_dataset, file_name="my_ag_news_dataset")

## Creating a procedurally generated dataset from [yelp_polarity](https://huggingface.co/datasets/yelp_polarity) dataset

### From scratch

In [None]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [None]:
# 2. Instantiate DatasetLoader object (the tuple position of a label name is its integer index)
labels:tuple = ("Negative", "Positive")
dataset_loader = DatasetLoader(dataset_name="yelp_polarity", labels=labels, convert_label_index_to_string=True)

# 3. Load sorted dataset: per split, each label name maps to its list of texts
dataset_sorted:dict = dataset_loader.CreateSortedDictionaryFromDataset()

# 4. Print parts of the unsorted dataset
DatasetLoader.PrintDataset(dataset_loader.CreateDictionaryFromDataset())

In [None]:
# 5. Define structure for prompts
# Parts are concatenated in order; the dict part ("part2") decides the category
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "Negative": ('a negative ', ),
        "Positive": ('a positive ', ),
    },
    "part3": ('review', ),
}

# 6. Instantiate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.labels)

# 7. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

In [None]:
# 8. Instantiate PromptDatasetPair objects, one per split (both share the same prompt_creator)
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_sorted["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_sorted["test"], prompt_creator=prompt_creator)

# 9. Check to see that it is working: prints one (prompt, text) tuple per split
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))

In [None]:
# 10. Holds structure for how prompts should be listed
#     ("index": enumerations, "start"/"middle"/"end": separator alternatives)
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 11. Instantiate FewShotGenerator objects, one per split
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))

# 12. Check to see that it is working: prints one (text, label) tuple per split
print(next(few_shot_object_train))
print(next(few_shot_object_test))

In [None]:
# 13. Instantiate DatasetGenerator objects set to different magnitude of datapoints
#     (calling a FewShotGenerator sets its iteration amount and returns it)
dataset_creator_train = DatasetGenerator(few_shot_object_train(1_000_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(10_000))

# 14. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()

# 15. Combine into one dictionary keyed by split name
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test,
}

# 16. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

In [None]:
# 17. Save our dataset as my_yelp_polarity_dataset.json in the default save_directory ("/content/")
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_yelp_polarity_dataset")

### Using DatasetsGenerator 

In [None]:
# Label names of the "yelp_polarity" dataset; the tuple position is the integer label index
labels:tuple = ("Negative", "Positive")
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reveal to me ', 'Get me ', 'May I have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "Negative": ('a negative ', ),
        "Positive": ('a positive ', ),
    },
    "part3": ('review', ),
}
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# NOTE(review): the variables below keep their "ag_news" names although they hold
# the yelp_polarity dataset; they are left unchanged in case other cells reference them.
my_complete_ag_news_dataset_object = DatasetsGenerator(dataset_name="yelp_polarity", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure, few_shot_range=(0,3), dataset_training_size=1_000_000, dataset_other_size=10_000)
my_complete_ag_news_dataset:dict = my_complete_ag_news_dataset_object.Create()
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_ag_news_dataset, file_name="my_yelp_polarity_dataset")

DatasetLoader.PrintDataset(my_complete_ag_news_dataset)