# Procedurally generate prompts given category and prompt structure

## Downloading and processing dataset

Downloads given dataset from hugginface and converts it to type(dict) as follows:
```
dataset: {
  dataset_x: {
      text: [],
      label: []
  },
  dataset_y: {
      -||-
  }
}
```
Or sorted:
```
dataset: {
  dataset_x: {
      label_a: [text_i, text_q, text_w],
      label_b: [          -||-         ]
  },
  dataset_y: {
      -||-
  }
}
```

Saves as JSON

### Define the DatasetLoader class

In [1]:
# Requirements
!pip install datasets

Collecting datasets
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.1 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 46.1 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 36.6 MB/s 
Collecting huggingface-hub<0.1.0,>=0.0.19
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 68.1 MB/s 
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 58.0 MB/s 
[?2

In [2]:
# Imports
import json
import copy

In [51]:
class DatasetLoader():
    ''' Handels download and convertion of huggingface dataset to type(dict) '''
    def __init__(self, convert_label_index_to_string:bool = True):        
        self.convert_label_index_to_string:bool = convert_label_index_to_string

    def LoadDatasetFromHuggingface(self, dataset_name:str):
        ''' Returns self.dataset_name dataset from huggingface '''
        from datasets import load_dataset
        return load_dataset(dataset_name)

    def CreateDictionaryFromDataset(self, dataset_name:str, labels:tuple, prompt_structure:dict, list_structure:dict) -> dict:
        ''' Returns dictionary representation of currently loaded dataset '''
        dataset = self.LoadDatasetFromHuggingface(dataset_name)

        self.dataset_dictionary:dict = {
            "datasets": {},
            "labels": labels,
            "prompt_structure": prompt_structure,
            "list_structure": list_structure,
        }
        data_structure:dict = {
            "text" : [],
            "label" : []
        }

        for key, value in dataset.items():
            self.dataset_dictionary["datasets"][key] = copy.deepcopy(data_structure)
            for data_point in value:
                if (self.convert_label_index_to_string):
                    label:str = labels[data_point["label"]]
                    self.dataset_dictionary["datasets"][key]["text"].append(label)
                else:
                    label:int = data_point["label"]
                    self.dataset_dictionary["datasets"][key]["text"].append(label)
                self.dataset_dictionary["datasets"][key]["label"].append(data_point["text"])
            
        return self.dataset_dictionary

    def LoadDatasetFromJSON(self, file_directory:str) -> dict:
        with open(file_directory) as json_file:
            data = json.load(json_file)
            self.dataset_dictionary:dict = data
            return data

    def SortDictionary(self, dataset_dictionary:dict, reprocess:bool = False) -> dict:
        ''' Returns sorted dictionary of currently loaded dataset '''
        cached_bool:bool = self.convert_label_index_to_string
        self.convert_label_index_to_string = True
        
        self.dataset_dictionary_split = copy.deepcopy(dataset_dictionary)
        for key in self.dataset_dictionary_split["datasets"].keys():
            combined:dict = {}
            
            for label in dataset_dictionary["labels"]:
                combined[str(label)] = []

            for text, label in zip(self.dataset_dictionary_split["datasets"][key]["text"], self.dataset_dictionary_split["datasets"][key]["label"]):
                combined[text].append(label)

            self.dataset_dictionary_split["datasets"][key] = combined

        self.convert_label_index_to_string = cached_bool
        return self.dataset_dictionary_split

    @staticmethod
    def SaveDataset(dataset_dictionary:dict, save_directory:str = "/content/", file_name:str = "dataset"):
        ''' Saves dataset as file_name to save_directory in JSON format '''
        json_content:str = json.dumps(dataset_dictionary)
        
        full_path:str = f"{save_directory}{file_name}.json"
        with open(full_path, "w+") as json_file:
            json_file.write(json_content)
            json_file.close()

    @staticmethod
    def PrintDataset(dataset_dictionary:dict, indent:int = 4, example_amount:int = 3):
        ''' Prints concatinated dataset '''
        iteration_index:int = -1
        
        def Recursive(dictionary:dict, iteration_index:int):
            iteration_index += 1
            for key, value in dictionary.items():
                indent_str:str = " " * indent * iteration_index
                print(f"{indent_str}{key}:")
                value_type:type = type(value)
                if value_type is dict:
                    Recursive(value, iteration_index)
                else:
                    indent_str:str = " " * indent * (iteration_index + 1)
                    print(f"{indent_str}{value_type.__name__}: {value[0:example_amount]} +{len(value)-example_amount} items")

        Recursive(dataset_dictionary, iteration_index)

### How to use the DatasetLoader class 

In [52]:
# Load it from huggingface
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
# Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleefull'),
        "love": ('a', ),
        "anger": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "fear": ('a scarred', 'a fearful', 'a'),
        "surprise": ('a surpised', 'unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}
# Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Instanciate DatasetLoader object
dataset_loader = DatasetLoader()
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="emotion", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_loader.SaveDataset(dataset_dictionary, file_name="emotion_dataset")

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [53]:
# Load it from file
# Instanciate DatasetLoader object
dataset_loader = DatasetLoader()
dataset_dictionary:dict = dataset_loader.LoadDatasetFromJSON("/content/emotion_dataset.json")

## Generate prompts

Generates prompts that follow a given prompt_structure.

### Define the PromptCreator class


In [54]:
# Imports
import random

In [55]:
class PromptCreator():
    ''' Itteratable object that returns procedurally generated prompts given category and prompt_structure '''
    def __init__(self, prompt_structure:dict, labels:tuple, iteration_amount:int = -1, convert_label_to_index:bool = False):
        self.prompt_structure:dict = copy.deepcopy(prompt_structure)
        self.labels:tuple = labels
        self.iteration_amount:int = iteration_amount
        self.convert_label_to_index:bool = convert_label_to_index
        self.iteration_index:int = 0

    def __iter__(self):
        ''' Iterator protocol that returns an iterator '''
        self.iteration_index = 0
        return self
    
    def __call__(self, iteration_amount:int):
        ''' Allows the assignment of self.iteration_amount by call '''
        self.iteration_amount = iteration_amount
        self.iteration_index = 0
        return self

    def __next__(self) -> tuple:
        ''' Iterator protocol that returns the next item '''
        if self.iteration_index >= self.iteration_amount and self.iteration_amount >= 0:
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1

        chosen_category:str = ""
        prompt:str = ""
        for prompt_part in self.prompt_structure.values():
            if type(prompt_part) is dict:
                random_category:str = random.choice(list(prompt_part))
                prompt += random.choice(prompt_part[random_category])
                if random_category in self.labels:
                    chosen_category = random_category
            else:
                prompt += random.choice(prompt_part)

        return (prompt, chosen_category if not self.convert_label_to_index else self.labels.index(chosen_category))

### Synonym helper

In [57]:
def SynonymHelper(word:str) -> tuple:
    import numpy as np
    import nltk
    from nltk.corpus import wordnet as wn
    nltk.download('wordnet')
    def A_Or_An(word:str) -> str:
        vowels:tuple = ('a','e','i','o','u')
        if word.lower()[0] in vowels:
            word = 'an '+ word
        else:
            word = 'a ' + word
        return word

    synonyms:list = []
    for syn in wn.synsets(word):
        for lm in syn.lemmas():
            synonyms.append(A_Or_An(lm.name())) # adding into synonyms
    synonyms = np.unique(synonyms, return_index=False, return_inverse=False, return_counts=False, axis=None)
    if len(synonyms.tolist()) == 0:
        synonyms = np.append(synonyms, word)
    
    return tuple(synonyms.tolist())

In [58]:
SynonymHelper("sad")

('a deplorable',
 'a distressing',
 'a lamentable',
 'a pitiful',
 'a sad',
 'a sorry')

### How to use the PromptCreator class


In [59]:
# Instanciate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.dataset_dictionary["labels"], convert_label_to_index=False)

In [60]:
# Create examples from class object:
# Example 1:
prompt_creator.iteration_amount = 5
for random_prompt in prompt_creator:
    print(random_prompt) 

# Example 2:
for random_prompt in prompt_creator(5):
    print(random_prompt)

# Example 3: using next()
prompt_creator.iteration_amount = -1
print(next(prompt_creator))

('Print a piece of text', 'fear')
('Get me a depressed piece of text', 'anger')
('Make a piece of text', 'joy')
('Reveal a happy citation', 'joy')
('Please show me a sorrowful citation', 'anger')
('Can you give to me a surpised sentence', 'surprise')
('Show me an unglad citation', 'anger')
('Get me unexpected sentence', 'surprise')
('Create a scarred citation', 'fear')
('Print a quote', 'love')
('May I a have a sorrowful sentence', 'anger')


## Pair generated prompt with text-data


Pairs procedurally generated prompts from prompt_creator with random text from dataset into tuples.

### Define PromptDatasetPair class

In [77]:
class PromptDatasetPair():
    ''' Pairs procedurally generated prompts from prompt_creator with random text from dataset into tuples '''
    def __init__(self, dataset:dict, prompt_creator:PromptCreator, iteration_amount:int = 5):
        self.dataset:dict = dataset
        self.prompt_creator:PromptCreator = prompt_creator
        self.prompt_creator.iteration_amount = -1
        self.iteration_amount:int = iteration_amount
        self.iteration_index:int = 0
    
    def __iter__(self):
        ''' Iterator protocol that returns an iterator '''
        self.iteration_index = 0
        return self
    
    def __call__(self, iteration_amount:int):
        ''' Allows the assignment of self.iteration_amount by call '''
        self.iteration_amount = iteration_amount
        self.iteration_index = 0
        return self

    def __next__(self) -> tuple:
        ''' Iterator protocol that returns the next item '''
        if self.iteration_index >= self.iteration_amount and self.iteration_amount >= 0:
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1
        
        prompt, category = next(self.prompt_creator)
        text:str = random.choice(self.dataset[category])

        return (prompt, text)

### How to use PromptDatasetPair class

In [78]:
# Instanciate PromptDatasetPair object
sorted_dataset:dict = dataset_loader.SortDictionary(dataset_loader.dataset_dictionary)
prompt_dataset_pair_object = PromptDatasetPair(dataset=sorted_dataset["datasets"]["train"], prompt_creator=prompt_creator)

In [79]:
# Create examples from class object:
# Example 1:
prompt_dataset_pair_object.iteration_amount = 5
for random_datapoint in prompt_dataset_pair_object:
    print(random_datapoint) 

# Example 2:
for random_datapoint in prompt_dataset_pair_object(5):
    print(random_datapoint)

# Example 3: using next()
prompt_dataset_pair_object.iteration_amount = -1
print(next(prompt_dataset_pair_object))

('This is a sentence', 'i feel honored that you would think of me as inspiring')
('Can you give to me a statement', 'i am feeling a curious sense of relief a lightness that i never thought possible back when sex seemed to be the most desirable of desires and the ultimate act of self validation')
('Create a piece of text', 'i say his helpless the phone muttered the i love you love his feeling always feel very sweet always feel to have him with me i nothing a person undertaking no matter where there is a he')
('This is a piece of text', 'i was feeling quite broke')
('Write a fearful sentence', 'i am left to feel helpless to do anything')
('Reveal a quote', 'i feel myself uncertain as to the next step to take')
('Could i have a statement', 'i feel the time at hand my beloved signals his agreement')
('Display a statement', 'i forgive myself that i have accepted and allowed myself to feel terrified when i can not move myself or speak or scream in sleep paralysis')
('Write a scarred citation

## Few shot learning

Puts procedurally generated prompts from prompt_creator into list of random size.

### Define FewShotGenerator class

In [80]:
class FewShotGenerator():
    def __init__(self, prompt_dataset_pair:PromptDatasetPair, list_structure:dict, iteration_amount:int = 5, few_shot_range:tuple=(0,3)):
        self.prompt_dataset_pair:PromptDatasetPair = prompt_dataset_pair
        self.prompt_dataset_pair.iteration_amount = -1
        self.list_structure:dict = list_structure
        self.few_shot_range:tuple = few_shot_range
        self.iteration_amount:int = iteration_amount
        self.iteration_index:int = 0
    
    def __iter__(self):
        ''' Iterator protocol that returns an iterator '''
        self.iteration_index = 0
        return self
    
    def __call__(self, iteration_amount:int):
        ''' Allows the assignment of self.iteration_amount by call '''
        self.iteration_amount = iteration_amount
        self.iteration_index = 0
        return self

    def __next__(self) -> tuple:
        ''' Iterator protocol that returns the next item '''
        if self.iteration_index >= self.iteration_amount and self.iteration_amount >= 0:
            self.iteration_index = 0
            raise StopIteration

        self.iteration_index += 1
        
        few_shot_amount:int = random.randint(self.few_shot_range[0], self.few_shot_range[1])
        indices:tuple = random.choice(self.list_structure["index"])
        start:str = random.choice(self.list_structure["start"])
        middle:str = random.choice(self.list_structure["middle"])
        end:str = random.choice(self.list_structure["end"])

        text:str = ""
        label:str = ""
        index:int = 0
        for single_prompt, single_text in self.prompt_dataset_pair(few_shot_amount + 1):
            text += indices[index] + start + single_prompt
            if index == few_shot_amount:
                label = single_text
            else:
                text += middle + single_text + end
            index += 1

        return (text, label)

### How to use FewShotGenerator class

In [82]:
# Instanciate FewShotGenerator object
few_shot_object = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object, list_structure=dataset_dictionary["list_structure"], few_shot_range=(0,3))

In [83]:
# Create examples from class object:
# Example 1:
few_shot_object.iteration_amount = 5
for random_datapoint in few_shot_object:
    print(random_datapoint)

# Example 2:
for random_datapoint in few_shot_object(5):
    print(random_datapoint)

# Example 3: using next()
few_shot_object.iteration_amount = -1
print(next(few_shot_object))

('A\nMake a fearful piece of text', 'im feeling indecisive about what i want to do with the rest of my life')
('1: Get me a piece of text\nim not being fair to xia by doing it this way if he feels frightened by the work i do it that his fault\n\n2: This is a statement\ni am ever feeling nostalgic about the fireplace i will know that it is still on the property\n\n3: Display a delightful citation', 'i got a haircut today so yes i feel handsome')
('A: Get me a piece of text\nive heard stories about julie baileys treatment before now but this is the first time i seen anything in print and it makes me feel deeply ashamed that someone who stood up neglected nhs patients and their families can become so isolated in her own community\n\nB: Please show me melancholic sentence\ni feel my blood pound up my back and in my ears and i throw up it hurts point blank and period it hurts\n\nC: Create a statement\ni feel like i havent been as compassionate toward him as i should be\n\nD: Make a sad cita

## Procedual dataset generator

Applies all classes above to create our dataset.

### Define DatasetGenerator class

In [84]:
class DatasetGenerator():
    ''' Uses PromptDatasetPair class to create a dataset '''
    def __init__(self, few_shot_generator:FewShotGenerator):
        self.few_shot_generator:FewShotGenerator = few_shot_generator
    
    def Create(self) -> dict:
        ''' Returns dictionary of created dataset '''
        created_dataset:dict = {
            "text": [],
            "label": []
        }
        for text, label in self.few_shot_generator:
            created_dataset["text"].append(text)
            created_dataset["label"].append(label)

        return created_dataset

### How to use DatasetGenerator class

In [85]:
# Instanciate DatasetGenerator object
dataset_generator = DatasetGenerator(prompt_dataset_pair_object(1_000))

In [86]:
# Create our dataset
procedual_dataset:dict = dataset_generator.Create()
# Save our dataset
DatasetLoader.SaveDataset(dataset_dictionary=procedual_dataset, file_name="my_emotion_dataset")

## Procedual datasets generator

Creates concatenation of all parts of our dataset

### Define DatasetsGenerator class

In [112]:
class DatasetsGenerator():
    ''' Creates dataset with all parts of downloaded dataset '''
    def __init__(self, sorted_dataset:dict, few_shot_range:tuple=(0,3), dataset_training_size:int = 10_000, dataset_other_size:int = 1_000):
        self.sorted_dataset:dict = sorted_dataset
        self.prompt_creator:PromptCreator = PromptCreator(prompt_structure=sorted_dataset["prompt_structure"], labels=sorted_dataset["labels"], convert_label_to_index=False)
        self.dataset_creators:dict = {}
        self.list_structure = sorted_dataset["list_structure"]

        is_beginning:bool = True
        for key, item in self.sorted_dataset["datasets"].items():
            prompt_dataset_pair_object = PromptDatasetPair(dataset=item, prompt_creator=self.prompt_creator)
            few_shot_object = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object, list_structure=self.list_structure, few_shot_range=few_shot_range)
            dataset_creator_object = DatasetGenerator(few_shot_object(dataset_training_size if is_beginning else dataset_other_size))
            self.dataset_creators[key] = dataset_creator_object
            is_beginning = False

    def Create(self) -> dict:
        ''' Returns dictionary of created dataset '''
        datasets:dict = {}
        data_structure:dict = {
            "text" : [],
            "label" : []
        }

        for key, item in self.dataset_creators.items():
            datasets[key] = item.Create()

        return datasets

### How to use DatasetsGenerator class

In [122]:
my_complete_dataset_object = DatasetsGenerator(sorted_dataset=sorted_dataset, few_shot_range=(0,3), dataset_training_size=10_000, dataset_other_size=1_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_emotion_dataset")


train:
    text:
        list: ['a:\nThis is a sad sentence\ni feel all greedy\n\nb:\nCan you give to me unexpected citation\ni feel like i m trying to be that guy who hangs out with curious george\n\nc:\nWrite a delightful quote', 'A. May I a have an unhappy citation\ni cant sleep and re read happy posts and i go past the one about picnic day and i get so happy im like james you make me so happy i love you and then repeat as soon as i feel jealous\nB. May I a have a happy statement\ni even feel her hair looks superior here\nC. Write a delightful quote\nive been feeling the demands of my three beloved males pushing and pulling spinning me around as i dance to the beat of their drum\nD. Could i have a quote', '1.\nReveal a surpised sentence\nive been reading again and feeling pleasantly surprised to find my reading list contained four similar books a fine chance to compare and contrast differ\n2.\nWrite unexpected piece of text\nim feeling a little overwhelmed\n3.\nMay I a have a deligh

# Putting it all together

## Creating a procedurally generated dataset from [emotion](https://huggingface.co/datasets/emotion) dataset

### From scratch

In [114]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [115]:
# 2. Define labels
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
# 3. Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleefull'),
        "love": ('a', ),
        "anger": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "fear": ('a scarred', 'a fearful', 'a'),
        "surprise": ('a surpised', 'unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}
# 4. Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 5. Instanciate DatasetLoader object
dataset_loader = DatasetLoader()

# 5. Load dataset from huggingface
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="emotion", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_dictionary_sorted:dict = dataset_loader.SortDictionary(dataset_dictionary=dataset_dictionary)

# 6. Save it
dataset_loader.SaveDataset(dataset_dictionary, file_name="emotion_dataset")

# 7. Print parts of dataset
DatasetLoader.PrintDataset(dataset_dictionary["datasets"])

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

train:
    text:
        list: ['sad', 'sad', 'anger'] +15997 items
    label:
        list: ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong'] +15997 items
validation:
    text:
        list: ['sad', 'sad', 'love'] +1997 items
    label:
        list: ['im feeling quite sad and sorry for myself but ill snap out of it soon', 'i feel like i am still looking at a blank canvas blank pieces of paper', 'i feel like a faithful servant'] +1997 items
test:
    text:
        list: ['sad', 'sad', 'sad'] +1997 items
    label:
        list: ['im feeling rather rotten so im not very ambitious right now', 'im updating my blog because i feel shitty', 'i never make her separate from me because i don t ever want her to feel like i m ashamed with her'] +1997 items


In [116]:
# 8. Instanciate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.dataset_dictionary["labels"], convert_label_to_index=False)

# 9. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

('Reveal a surpised quote', 'surprise')
('Please show me a depressed citation', 'sad')
('Reaveal to me a happy sentence', 'joy')
('Reveal a joyful statement', 'joy')
('Please show me a fearful quote', 'fear')


In [117]:
# 10. Instanciate PromptDatasetPair objects
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["test"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_validation = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["validation"], prompt_creator=prompt_creator)

# 11. Check to see that it is working
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))
print(next(prompt_dataset_pair_object_validation))

('Get me a scarred piece of text', 'i feel is doubtful but then again i could be wrong')
('Print a not so happy quote', 'i feel around someone the more idiotic i feel hence the unintelligible blabbering')
('Reaveal to me a sorrowful statement', 'i feel like waiting for you to be online and you didnt makes me furious')


In [118]:
# 12. Instanciate FewShotGenerator object
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_validation = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_validation, list_structure=list_structure, few_shot_range=(0,3))

# 13. Check to see that it is working
print(next(few_shot_object_train))
print(next(few_shot_object_test))
print(next(few_shot_object_validation))

('a.\nPrint a quote\ni feel isolated as a stay at home mum shonas story notes d athe only negative for me is that i feel isolated as a stay at home mum\n\nb.\nWrite an unhappy piece of text\ni think that now if i were to ride it without you or with another person present i would feel disheartened\n\nc.\nCould i have a surpised piece of text\ni feel about my mommy amp me friends our friendships grew so naturally the strength of them surprised me\n\nd.\nDisplay a sorrowful quote', 'i started to feel dissatisfied by the ease and convenience of it all')
('1\nGet me unexpected piece of text\ni have spoken about before but the feeling is getting stronger and i am curious if others have similar thoughts\n\n2\nMay I a have a gleefull sentence\ni feel like i am less of a woman less of a person less valuable because im not married and not dating\n\n3\nCreate a not glad statement', 'i feel there is no excuse for lame invitations')
('1. Display a surpised statement\ni feel impressed by the profess

In [119]:
# 14. Instanciate DatasetGenerator objects set to different magnitude of datapoints
dataset_creator_train = DatasetGenerator(few_shot_object_train(100_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(1_000))
dataset_creator_validation = DatasetGenerator(few_shot_object_validation(1_000))

# 15. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()
procedual_dataset_validation:dict = dataset_creator_validation.Create()

# 16. Concatinate into one dictionary
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test,
    "validation": procedual_dataset_validation
}

# 17. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

train:
    text:
        list: ['A:\nDisplay a surpised citation\ni knows is the boy makes her feel weird and yuuki doesnt know what to tell her\n\nB:\nGet me unexpected statement', '1\nWrite a not so happy quote\ni no longer feel depressed and am not mad or haven t yet a href http www\n\n2\nCan you give to me a scarred piece of text', 'a.\nGet me a not glad citation\ni feel like i shouldnt bother people with these petty stupid little pathetic thoughts i feel like no one really would care to know what really goes on inside my head\n\nb.\nWrite a surpised sentence\ni wander into the depths of the markets because i m feeling curious\n\nc.\nPrint a glad statement\ni feel like the thing that i call an artistic tendency in myself is really just laziness and narcissism justifying and strengthening each other\n\nd.\nCreate unexpected quote'] +99997 items
    label:
        list: ['i am a boy i like girls they are pretty and i like it when they smile at me but it makes me feel funny', 'i notic

In [120]:
# 18. Save our dataset
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_emotion_dataset")

### Using DatasetsGenerator 

In [121]:
# Labels for our dataset
labels:tuple = ("sad", "joy", "love", "anger", "fear", "surprise")
# Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "sad": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "joy": ('a happy', 'a joyful', 'a glad', 'a delightful', 'a', 'a gleefull'),
        "love": ('a', ),
        "anger": ('a sad', 'an unhappy', 'a depressed', 'a sorrowful', 'a', 'a not so happy', 'an unglad', 'a not glad', 'melancholic'),
        "fear": ('a scarred', 'a fearful', 'a'),
        "surprise": ('a surpised', 'unexpected', 'a')
    },
    "part3": (' sentence', ' statement', ' piece of text', ' quote', ' citation')
}
# Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Instanciate DatasetLoader object
dataset_loader = DatasetLoader()
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="emotion", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_loader.SaveDataset(dataset_dictionary, file_name="emotion_dataset")
sorted_dataset:dict = dataset_loader.SortDictionary(dataset_loader.dataset_dictionary)

my_complete_dataset_object = DatasetsGenerator(sorted_dataset=sorted_dataset, few_shot_range=(0,3), dataset_training_size=10_000, dataset_other_size=1_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_emotion_dataset")

Using custom data configuration default
Reusing dataset emotion (/root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

train:
    text:
        list: ['a: This is unexpected sentence\nim feeling is funny because its totally unnecessary\nb: Make a gleefull statement\ni definitely feel he should get a title supporting and the picture for once\nc: Can you give to me a fearful statement\ni feel so like distraught and lost being there\nd: Show me a glad quote\ni threw open my windows for minutes and then we were all freezing so i had to shut them and sat back and enjoyed that feeling of tranquility that only comes in those few minutes precious minutes when everything is spotlessly in order\ne: Create unexpected citation\ni really want to go buy some yardage of art gallery just to play with because it feels so amazing\nf: Can you give to me a statement\ni immediately related to feeling curious about everything\ng: Write unexpected statement\ni shrugged not feeling particularly enthralled about the educational tour and feeling guilty that i would prefer to stay at home and play house\nh: May I a have a depres

## Creating a procedurally generated dataset from [ag_news](https://huggingface.co/datasets/ag_news) dataset

### From scratch

In [123]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [125]:
# 2. Define labels
labels:tuple = ("World", "Sports", "Business", "Sci/Tech")
# 3. Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": ('a article ', 'a news article '),
    "part3": ('regarding ', 'containing ', 'about '),
    "part4": {
        "World": ('world', 'worlds', 'anything', 'something'),
        "Sports": ('sports', 'sport', 'anything', 'something'),
        "Business": ('business', 'anything', 'something'),
        "Sci/Tech": ('sci/tech', 'sci', 'tech', 'anything', 'something')
    }
}
# 4. Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 5. Instanciate DatasetLoader object
dataset_loader = DatasetLoader()

# 5. Load dataset from huggingface
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="ag_news", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_dictionary_sorted:dict = dataset_loader.SortDictionary(dataset_dictionary=dataset_dictionary)

# 6. Save it
dataset_loader.SaveDataset(dataset_dictionary, file_name="ag_news_dataset")

# 7. Print parts of dataset
DatasetLoader.PrintDataset(dataset_dictionary["datasets"])

Downloading:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/751k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

train:
    text:
        list: ['Business', 'Business', 'Business'] +119997 items
    label:
        list: ["Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."] +119997 items
test:
    text:
        list: ['Business', 'Sci/Tech', 'Sci/Tech'] +7597 items
    label:
        list: ["Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' af

In [126]:
# 8. Instanciate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.dataset_dictionary["labels"], convert_label_to_index=False)

# 9. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

('Reveal a article about something', 'World')
('Display a news article about business', 'Business')
('Can you give to me a news article about sci', 'Sci/Tech')
('Get me a article containing something', 'Sports')
('Make a article containing sport', 'Sports')


In [128]:
# 10. Instanciate PromptDatasetPair objects
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["test"], prompt_creator=prompt_creator)

# 11. Check to see that it is working
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))

('Make a article containing anything', 'Ward represents US boxing team #39;s lone chance for gold The underperforming US boxing team lost one of its two Andres Friday when middleweight Dirrell lost his semifinal bout to Kazakhstan #39;s Gennadiy Golovkin, falling out on points, 23-18.')
('Write a article containing world', 'Israel Announces West Bank Housing Plans (AP) AP - Israel announced plans Monday for 500 new housing units in the West Bank, after an apparent U.S. policy shift that has infuriated the Palestinians. The Palestinians oppose all Jewish settlement in the West Bank and Gaza Strip, lands where they hope to establish an independent state.')


In [129]:
# 12. Instanciate FewShotGenerator object
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))

# 13. Check to see that it is working
print(next(few_shot_object_train))
print(next(few_shot_object_test))

('A: Make a news article about something\nHP to lead consumer launch with branded iPod Friday product launch to put music player at forefront of effort to make once-stodgy printer manufacturer "cool."\n\nB: Reveal a article regarding worlds\nGeneral Blames Prison Abuse on Commanders BAGHDAD, Iraq - The Army general who once ran detention operations in Iraq said a "conspiracy" among top U.S. commanders has left her to blame for the abuses of Iraqi inmates at Abu Ghraib prison...\n\nC: Make a article about anything', 'Afghanistan #39;s Karzai in Pakistan for terror talks Afghan President Hamid Karzai began a visit to neighbouring Pakistan on Monday to discuss cooperation in the battle against Islamic militants ahead of his October re-election bid.')


In [130]:
# 14. Instanciate DatasetGenerator objects set to different magnitude of datapoints
dataset_creator_train = DatasetGenerator(few_shot_object_train(100_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(1_000))

# 15. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()

# 16. Concatinate into one dictionary
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test
}

# 17. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

train:
    text:
        list: ['a.\nMay I a have a article regarding sport\nUAE joins Olympics gold medal winners UAE joins Olympics gold medal winners ABU DHABI, 18 Aug. 04 (WAM) -- A UAE paper said today that Sheikh Ahmed bin Hashar Al Maktoum #39;s winning of the gold medal in double trap shooting had taken the UAE to the pinnacle of sporting glory at the Olympics. \nb.\nShow me a article containing anything', 'a. Can you give to me a article regarding business', 'a: Print a article regarding sport'] +99997 items
    label:
        list: ['Munich to press on with Linux switch More than a year after the German city of Munich declared its landmark intention to abandon Microsoft Windows in favor of upstart rival Linux, its councilors are finally ready to go ahead with the plan.', 'Wrigley to Move Phoenix Production Chewing gum giant Wm. Wrigley Jr. Co. on Thursday said it plans to phase out production of its Eclipse breath strips at a plant in Phoenix, Arizona and shift manufacturing 

In [131]:
# 18. Save our dataset
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_ag_news_dataset")

### Using DatasetsGenerator 

In [140]:
# Labels for our dataset
labels:tuple = ("World", "Sports", "Business", "Sci/Tech")
# Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": ('a article ', 'a news article '),
    "part3": ('regarding ', 'containing ', 'about '),
    "part4": {
        "World": ('world', 'worlds', 'anything', 'something'),
        "Sports": ('sports', 'sport', 'anything', 'something'),
        "Business": ('business', 'anything', 'something'),
        "Sci/Tech": ('sci/tech', 'sci', 'tech', 'anything', 'something')
    }
}
# Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Instanciate DatasetLoader object
dataset_loader = DatasetLoader()
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="ag_news", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_loader.SaveDataset(dataset_dictionary, file_name="ag_news_dataset")
sorted_dataset:dict = dataset_loader.SortDictionary(dataset_loader.dataset_dictionary)

my_complete_dataset_object = DatasetsGenerator(sorted_dataset=sorted_dataset, few_shot_range=(0,3), dataset_training_size=10_000, dataset_other_size=1_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_ag_news_dataset")

Using custom data configuration default
Reusing dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

train:
    text:
        list: ["1: Create a article containing something\nBuilding permits increase 5.7, reversing June downturn WASHINGTON (CBS.MW) -- Construction of new homes recovered in July, as US homebuilders started homes at a seasonally adjusted annual rate of 1.978 million, the Commerce Department said Tuesday. \n2: Reveal a article containing anything\nHEADLINE STOCKS: Stocks to Watch, Nov. 15  NEW YORK (Reuters) - U.S. stocks to watch on Monday include  Family Dollar Stores Inc., Staples Inc., Microsoft Corp., and  General Electric Co.  \n3: Could i have a article regarding sport\nSoderling upsets Spadea to reach Lyon Open final Sweden #39;s Robin Soderling beat seventh-seeded American Vince Spadea 6-2, 6-4 Saturday to reach the final of the Lyon Open. Soderling will play Belgium #39;s Xavier Malisse in Sunday #39;s final.\n4: Reveal a news article containing sci/tech\nSpace Radiation May Harm Astronauts' Blood Cells In the time it takes you to read this sentence, more tha

## Creating a procedurally generated dataset from [yelp_polarity](https://huggingface.co/datasets/yelp_polarity) dataset

### From scratch

In [133]:
# 1. Install all requirements, import all modules and define all classes
# Done above

In [134]:
# 2. Define labels
labels:tuple = ("Negative", "Positive")
# 3. Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "Negative": ('a negative ', ),
        "Positive": ('a positive ', ),
    },
    "part3": ('review', ),
}
# 4. Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# 5. Instanciate DatasetLoader object
dataset_loader = DatasetLoader()

# 5. Load dataset from huggingface
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="yelp_polarity", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_dictionary_sorted:dict = dataset_loader.SortDictionary(dataset_dictionary=dataset_dictionary)

# 6. Save it
dataset_loader.SaveDataset(dataset_dictionary, file_name="yelp_polarity_dataset")

# 7. Print parts of dataset
DatasetLoader.PrintDataset(dataset_dictionary["datasets"])

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_polarity/plain_text (download: 158.67 MiB, generated: 421.07 MiB, post-processed: Unknown size, total: 579.73 MiB) to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544...


Downloading:   0%|          | 0.00/166M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset yelp_polarity downloaded and prepared to /root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

train:
    text:
        list: ['Negative', 'Positive', 'Negative'] +559997 items
    label:
        list: ["Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.", "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I

In [135]:
# 8. Instanciate PromptCreator object
prompt_creator = PromptCreator(prompt_structure=prompt_structure, labels=dataset_loader.dataset_dictionary["labels"], convert_label_to_index=False)

# 9. Check to see that it is working
for random_prompt in prompt_creator(5):
    print(random_prompt)

('Show me a positive review', 'Positive')
('Make a negative review', 'Negative')
('Reveal a positive review', 'Positive')
('May I a have a negative review', 'Negative')
('Create a positive review', 'Positive')


In [136]:
# 10. Instanciate PromptDatasetPair objects
prompt_dataset_pair_object_train = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["train"], prompt_creator=prompt_creator)
prompt_dataset_pair_object_test = PromptDatasetPair(dataset=dataset_dictionary_sorted["datasets"]["test"], prompt_creator=prompt_creator)

# 11. Check to see that it is working
print(next(prompt_dataset_pair_object_train))
print(next(prompt_dataset_pair_object_test))

('Create a negative review', "I have been a member of this gym since it opened. I used to really like this gym, but am seriously considering switching to the new LVAC opening on Decatur and the 215. Every summer the A/C breaks down. Lately, the Cardio Theatre treadmills have been out of commission and the gym itself has been way too hot. I'm surprised they actually have the stair masters fixed correctly. They are often also out of paper towels and disinfectant spray. The ladies' restroom stalls can be very messy on Sunday's too. While before I would recommend this gym to anyone in the North Las Vegas area, I cannot say the same now.")
('Create a negative review', "This the worse lab and office staff , you wait hrs for them to take you back . My tax money go for lab crop to bump firemen to be the front . The office staff are very rude . Don't go there")


In [137]:
# 12. Instanciate FewShotGenerator object
few_shot_object_train = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_train, list_structure=list_structure, few_shot_range=(0,3))
few_shot_object_test = FewShotGenerator(prompt_dataset_pair=prompt_dataset_pair_object_test, list_structure=list_structure, few_shot_range=(0,3))

# 13. Check to see that it is working
print(next(few_shot_object_train))
print(next(few_shot_object_test))

("1. May I a have a negative review\nI went by Texas BBQ House today, and it appears they have closed down already. I thought the extra lean brisket there was decent, but not as good as Rudy's, which I recommend as an alternative. One thing that rankled me about Texas BBQ House was the high price on the stuffed baked potato. Another issue was the parking. Finally, the dining area was very cramped. I've never been to the South Phoenix location, but if I'm going to drive that far, I would go to Rudy's instead.\n\n2. Create a positive review\nGreat food. The price is a little high for they type of food but a good meal\n\n3. May I a have a positive review", "This is a great comic shop. It is family friendly and usually has some kind of event going on every so many months. They are very loyal to their customers and provide a wide variety of comics to choose from. If you have any questions they're happy to help and if you're looking for suggestions they usually have a good one! The only draw

In [138]:
# 14. Instanciate DatasetGenerator objects set to different magnitude of datapoints
dataset_creator_train = DatasetGenerator(few_shot_object_train(100_000))
dataset_creator_test = DatasetGenerator(few_shot_object_test(1_000))

# 15. Create datasets
procedual_dataset_train:dict = dataset_creator_train.Create()
procedual_dataset_test:dict = dataset_creator_test.Create()

# 16. Concatinate into one dictionary
my_dataset:dict = {
    "train": procedual_dataset_train,
    "test": procedual_dataset_test
}

# 17. Print parts of our dataset
DatasetLoader.PrintDataset(my_dataset)

train:
    text:
        list: ['a:\nGet me a positive review', "a. Could i have a negative review\nIn fact the service for my order was so damn bad, i decided to register to yelp and write a review for all to know...\\n\\nFirst time i order from dominos online, i get this store, place my order, pay online and everything is does fine...\\n\\nExcept it's been over an hour and i'm still waiting for my order to arrive, according to dominos online order tracker, my order has been out for delivery for 64 minutes.... \\nWhat kind of service is that, i'm currently on the phone to complain with the manager, and guess what, it's been 9 minutes since my called was answered by their system, and i can't even speak to a human, all they got is a recording of all their specials... \\n\\nUPDATE: just hanged up with the store's manager told me to call the general manager at the store in Sahara and Valley View... called there, and their manager shouted to me on the phone, you can tell they are having a 

In [139]:
# 18. Save our dataset
DatasetLoader.SaveDataset(dataset_dictionary=my_dataset, file_name="my_yelp_polarity_dataset")

### Using DatasetsGenerator 

In [142]:
# Labels for our dataset
labels:tuple = ("Negative", "Positive")
# Holds structure for prompts
prompt_structure:dict = {
    "part1": ('Write ', 'Make ', 'This is ', 'Create ', 'Show me ', 'Display ', 'Print ', 'Reveal ', 'Reaveal to me ', 'Get me ', 'May I a have ', 'Could i have ', 'Can you give to me ', 'Please show me '),
    "part2": {
        "Negative": ('a negative ', ),
        "Positive": ('a positive ', ),
    },
    "part3": ('review', ),
}
# Holds structure for how prompts should be listed
list_structure:dict = {
    "index": (("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"), ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"), ("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")),
    "start": (". ", ".\n", "\n", ": ", ":\n"),
    "middle": ("\n", ),
    "end": ("\n", "\n\n")
}

# Instanciate DatasetLoader object
dataset_loader = DatasetLoader()
dataset_dictionary:dict = dataset_loader.CreateDictionaryFromDataset(dataset_name="yelp_polarity", labels=labels, prompt_structure=prompt_structure, list_structure=list_structure)
dataset_loader.SaveDataset(dataset_dictionary, file_name="yelp_polarity_dataset")
sorted_dataset:dict = dataset_loader.SortDictionary(dataset_loader.dataset_dictionary)

my_complete_dataset_object = DatasetsGenerator(sorted_dataset=sorted_dataset, few_shot_range=(0,3), dataset_training_size=10_000, dataset_other_size=1_000)
my_complete_dataset:dict = my_complete_dataset_object.Create()

DatasetLoader.PrintDataset(my_complete_dataset)
DatasetLoader.SaveDataset(dataset_dictionary=my_complete_dataset, file_name="my_yelp_polarity_dataset")

Reusing dataset yelp_polarity (/root/.cache/huggingface/datasets/yelp_polarity/plain_text/1.0.0/a770787b2526bdcbfc29ac2d9beb8e820fbc15a03afd3ebc4fb9d8529de57544)


  0%|          | 0/2 [00:00<?, ?it/s]

train:
    text:
        list: ['a. Write a negative review\nScam. Like the other reviewers company did not ship product and I was forced to fill out a form with a significant amount if personal info including bday, phone, address, etc. I complained to amazon who claims they will be taking the vendor off their site but well see. Also check out bbb.org to see their 24 (at the time of writing this) complaints over the last several years. Amazon local, living social, groupon and the other places I\'ve seen this company on are sites that sell coupons for company\'s, so what the hell does this company do? They sell coupons to the company\'s that sell coupons.\n\nb. Get me a positive review\nStayed here when I needed somewhere affordable and not necessarily on the strip.  The rooms were nice and clean.  They were definitely worth the price.  My group and I even had a lot of fun in the casino.  The room had a minor maintenance issue during my stay but the staff was very responsive and had it 