In [None]:
!pip install -q datasets
!pip install openai

In [2]:
import math
import os
import re

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from openai import OpenAI
from tqdm.notebook import tqdm
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the placeholder is kept as the fallback value.
api_key = os.environ.get("OPENAI_API_KEY", "<your Open AI key>")

### Auxiliary functions


In [3]:
# Auxiliary functions

def generate_prompt(class_name, examples=None, n_requested=None, utterance_field='text'):
  """Build the chat messages that ask the model for new utterances of one class.

  Args:
    class_name: Name of the intent class; interpolated into the prompt text.
    examples: Optional pandas DataFrame of seed utterances. Each row's
      `utterance_field` value is listed in the prompt as "Sentence: <value>".
    n_requested: How many new examples to ask for. When omitted, defaults to
      the number of rows in `examples`.
    utterance_field: Column of `examples` that holds the utterance text.

  Returns:
    A list with two message dicts, the system message followed by the user
    message, each with "role" and "content" keys.

  Raises:
    ValueError: If both `examples` and `n_requested` are None, since there is
      then no way to decide how many examples to request.
  """
  if n_requested is None:
    # Previously this fell through to len(None) and died with an opaque TypeError.
    if examples is None:
      raise ValueError("n_requested must be provided when examples is None")
    n_requested = len(examples)

  messages = [
    {"role":"system", "content": "I am training an Intent Classifier model. I need to generate more training data"},
  ]

  header = f"Here is a list of examples belonging to class {class_name}"

  # With no seed examples the middle section is empty, which leaves a blank
  # line between the header and the request.
  examples_text = ""
  if examples is not None:
    examples_text = '\n'.join(
      f"Sentence: {row[utterance_field]}" for _, row in examples.iterrows()
    )

  request = f"Generate {n_requested} more examples for the class {class_name}"
  messages.append({"role":"user", "content": '\n'.join([header, examples_text, request])})
  return messages


def extract(text):
  """Extract the generated sentences from a raw model response.

  Three formats are tried in order:
    1. Lines containing the marker "Sentence" (the format used in the prompt);
       everything after the marker and its one-character separator is kept.
    2. If no such line yields text: every double-quoted string in the response.
    3. If fewer than two sentences were found so far: numbered-list items
       ("1. ...", "2. ..."), which REPLACE whatever was found before.

  Args:
    text: The response text returned by the model.

  Returns:
    A list of sentences with surrounding whitespace removed and empty entries
    dropped. The list is empty when nothing could be extracted.
  """

  def _clean(candidates):
    # Strip whitespace (including a stray '\r') and drop blank entries so they
    # never end up as training examples.
    stripped = (c.strip() for c in candidates)
    return [c for c in stripped if c]

  marked = []
  for line in text.split('\n'):
    i = line.find('Sentence')
    if i == -1:
      continue
    # Skip the 8-character marker plus the separator character that follows it.
    marked.append(line[i+9:])
  extraction = _clean(marked)

  if len(extraction) == 0:
    extraction = _clean(re.findall(r'"([^"]*)"', text))

  if len(extraction) < 2:
    # Capture the text after the "<digits>." prefix instead of slicing a fixed
    # three characters, which truncated items written as "1.foo". Only spaces
    # and tabs are skipped after the dot so a match never runs into the next line.
    extraction = _clean(re.findall(r'\d+\.[ \t]*(.*)', text))

  return extraction

In [None]:
# One-off repair of already-published datasets: rows whose `text` is missing
# take their value from `utt`, then only `category` and `text` are kept and the
# result is pushed back to the same repo it was loaded from.
for VERSION in [2]:
  for PCT in [3]:
    name = f"benayas/massive_chatgpt_{PCT}pct_v{str(VERSION)}"
    print(name)
    frame = load_dataset(name)['train'].to_pandas()
    # Row-wise back-fill: keep `text` unless it is None, else fall back to `utt`.
    frame['text'] = [
      utt if text is None else text
      for text, utt in zip(frame['text'], frame['utt'])
    ]
    ds = Dataset.from_pandas(frame[['category','text']])
    ds.push_to_hub(name)
# Display the last dataset that was processed.
ds

In [None]:
# Builds, for every dataset / version / percentage, a training set made of a
# small real sample per class plus model-generated utterances that bring each
# class back to its original size, then uploads it to the Hugging Face Hub.

# Prefer the HF_TOKEN environment variable so the secret is never saved in the
# notebook; the placeholder is kept as the fallback value.
HF_TOKEN = os.environ.get("HF_TOKEN", "<your-hf-token>")
# Maximum number of seed utterances shown to the model in one call.
MAX_PER_CALL = 10
# Each call requests FACTOR new utterances per seed utterance shown.
FACTOR = 20
# Consecutive API calls that may add nothing new for a class before giving up.
# Without this cap, unparseable or repeated responses would loop (and bill) forever.
MAX_STALLED_CALLS = 20
client = OpenAI(api_key = api_key)

# List of available datasets
DATASET_LIST = ['atis', 'massive', 'banking', 'snips']

# Pct of sampled data. If used with the baseline type, this is the percentage used
# If used with Full, it will be ignored
# If used with any other type, it will be used to fetch the dataset from the repo.
# Currently only available 1, 3 and 5 pct
PCT_LIST = [1, 3, 5]

for DATASET_NAME in DATASET_LIST:
  ds = load_dataset(f"benayas/{DATASET_NAME}")
  df = ds['train'].to_pandas()
  df['text'] = df['utt']
  df = df[['category','text']]
  for VERSION in [0,1,2]:
      for PCT in PCT_LIST:
          # NOTE(review): the repo id says "llama2" but the generations below come
          # from gpt-3.5-turbo, and another cell of this notebook works on
          # `massive_chatgpt_*` repos -- confirm this is the intended upload target.
          DATASET_ID = f"{DATASET_NAME}_llama2_{PCT}pct_v{VERSION}"
          print(f"DATASET: {DATASET_NAME}, VERSION: {VERSION}, PCT: {PCT}")

          examples = {}
          df_artificial = []

          RATIO = PCT/100
          # VERSION doubles as the random seed for this run.
          np.random.seed(VERSION)

          # Extract a sample per class - baseline
          df_sample = []

          for name, g in df.groupby('category'):
              # ceil guarantees at least one seed utterance for every class
              n = math.ceil(len(g)*RATIO)
              sample = g.sample(n, random_state=VERSION)
              df_sample.append(sample)

          df_sample = pd.concat(df_sample)

          # Calculate how many are needed per class
          # (original class size minus the real utterances kept in the sample)
          required_examples = {}
          for name, g in df_sample.groupby('category'):
              required_examples[name] = len(df[df['category']==name]) - len(g)

          # Generate examples
          for name, g in tqdm(df_sample.groupby('category')):
              if name not in examples:
                  # A set, so repeated generations are stored only once
                  examples[name] = set()

              stalled_calls = 0
              with tqdm(total=required_examples[name]) as pbar:
                  while len(examples[name]) < required_examples[name]:
                      if stalled_calls >= MAX_STALLED_CALLS:
                          # Abort before the upload step rather than spin forever
                          # or publish an incomplete dataset.
                          raise RuntimeError(
                              f"No new examples for class {name} after "
                              f"{MAX_STALLED_CALLS} consecutive calls "
                              f"({len(examples[name])}/{required_examples[name]} collected)"
                          )

                      # extract a random subsample
                      df_tmp = g.sample(min(MAX_PER_CALL, len(g)))

                      # Generate a prompt with the subsample
                      prompt = generate_prompt(name, df_tmp, len(df_tmp)*FACTOR)

                      # Call model to generate example
                      response = client.chat.completions.create(
                                    model="gpt-3.5-turbo",
                                    messages=prompt
                                ).choices[0].message.content

                      # Extract the example from the response
                      sentences = extract(response)

                      # Add the example to the set of examples if still not reached the required number
                      n_update = 0
                      for s in sentences:
                          if len(examples[name]) < required_examples[name] and s not in examples[name]:
                              examples[name].add(s)
                              n_update += 1

                      if n_update == 0:
                          # Nothing parseable or nothing new: count the wasted call and retry
                          stalled_calls += 1
                          continue

                      stalled_calls = 0
                      pbar.update(n_update)

              # Put the data in a different format for pandas
              for s in examples[name]:
                  df_artificial.append({'text': s, 'category': name})

          # Add the sampled data into the dataframe.
          # ignore_index gives a fresh RangeIndex; otherwise the sampled rows keep
          # their original labels and Dataset.from_pandas stores that index as an
          # extra `__index_level_0__` column in the uploaded dataset.
          df_artificial = pd.concat([df_sample, pd.DataFrame(df_artificial)], ignore_index=True)

          # Upload dataset if everything went ok
          ds_artificial = Dataset.from_pandas(df_artificial)
          ds_artificial.push_to_hub(DATASET_ID, token=HF_TOKEN)