In [None]:
!pip install -q datasets
!pip install -q accelerate peft==0.4.0 bitsandbytes==0.40.2 transformers trl==0.4.7
!pip install requests nlpaug sentencepiece sacremoses nltk>=3.4.5

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import math
from tqdm.notebook import tqdm
import random
import nlpaug.augmenter.word as naw
import os

# Set the OpenMP thread-count variable to 1 for this process.
# NOTE(review): numpy and pandas are already imported above; confirm the variable
# is still picked up by the libraries it is meant to limit when set at this point.
os.environ["OMP_NUM_THREADS"] = "1"

# Create Augmentation objects

In [None]:
# WordNet-backed synonym replacement
synonym_aug = naw.SynonymAug(aug_src='wordnet')

# Back translation through the opus-mt en-de / de-en model pair
back_translation_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-de',
    to_model_name='Helsinki-NLP/opus-mt-de-en',
)

# Contextual word substitution driven by roberta-base
bert_aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")

# Pool from which one augmenter is drawn at random for every generated sentence
augmenters = [synonym_aug, back_translation_aug, bert_aug]

# Quick check: run a randomly chosen augmenter on a toy sentence
random.choice(augmenters).augment('I am happy')

In [None]:
# Hugging Face token used for the upload. It is read from the HF_TOKEN environment
# variable when set, so the real token does not have to be written into the notebook;
# the placeholder below is only the fallback.
HF_TOKEN = os.environ.get("HF_TOKEN", "<your-hf-token>")

# List of available datasets
DATASET_LIST = ['atis', 'massive', 'banking', 'snips']

# Pct of sampled data. If used with the baseline type, this is the percentage used
# If used with Full, it will be ignored
# If used with any other type, it will be used to fetch the dataset from the repo.
# Currently only available 1, 3 and 5 pct
PCT_LIST = [1, 3, 5]

# Number of consecutive augmentation attempts that may fail to produce a new sentence
# before a class is considered exhausted. Without a bound the generation loop below
# never terminates when the augmenters cannot yield enough distinct sentences.
MAX_STALLED_ATTEMPTS = 1000

# For every dataset / percentage / version: keep a per-class sample of the real
# training data, top each class back up to its original size with augmented
# sentences, and push the result to the Hub.
for DATASET_NAME in DATASET_LIST:
  ds = load_dataset(f"benayas/{DATASET_NAME}")
  df = ds['train'].to_pandas()
  for PCT in PCT_LIST:
    for VERSION in [0, 1, 2]:
      DATASET_ID = f"{DATASET_NAME}_traditional_{PCT}pct_v{VERSION}"
      print(f"DATASET: {DATASET_NAME}, VERSION: {VERSION}, PCT: {PCT}")

      # Per-class set of generated sentences, and the flat records built from them
      examples = {}
      artificial_records = []

      RATIO = PCT/100
      # Seed both generators: numpy's global state and Python's `random`, which
      # drives random.choice(augmenters) below.
      np.random.seed(VERSION)
      random.seed(VERSION)

      # Extract a sample per class (at least one row per class because of ceil)
      df_sample = []

      for name, g in df.groupby('category'):
        n = math.ceil(len(g)*RATIO)
        sample = g.sample(n, random_state=VERSION+200)
        df_sample.append(sample)

      df_sample = pd.concat(df_sample)

      # Calculate how many are needed: original class size minus the sampled rows
      required_examples = {}
      for name, g in df_sample.groupby('category'):
        required_examples[name] = len(df[df['category']==name]) - len(g)

      for name, g in tqdm(df_sample.groupby('category')):
        generated = examples.setdefault(name, set())
        target = required_examples[name]
        stalled = 0

        with tqdm(total=target) as pbar:
          while len(generated) < target:
            if stalled >= MAX_STALLED_ATTEMPTS:
              raise RuntimeError(
                  f"{DATASET_ID}: could not generate {target} distinct examples for "
                  f"category {name!r} (stuck at {len(generated)} after "
                  f"{MAX_STALLED_ATTEMPTS} consecutive unproductive attempts)")

            # Draw one real sentence of this class to augment
            text = g.sample(1)['utt'].iloc[0]

            # Select augmenter
            augmenter = random.choice(augmenters)

            # Generate augmented text
            new_text = augmenter.augment(text, num_thread=1)[0]

            # Reject unchanged sentences and ones already generated for this class
            if new_text == text or new_text in generated:
              stalled += 1
              continue

            generated.add(new_text)
            stalled = 0
            pbar.update(1)

        # Put the data in a different format for pandas
        # NOTE(review): generated rows are stored under 'text' while the source
        # sentences are read from 'utt'; unless the source frame also has a 'text'
        # column, the concatenated frame holds NaN in one of the two columns for
        # every row. Confirm which column consumers expect before changing it.
        artificial_records.extend({'text': s, 'category': name} for s in generated)

      # Add the sampled data into the dataframe
      df_artificial = pd.concat([df_sample, pd.DataFrame(artificial_records)])

      # Upload dataset if everything went ok
      ds_artificial = Dataset.from_pandas(df_artificial)
      ds_artificial.push_to_hub(DATASET_ID, token=HF_TOKEN)
