# Imports

In [None]:
! git clone https://github.com/ai-forever/sage.git

In [None]:
cd sage

In [None]:
# change to pip install -e ".[errant]" in case of zsh

! pip install .
! pip install -e .[errant]

In [13]:
import os
from sage.pipeline import PipelineConfig
from sage.pipeline import AugmentationPipeline
from sage.utils import DatasetsAvailable

# Basic Use

Creates an augmentation pipeline that automatically adds and shuffles all available augmentors without manual configuration.

In [None]:
# Text to corrupt.
sample_text = "Заметьте, не я это предложил!"

# Build the pipeline from an untouched, default configuration.
pipeline_config = PipelineConfig()
pipeline = AugmentationPipeline(config=pipeline_config)

# A fixed seed is passed so the call can be repeated with the same arguments.
augmented_text = pipeline.augment(sample_text, seed=1)
print(augmented_text)

  0%|          | 0/1054 [00:00<?, ?it/s]

In [2]:
# Text to corrupt.
sample_text = "Screw you guys, I am going home. (c)"

# Same as the default example, but the configuration is created for English ('en').
pipeline_config = PipelineConfig('en')
pipeline = AugmentationPipeline(config=pipeline_config)

# A fixed seed is passed so the call can be repeated with the same arguments.
augmented_text = pipeline.augment(sample_text, seed=1)
print(augmented_text)

  0%|          | 0/1601 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

home. you kuys, I c) goingScrew m


# Custom Use

### Russian

Creates a pipeline configuration, manually sets parameters for the character augmenter and the SBS corruptor, creates an AugmentationPipeline with custom settings and disabled shuffling, and manually adds only the necessary augmenters (character and SBS corruptor).

In [4]:
# Start from a default configuration and override only what this example needs:
# the character-level parameters and the SBS corruptor's language / dataset.
pipeline_config = PipelineConfig()
pipeline_config.set_char_params(min_aug=2, max_aug=4, unit_prob=0.3)
pipeline_config.set_sbsc_params(
    lang="ru",
    dataset_name_or_path=DatasetsAvailable.MedSpellchecker.name,
    dataset_split="test",
)

# Shuffling is switched off because the order is set by hand below.
pipeline = AugmentationPipeline(config=pipeline_config, shuffle=False)

# Register only the two augmenters used here: character first, SBS corruptor second.
pipeline.add_char_augmenter()
pipeline.add_sbsc_augmenter()

# Explicit order: the SBS corruptor (1) is applied first, then the character augmenter (0).
pipeline.set_order([1, 0])

sample_text = "Заметьте, не я это предложил!"
augmented_text = pipeline.augment(sample_text, seed=1)
print(augmented_text)

  0%|          | 0/1054 [00:00<?, ?it/s]

  0%|          | 0/1054 [00:00<?, ?it/s]

Зметьте, не педложи! этоя


### English

In [12]:
# Create the configuration for English, matching the basic English example.
# NOTE(review): this cell previously called PipelineConfig() with no language,
# while only the word and SBS-corruptor parameters are overridden below, so the
# character augmenter relied on the configuration's defaults. Confirm against
# the sage API that 'en' is the intended setting for this example.
pipeline_config = PipelineConfig('en')

# Override only the word-level parameters and the SBS corruptor's language / dataset.
pipeline_config.set_word_params(min_aug=1, max_aug=3, unit_prob=0.3)
pipeline_config.set_sbsc_params(
    lang="en",
    # Path is relative to the repository root (the notebook changes into `sage` earlier).
    dataset_name_or_path=os.path.join("data", "example_data", "jfleg"),
    dataset_split="test",
)

# Shuffling is switched off; no explicit order is set in this example.
pipeline = AugmentationPipeline(config=pipeline_config, shuffle=False)

# Register the augmenters: character, word, then SBS corruptor.
pipeline.add_char_augmenter()
pipeline.add_word_augmenter()
pipeline.add_sbsc_augmenter()

text = "Screw you guys, I am going home. (c)"
augmented_text = pipeline.augment(text, seed=1)
print(augmented_text)

  0%|          | 0/1601 [00:00<?, ?it/s]

  0%|          | 0/1601 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

ome. yau kuys, goin I() Screwm
