In [None]:
pip install transformers pandas openpyxl sacremoses datasets torch

In [None]:
import pandas as pd
import random
import time
from transformers import MarianMTModel, MarianTokenizer

# Pivot-language codes for the round-trip translation. Each code is
# substituted into the checkpoint names
# 'Helsinki-NLP/opus-mt-en-{code}' and 'Helsinki-NLP/opus-mt-{code}-en',
# so a checkpoint must exist in both directions for the code to be usable.
# NOTE(review): "grk" and "jap" are not two-letter language codes like the
# others; "grk" is presumably an OPUS-MT language-group checkpoint, and such
# multi-target checkpoints may expect a ">>lang<<" target prefix on the input
# text -- confirm against the model cards before relying on their output.
languages = [
    "es", "ru", "ar", "fr", "az", "zh", "af", "bg", "ca", "da", "de", "fi",
    "gl", "grk", "hu", "it", "jap"
    # Currently disabled codes:
    # , "mul", "nl", "ro", "roa", "sv", "sw", "tl", "trk", "uk", "ur", "vi"
    ]

# Preload the English <-> pivot-language checkpoints for every language code
def load_all_translation_models(languages):
    """Load both translation directions for each language code.

    For every code, the 'Helsinki-NLP/opus-mt-en-{code}' and
    'Helsinki-NLP/opus-mt-{code}-en' checkpoints are loaded. A language is
    only registered when all four objects (two models, two tokenizers) load;
    otherwise the error is printed and the language is left out.

    Args:
        languages: Iterable of language codes to load.

    Returns:
        Dict mapping each successfully loaded code to
        {"en_to_random": (model, tokenizer), "random_to_en": (model, tokenizer)}.
    """
    def _load_pair(checkpoint):
        # Model first, tokenizer second -- same order for both directions.
        model = MarianMTModel.from_pretrained(checkpoint)
        tokenizer = MarianTokenizer.from_pretrained(checkpoint)
        return (model, tokenizer)

    loaded = {}
    for code in languages:
        try:
            outbound = _load_pair(f'Helsinki-NLP/opus-mt-en-{code}')
            inbound = _load_pair(f'Helsinki-NLP/opus-mt-{code}-en')
        except Exception as err:
            # Best effort: report the failure and move on to the next code.
            print(f"Error loading models for language {code}: {err}")
        else:
            loaded[code] = {
                "en_to_random": outbound,
                "random_to_en": inbound,
            }
            print(f"Loaded models for language: {code}")
    return loaded

# Eagerly load every English <-> pivot-language pair
print("Loading translation models...")
language_models = load_all_translation_models(languages)

# Indonesian → English checkpoint (first hop of the pipeline).
# Both names stay None when either the model or the tokenizer fails to load.
print("Loading Indonesian → English model...")
id_to_en_model = id_to_en_tokenizer = None
try:
    id_to_en_model, id_to_en_tokenizer = (
        MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-id-en'),
        MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-id-en'),
    )
except Exception as err:
    print(f"Error loading Indonesian → English model: {err}")
else:
    print("Indonesian → English model loaded successfully.")

# English → Indonesian checkpoint (last hop of the pipeline).
# Same all-or-nothing assignment as above.
print("Loading English → Indonesian model...")
back_translation_model = back_translation_tokenizer = None
try:
    back_translation_model, back_translation_tokenizer = (
        MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-id'),
        MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-id'),
    )
except Exception as err:
    print(f"Error loading English → Indonesian model: {err}")
else:
    print("English → Indonesian model loaded successfully.")

# Translate text using preloaded models
def _run_translation(text, model, tokenizer):
    """Translate one string with a model/tokenizer pair and return the decoded text."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated = model.generate(**encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def translate_random_language_with_preloaded_models(text, max_words=100):
    """Paraphrase an Indonesian text through a random pivot language.

    The text goes Indonesian → English → random pivot language → English →
    Indonesian, using the module-level preloaded models. The pivot language
    is drawn with random.choice from the keys of `language_models`.

    Args:
        text: Indonesian source text, passed to the tokenizer as-is.
        max_words: Upper bound on the number of whitespace-separated words
            kept in the result (default 100, the original hard-coded cap).

    Returns:
        The stripped Indonesian back-translation, cut to at most `max_words`
        words.

    Raises:
        RuntimeError: If the Indonesian ↔ English models or every pivot
            language model failed to load.
    """
    # Fail with a clear message instead of "'NoneType' object is not callable"
    # or an IndexError from random.choice on an empty sequence.
    if id_to_en_model is None or id_to_en_tokenizer is None:
        raise RuntimeError("Indonesian → English model is not loaded.")
    if back_translation_model is None or back_translation_tokenizer is None:
        raise RuntimeError("English → Indonesian model is not loaded.")
    if not language_models:
        raise RuntimeError("No pivot-language translation models are loaded.")

    # Step 1: Indonesian → English
    en_text = _run_translation(text, id_to_en_model, id_to_en_tokenizer)

    # Step 2: English → random pivot language
    random_lang = random.choice(list(language_models))
    pivot_models = language_models[random_lang]
    en_to_random_model, en_to_random_tokenizer = pivot_models["en_to_random"]
    random_lang_text = _run_translation(en_text, en_to_random_model, en_to_random_tokenizer)

    # Step 3: pivot language → English
    random_to_en_model, random_to_en_tokenizer = pivot_models["random_to_en"]
    en_back_text = _run_translation(random_lang_text, random_to_en_model, random_to_en_tokenizer)

    # Step 4: English → Indonesian (back translation)
    back_to_indonesian_text = _run_translation(
        en_back_text, back_translation_model, back_translation_tokenizer)

    # Cap the length by whitespace-separated words (not model tokens).
    review_output = back_to_indonesian_text.strip()
    words = review_output.split()
    if len(words) > max_words:
        review_output = ' '.join(words[:max_words])

    return review_output

# Load the dataset
file_path = '/content/DataRaw1.xlsx'  # Replace with your file path
# header=None: no row of the sheet is used as a header; names are assigned below.
data = pd.read_excel(file_path, header=None)

# Assign meaningful column names.
# The fifth column used to be labelled 'name' as well; duplicate labels make
# data['name'] return a two-column DataFrame instead of a Series, so it is
# given a distinct label. Only 'label' and 'review' are read by the code
# visible in this notebook.
data.columns = ['username1', 'name', 'label', 'review', 'name_2', 'nrp']

# Remove rows with invalid labels (not 0, 1, or 2)
data_cleaned = data[data['label'].isin([0, 1, 2])]

# Check the distribution of labels
label_counts = data_cleaned['label'].value_counts()
print("Label distribution before augmentation:", label_counts)

# Function to augment reviews with translation
def augment_reviews_with_translation(reviews, existing_reviews, target_count,
                                     max_stalled_passes=3):
    """Grow a list of reviews to `target_count` with back-translated paraphrases.

    Each pass sends every unique original review through
    translate_random_language_with_preloaded_models and keeps results that
    are non-empty and not already known. Passes repeat until enough new
    reviews are collected or no progress is made.

    Args:
        reviews: Original review texts for one label.
        existing_reviews: Reviews already present in the dataset (any label);
            generated text equal to one of these is rejected. May be None.
        target_count: Desired number of reviews in the result.
        max_stalled_passes: Number of consecutive passes that may add nothing
            before giving up, so the loop always terminates.

    Returns:
        List of the unique original reviews (first-seen order) followed by
        the accepted augmented reviews (acceptance order). It may be shorter
        than `target_count` if augmentation stalls.
    """
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    original_reviews = list(dict.fromkeys(reviews))

    # Everything a generated review must differ from.
    seen = set(original_reviews)
    seen.update(existing_reviews or ())

    augmented_reviews = []
    # Count against the de-duplicated originals, since those are what is
    # returned; otherwise duplicates leave the result short of the target.
    total_needed = max(0, target_count - len(original_reviews))

    print(f"Starting augmentation: Need {total_needed} reviews.")
    start_time = time.time()

    stalled_passes = 0
    while len(augmented_reviews) < total_needed:
        accepted_before = len(augmented_reviews)

        for review in original_reviews:
            # Missing values and blank cells cannot be translated.
            if not isinstance(review, str) or not review.strip():
                continue

            try:
                augmented_review = translate_random_language_with_preloaded_models(review)
            except Exception as e:
                # Best effort: one failing review must not abort the run.
                print(f"Error during augmentation: {e}")
                continue

            if not augmented_review or augmented_review in seen:
                continue

            seen.add(augmented_review)
            augmented_reviews.append(augmented_review)
            print(f"Accepted review: {augmented_review}")
            print(f"Generated {len(augmented_reviews)} of {total_needed} reviews.")
            print(f"Current runtime: {time.time() - start_time:.2f} seconds.")
            if len(augmented_reviews) >= total_needed:
                break

        # Stop once whole passes keep producing nothing new; checking only
        # for "nothing accepted at all" would loop forever after a late stall.
        if len(augmented_reviews) == accepted_before:
            stalled_passes += 1
            if stalled_passes >= max_stalled_passes:
                print("No new reviews generated. Exiting to prevent infinite loop.")
                break
        else:
            stalled_passes = 0

    return original_reviews + augmented_reviews

# Split the cleaned review texts by class label
label_0_reviews = data_cleaned.loc[data_cleaned['label'] == 0, 'review'].tolist()
label_1_reviews = data_cleaned.loc[data_cleaned['label'] == 1, 'review'].tolist()
label_2_reviews = data_cleaned.loc[data_cleaned['label'] == 2, 'review'].tolist()

# Every review currently in the cleaned dataset, regardless of label
existing_reviews = set(data_cleaned['review'].tolist())

# Labels 1 and 2 are grown to the size of label 0
target_count = len(label_0_reviews)

# Generate the additional reviews for labels 1 and 2
augmented_label_1_reviews = augment_reviews_with_translation(
    label_1_reviews, existing_reviews, target_count)
augmented_label_2_reviews = augment_reviews_with_translation(
    label_2_reviews, existing_reviews, target_count)

# One frame per label; label 0 is taken unchanged from the cleaned data
label_0_df = data_cleaned.loc[data_cleaned['label'] == 0, ['label', 'review']]
label_1_df = pd.DataFrame({'label': 1, 'review': augmented_label_1_reviews})
label_2_df = pd.DataFrame({'label': 2, 'review': augmented_label_2_reviews})

# Stack the three frames, shuffle the rows, and renumber the index
balanced_data = (
    pd.concat([label_0_df, label_1_df, label_2_df])
    .sample(frac=1)
    .reset_index(drop=True)
)

# Write the balanced dataset to an Excel file
output_path = 'aug_data1.xlsx'
balanced_data.to_excel(output_path, index=False)
print(f"Augmented and balanced dataset saved to {output_path}")


Loading translation models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Loaded models for language: es


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

Loaded models for language: ru


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

Loaded models for language: ar


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Loaded models for language: fr


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/226M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/451k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/470k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/598k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/226M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/470k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/451k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/598k [00:00<?, ?B/s]

Loaded models for language: az


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Loaded models for language: zh


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/297M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/819k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/297M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/819k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Loaded models for language: af


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/305M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/999k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.33M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/997k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Loaded models for language: bg


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/292M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/815k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Loaded models for language: ca


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Loaded models for language: da


config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Loaded models for language: de


config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/832k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Loaded models for language: fi


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/293M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/222M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/450k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/443k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

Loaded models for language: gl


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/225M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/507k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/441k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/802k [00:00<?, ?B/s]

Loaded models for language: grk


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/792k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/850k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/850k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/792k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Loaded models for language: hu


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/789k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/344M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Loaded models for language: it


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/274M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/509k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/274M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/509k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

Loaded models for language: jap
Loading Indonesian → English model...


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Indonesian → English model loaded successfully.
Loading English → Indonesian model...


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

English → Indonesian model loaded successfully.
Label distribution before augmentation: label
0    891
1    211
2    167
Name: count, dtype: int64
Starting augmentation: Need 680 reviews.
Accepted review: Aku sudah berhenti berharap, tidak lagi melihat masa depan, aku hanya ingin berhenti bernapas.
Generated 1 of 680 reviews.
Current runtime: 5.71 seconds.
Accepted review: Selama-lamanya mereka akan berteriak di dalam kepalaku, dan aku tidak melakukannya, dan aku tidak akan berbuat baik
Generated 2 of 680 reviews.
Current runtime: 13.34 seconds.
Accepted review: Berkat semua orang yang pernah menyakitiku, sampai aku menemukannya sekarang, dan alasanku melakukan ini adalah karena kalian semua.
Generated 3 of 680 reviews.
Current runtime: 19.90 seconds.
Accepted review: Sekarang adalah waktu yang baik untuk kehilangan hidup saya.
Generated 4 of 680 reviews.
Current runtime: 23.64 seconds.
Accepted review: Tidak ada yang menyadari keberadaan saya, tetapi lebih baik bahwa saya tidak perlu 

KeyboardInterrupt: 