In [None]:
# Cell 1: Installation
#!pip install -q --upgrade deep-translator pandas tqdm


In [None]:
# Cell 2: Prepare Data Directory

# Use Google Colab to load the data quickly.

import os

# Make sure the upload directory exists; exist_ok=True keeps the call from
# raising when the directory is already there (e.g. when the cell is re-run).
os.makedirs('/content/data', exist_ok=True)

# Emit both status lines with a single call: first the confirmation that the
# directory is ready, then the reminder to upload 'spam.csv' into it.
print(
    "‚úÖ Th∆∞ m·ª•c /content/data ƒë√£ s·∫µn s√†ng.",
    "‚ÄºÔ∏è Vui l√≤ng k√©o v√† th·∫£ file 'spam.csv' c·ªßa b·∫°n v√†o th∆∞ m·ª•c n√†y ·ªü thanh b√™n tr√°i.",
    sep="\n",
)

In [None]:
# Cell 3: Setup Optimized Functions

# Use Google Colab to load the data quickly.

import random
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional

import pandas as pd
from deep_translator import GoogleTranslator
from tqdm import tqdm

# ==========================================================
# ROBUST BACK-TRANSLATION FUNCTION
# Automatically retries on network errors or rate limiting.
# ==========================================================
def _back_translate_robust(sentence: str, source_lang: str = 'en', pivot_lang: str = 'vi', retries: int = 3, backoff_in_seconds: int = 5) -> Optional[str]:
    """Back-translate one sentence (source -> pivot -> source) with retries.

    Args:
        sentence: Text to paraphrase.
        source_lang: Language code of ``sentence``; the result is translated
            back into this language.
        pivot_lang: Intermediate language code used for the round trip.
        retries: Maximum number of attempts.
        backoff_in_seconds: Pause between a failed attempt and the next one.

    Returns:
        The back-translated sentence when it differs (case-insensitively)
        from the input. ``None`` when the input is blank or not a string,
        when the round trip reproduces the input, or when no attempt
        produced a result.
    """
    # Blank or non-string input cannot yield a usable paraphrase, so return
    # before spending any translation requests or retry delays on it.
    if not isinstance(sentence, str) or not sentence.strip():
        return None

    for attempt in range(1, retries + 1):
        try:
            # Forward translation (source -> pivot).
            translated = GoogleTranslator(source=source_lang, target=pivot_lang).translate(sentence)
            if not translated:
                continue  # Empty forward translation: try again.

            # Backward translation (pivot -> source).
            back_translated = GoogleTranslator(source=pivot_lang, target=source_lang).translate(translated)

            # Only a sentence that actually differs from the original is useful.
            if back_translated and back_translated.lower() != sentence.lower():
                return back_translated

            # The round trip changed nothing; signal "skip this sentence".
            return None

        except Exception as e:
            failure = f"‚ö†Ô∏è L·ªói d·ªãch (l·∫ßn {attempt}/{retries}): {e}."
            if attempt < retries:
                # Another attempt follows: announce the wait, then back off.
                print(f"{failure} ƒêang th·ª≠ l·∫°i sau {backoff_in_seconds} gi√¢y...")
                time.sleep(backoff_in_seconds)
            else:
                # Final attempt: report the error but do not sleep, since
                # nothing will be retried afterwards.
                print(failure)

    # Every attempt failed or came back empty.
    return None


# ==========================================================
# PARALLEL DATA AUGMENTATION FUNCTION
# ==========================================================
def augment_data_parallel(input_path: str, message_col: str, label_col: str, max_workers: int = 64, seed: Optional[int] = None) -> pd.DataFrame:
    """
    Load a labelled message CSV and oversample its rarest class by back-translation.

    The first CSV column is taken as the label and the second as the message.
    Both are renamed to ``label_col`` / ``message_col``; every other column is
    dropped, as is any row with a missing label or message.

    Args:
        input_path: Path of the CSV file (read with latin1 encoding).
        message_col: Name given to the message column in the result.
        label_col: Name given to the label column in the result.
        max_workers: Thread-pool size used for the translation calls.
        seed: Optional seed for the sentence sampling and the final shuffle.
            ``None`` (the default) keeps the previous behaviour: the global
            ``random`` state and an unseeded shuffle.

    Returns:
        A shuffled DataFrame with columns ``[label_col, message_col]`` holding
        the original rows plus the generated ones. When the data is empty or
        the classes are already balanced, the cleaned frame is returned as-is
        (not shuffled). Sentences whose back-translation fails or equals the
        input are skipped, so the result can remain short of perfect balance.
        Source sentences are sampled with replacement, so generated rows can
        repeat.
    """
    print("üöÄ B·∫Øt ƒë·∫ßu qu√° tr√¨nh tƒÉng c∆∞·ªùng d·ªØ li·ªáu...")
    # latin1 is what the original author uses for the spam dataset.
    df = pd.read_csv(input_path, encoding='latin1')

    # Normalise the column names: column 0 is the label, column 1 the message.
    df = df.rename(columns={df.columns[0]: label_col, df.columns[1]: message_col})
    df = df[[label_col, message_col]].dropna()

    print("\nPh√¢n ph·ªëi nh√£n ban ƒë·∫ßu:")
    print(df[label_col].value_counts())

    # With no usable rows there is no majority/minority class to compare;
    # return the empty frame instead of failing on an empty counter below.
    if df.empty:
        return df

    ranked_labels = Counter(df[label_col]).most_common()
    major_class_label, major_count = ranked_labels[0]
    minor_class_label, minor_count = ranked_labels[-1]

    if major_count == minor_count:
        print("\n‚úÖ D·ªØ li·ªáu ƒë√£ c√¢n b·∫±ng. Kh√¥ng c·∫ßn augmentation.")
        return df

    num_to_generate = major_count - minor_count
    minority_messages = df[df[label_col] == minor_class_label][message_col].tolist()

    print(f"\nC·∫ßn t·∫°o th√™m {num_to_generate} m·∫´u cho l·ªõp '{minor_class_label}'.")

    # A dedicated generator makes the sampling repeatable when a seed is
    # given; without one, fall back to the module-level generator as before.
    rng = random if seed is None else random.Random(seed)
    sampled_sentences = [rng.choice(minority_messages) for _ in range(num_to_generate)]

    # Sampling is with replacement, so the same sentence is usually drawn
    # several times. Translate each distinct sentence once and reuse the
    # result for every draw instead of repeating identical web requests.
    draw_counts = Counter(sampled_sentences)

    translations = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_sentence = {
            executor.submit(_back_translate_robust, sentence): sentence
            for sentence in draw_counts
        }

        # Collect results as they finish while tqdm shows the progress.
        for future in tqdm(as_completed(future_to_sentence), total=len(future_to_sentence), desc="T·ªïng h·ª£p c√¢u m·ªõi"):
            translations[future_to_sentence[future]] = future.result()

    # Expand in first-draw order (not completion order) so the assembled list
    # does not depend on thread timing.
    augmented_results = []
    for sentence, copies in draw_counts.items():
        paraphrase = translations[sentence]
        if paraphrase:  # None / empty means the sentence was skipped.
            augmented_results.extend([paraphrase] * copies)

    print(f"\n‚ú® ƒê√£ t·∫°o th√†nh c√¥ng {len(augmented_results)} c√¢u m·ªõi ƒë·ªôc nh·∫•t.")

    # Label every generated sentence with the minority class and append the
    # new rows to the original data.
    df_new = pd.DataFrame({
        label_col: [minor_class_label] * len(augmented_results),
        message_col: augmented_results
    })

    df_augmented = pd.concat([df, df_new], ignore_index=True)

    # Shuffle the rows so the generated samples are not grouped at the end.
    df_augmented = df_augmented.sample(frac=1, random_state=seed).reset_index(drop=True)

    return df_augmented

# Status line printed once the cell has run, i.e. after both functions above
# have been defined.
print("‚úÖ C√°c h√†m t·ªëi ∆∞u ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a.")

In [None]:
# Cell 4: Execute Augmentation and Save
# Use Google Colab to load the data quickly.

INPUT_FILE_PATH = '/content/data/spam.csv'
OUTPUT_FILE_PATH = '/content/spam_augmented.csv'

# Wall-clock timestamps taken immediately before and after the augmentation
# call give the total run time.
start_time = time.time()
df_final = augment_data_parallel(
    INPUT_FILE_PATH,
    message_col='Message',  # name of the message column
    label_col='Category',   # name of the label column
)
end_time = time.time()
elapsed_time = end_time - start_time

# Completion banner: a blank line, then the headline and the elapsed time
# framed by two 50-character rules.
print(
    "\n" + "=" * 50,
    "üèÅ QU√Å TR√åNH HO√ÄN T·∫§T üèÅ",
    f"T·ªïng th·ªùi gian th·ª±c thi: {elapsed_time:.2f} gi√¢y",
    "=" * 50,
    sep="\n",
)

# Label distribution after augmentation.
print("\nPh√¢n ph·ªëi nh√£n sau khi augmentation:")
print(df_final['Category'].value_counts())

# Write the result as UTF-8 CSV without the index column.
df_final.to_csv(OUTPUT_FILE_PATH, encoding='utf-8', index=False)
print(f"\n‚úÖ D·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o file: {OUTPUT_FILE_PATH}")

In [None]:
# Cell 5: Download the Result
# Use Google Colab to load the data quickly.
# from google.colab import files

# The download step is currently disabled: the import above is commented out
# and the code below is wrapped in a string literal, so none of it executes.
# To enable it, uncomment the import and remove the surrounding triple quotes.
# NOTE(review): as the last expression of the cell, the notebook will
# presumably echo this string as the cell output - confirm whether intended.
'''try:
    files.download(OUTPUT_FILE_PATH)
except FileNotFoundError:
    print(f"L·ªói: Kh√¥ng t√¨m th·∫•y file '{OUTPUT_FILE_PATH}'. H√£y ch·∫Øc ch·∫Øn r·∫±ng √¥ tr∆∞·ªõc ƒë√£ ch·∫°y th√†nh c√¥ng.")'''