In [2]:
import os
import re
import gc
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
# ==========================================
# 1. MODEL SETUP
# ==========================================
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Menyiapkan Engine di: {device.upper()}")

MODEL_NAME = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights in half precision (FP16) on the GPU to reduce memory use
# and speed up generation; the CPU path keeps full precision (FP32).
# `dtype` is used instead of `torch_dtype`, which the installed transformers
# release reports as deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

print("Model Siap!")

Menyiapkan Engine di: CUDA


`torch_dtype` is deprecated! Use `dtype` instead!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Exception ignored in: <function tqdm.__del__ at 0x000002ABA0711A80>
Traceback (most recent call last):
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


Model Siap!


In [9]:
# ==========================================
# 2. PATH CONFIGURATION
# ==========================================
# CLEAN_PATH receives the processed CSV files; RAW_PATH holds the raw CSV
# files that the execution cell reads.
CLEAN_PATH = "../DatasetHotelCLEAN/KOMPETITORB5"
RAW_PATH = "../DatasetHotel/KOMPETITORB5"

# Create the output folder (including missing parents); a folder that already
# exists is not an error.
os.makedirs(name=CLEAN_PATH, exist_ok=True)

In [10]:
# Slang dictionary (unchanged): maps an informal/abbreviated token to the
# replacement used by clean_text_advanced. Lookups are exact, whole-token
# matches on lower-cased text.
# NOTE(review): several keys ("in", "out", "plus", "min", "ok") are also
# ordinary English words, and translate_fast treats its input as English --
# confirm these replacements are intended for English reviews.
slang_dict = {
    "yg": "yang",
    "ga": "tidak",
    "gak": "tidak",
    "nggak": "tidak",
    "tp": "tapi",
    "krn": "karena",
    "utk": "untuk",
    "sdh": "sudah",
    "udh": "sudah",
    "blm": "belum",
    "dgn": "dengan",
    "dlm": "dalam",
    "bgt": "banget",
    "tdk": "tidak",
    "jgn": "jangan",
    "krg": "kurang",
    "sy": "saya",
    "ak": "aku",
    "kalo": "kalau",
    "kl": "kalau",
    "dr": "dari",
    "bs": "bisa",
    "kmn": "kemana",
    "tmn": "teman",
    "bgs": "bagus",
    "dtg": "datang",
    "br": "baru",
    "ok": "oke",
    "thx": "terima kasih",
    "makasih": "terima kasih",
    "tks": "terima kasih",
    "min": "minus",
    "plus": "tambah",
    "chek": "check",
    "chekout": "check out",
    "chekin": "check in",
    "in": "masuk",
    "out": "keluar",
    "pas": "saat",
    "pd": "pada",
    "pake": "pakai",
    "sm": "sama",
    "lbh": "lebih",
    "bkn": "bukan",
    "spt": "seperti",
    "jd": "jadi",
    "aja": "saja",
    "aj": "saja",
    "kmr": "kamar",
    "kmar": "kamar",
    "mnt": "minta",
    "dl": "dulu",
    "skrg": "sekarang",
    "dg": "dengan",
    "yk": "yogyakarta",
}

In [11]:
def clean_text_advanced(text):
    """Normalize a raw review string.

    Steps, in order:
      1. lower-case the text;
      2. collapse any non-digit character repeated three or more times in a
         row to a single occurrence ("bagusss" -> "bagus");
      3. replace every character that is neither a word character nor
         whitespace with a space;
      4. replace each whitespace-separated token found in ``slang_dict`` with
         its mapped value;
      5. join the tokens with single spaces.

    Digits are excluded from step 2 so numbers such as "1000000" are kept
    intact instead of being shortened to "10".

    Args:
        text: The raw review. Any non-``str`` value yields an empty string.

    Returns:
        The normalized text, without leading, trailing or repeated whitespace.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # \D (instead of .) keeps runs of identical digits untouched.
    text = re.sub(r'(\D)\1{2,}', r'\1', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # split() + join already trims and collapses whitespace, so no further
    # whitespace normalization is needed afterwards.
    return ' '.join(slang_dict.get(word, word) for word in text.split())

In [12]:
# Translation function
def translate_fast(text_list, batch_size=64, src_lang="eng_Latn",
                   tgt_lang="ind_Latn", max_length=128, max_new_tokens=128):
    """Translate a list of strings with the module-level NLLB model.

    The texts are processed in consecutive slices of ``batch_size``. Each
    slice is tokenized (padded, truncated to ``max_length`` tokens), moved to
    ``device`` and decoded greedily (``num_beams=1``, ``do_sample=False``)
    with the target-language token forced as the first generated token.

    Side effect: sets ``tokenizer.src_lang`` to ``src_lang``.

    Args:
        text_list: Strings to translate.
        batch_size: Number of texts sent to the model per generate call;
            must be at least 1.
        src_lang: Source-language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Target-language code whose token id is forced as BOS.
        max_length: Truncation limit (in tokens) for each input text. 128 was
            chosen by the author as sufficient for hotel reviews; larger
            values are slower.
        max_new_tokens: Upper bound on generated tokens per text.

    Returns:
        A list of translated strings, in the same order as ``text_list``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not text_list:
        return []

    results = []
    tokenizer.src_lang = src_lang
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)

        with torch.no_grad():
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_new_tokens=max_new_tokens,
                num_beams=1,        # greedy search: the main speed lever
                do_sample=False,
            )

        results.extend(
            tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        )

        # Drop the references to this batch's tensors right away.
        del inputs, translated_tokens

    return results

In [13]:
# ==========================================
# 3. EXECUTION
# ==========================================
BATCH_SIZE = 64  # the author's choice: 64 is affordable because the model runs in FP16


def _process_review_file(filename):
    """Clean and translate one raw CSV, then write it to CLEAN_PATH.

    Reads ``RAW_PATH/filename``, drops rows whose 'Review Text' is missing or
    a placeholder, removes the 'No' and 'Review Count' columns when present,
    normalizes the text with ``clean_text_advanced``, keeps rows whose
    cleaned text is longer than 2 characters, translates them with
    ``translate_fast`` and saves the result as ``<name>_Clean.csv`` with the
    translated text stored in the 'Review Text' column.

    Args:
        filename: Name of a CSV file inside RAW_PATH.

    Returns:
        The number of reviews written; 0 when nothing was left to translate
        (in which case no output file is created).
    """
    df = pd.read_csv(os.path.join(RAW_PATH, filename))

    # Cleaning
    df = df.dropna(subset=['Review Text'])
    # Compared against the lower-cased, stripped text, so only lower-case,
    # whitespace-free placeholders are needed here.
    blacklist = ['n/a', 'na', 'nan', '-', '', 'null']
    df = df[~df['Review Text'].astype(str).str.lower().str.strip().isin(blacklist)]
    df = df.drop(columns=['No', 'Review Count'], errors='ignore')

    df['Review Text Cleaned'] = df['Review Text'].apply(clean_text_advanced)
    # .copy() makes the filtered frame independent before it is assigned to.
    df = df[df['Review Text Cleaned'].str.len() > 2].copy()

    # Translate
    texts = df['Review Text Cleaned'].tolist()
    if not texts:
        return 0

    # translate_fast already splits its input into batches of BATCH_SIZE.
    df['Review Text Cleaned'] = translate_fast(texts, BATCH_SIZE)

    # Save: only the extension is rewritten, even if ".csv" also appears
    # elsewhere in the file name.
    stem, _ = os.path.splitext(filename)
    output_path = os.path.join(CLEAN_PATH, f"{stem}_Clean.csv")

    df = df.drop(columns=['Review Text'], errors='ignore')
    df = df.rename(columns={'Review Text Cleaned': 'Review Text'})
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    return len(texts)


# Sorted so the processing order is the same on every run and platform.
files = sorted(f for f in os.listdir(RAW_PATH) if f.endswith(".csv"))
print(f"\nMulai Memproses {len(files)} File...\n")

for idx, filename in enumerate(files, start=1):
    print(f"[{idx}/{len(files)}] {filename}...", end=" ")

    try:
        n_rows = _process_review_file(filename)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error: {e}")
    else:
        if n_rows > 0:
            print(f"Selesai ({n_rows} data).")
        else:
            print("Kosong.")
    finally:
        # CLEANUP: the per-file frame and lists are local to the helper and
        # already released; collect garbage and free cached GPU memory after
        # every file, including files that failed.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print("\nSEMUA SELESAI!")


Mulai Memproses 26 File...

[1/26] BatamMarriotHotelHabourBay.csv... Selesai (428 data).
[2/26] BestWesternPremierPanbil-Batam.csv... Selesai (592 data).
[3/26] GrandAstonCityHall-Medan.csv... Selesai (704 data).
[4/26] InterContinentalBaliResort-Bali.csv... Selesai (227 data).
[5/26] InterContinentalBaliSanurResort-Bali (1).csv... Selesai (288 data).
[6/26] InterContinentalDagoPakar-Bandung.csv... Selesai (608 data).
[7/26] InterContinentalJakartaPondokIndah-Jakarta.csv... Selesai (667 data).
[8/26] InterContinentalResidenceJakartaPondokIndah-Jakarta.csv... Selesai (685 data).
[9/26] JWMarriotHotelJakarta.csv... Selesai (683 data).
[10/26] JWMarriotHotelSurabaya.csv... Selesai (609 data).
[11/26] JWMarriotMedan.csv... Selesai (704 data).
[12/26] MovenpickHoteljakartaCityCentre.csv... Selesai (495 data).
[13/26] PullmanBaliLegianBeach (1).csv... Selesai (714 data).
[14/26] PullmanBandungGrandCentral.csv... Selesai (723 data).
[15/26] PullmanJakartaCentralPark.csv... Selesai (394 data)