In [6]:
import logging
import os
import re

import pandas as pd
from deep_translator import GoogleTranslator
from tqdm import tqdm

# Relative paths: where the raw review CSVs are read from and where the
# cleaned copies are written to.
RAW_PATH = "../DatasetHotel/KOMPETITORB3"
CLEAN_PATH = "../DatasetHotelCLEAN/KOMPETITORB3"

# Create the output folder (and any missing parents) up front; exist_ok=True
# makes re-running this cell harmless when the folder is already there.
os.makedirs(CLEAN_PATH, exist_ok=True)

In [7]:
# SLANG DICTIONARY
#
# Maps informal Indonesian abbreviations / misspellings (keys) to the form
# they should be rewritten to (values). Lookup is by exact, whole token.
#
# NOTE(review): 'in', 'out', 'min', 'plus' and 'pas' are also ordinary words
# in English/French reviews. clean_text_advanced applies this table before
# translation, so such reviews get these tokens rewritten as well (e.g.
# "check in" -> "check masuk") -- confirm this is intended.
slang_dict = {
    'yg': 'yang',
    'ga': 'tidak',
    'gak': 'tidak',
    'nggak': 'tidak',
    'tp': 'tapi',
    'krn': 'karena',
    'utk': 'untuk',
    'sdh': 'sudah',
    'udh': 'sudah',
    'blm': 'belum',
    'dgn': 'dengan',
    'dlm': 'dalam',
    'bgt': 'banget',
    'tdk': 'tidak',
    'jgn': 'jangan',
    'krg': 'kurang',
    'sy': 'saya',
    'ak': 'aku',
    'kalo': 'kalau',
    'kl': 'kalau',
    'dr': 'dari',
    'bs': 'bisa',
    'kmn': 'kemana',
    'tmn': 'teman',
    'bgs': 'bagus',
    'dtg': 'datang',
    'br': 'baru',
    'ok': 'oke',
    'thx': 'terima kasih',
    'makasih': 'terima kasih',
    'tks': 'terima kasih',
    'min': 'minus',
    'plus': 'tambah',
    'chek': 'check',
    'chekout': 'check out',
    'chekin': 'check in',
    'in': 'masuk',
    'out': 'keluar',
    'pas': 'saat',
    'pd': 'pada',
    'pake': 'pakai',
    'sm': 'sama',
    'lbh': 'lebih',
    'bkn': 'bukan',
    'spt': 'seperti',
    'jd': 'jadi',
    'aja': 'saja',
    'aj': 'saja',
    'kmr': 'kamar',
    'kmar': 'kamar',
    'mnt': 'minta',
    'dl': 'dulu',
    'skrg': 'sekarang',
    'dg': 'dengan',
    'yk': 'yogyakarta',
}

In [8]:
# =========================
# CLEANING FUNCTION
# =========================
def clean_text_advanced(text, slang=None):
    """Normalize one raw review string.

    Steps, in order: lowercase; squeeze any non-digit character repeated
    three or more times in a row down to a single occurrence; replace every
    character that is neither a word character nor whitespace with a space;
    replace whole tokens found in the slang table; collapse whitespace.

    Args:
        text: Raw review text. Any value that is not a ``str`` yields an
            empty string.
        slang: Optional mapping of token -> replacement. When omitted, the
            module-level ``slang_dict`` is used.

    Returns:
        The cleaned, lowercase, single-spaced string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    mapping = slang_dict if slang is None else slang

    text = text.lower()
    # Squeeze elongated spellings ("baguuuus" -> "bagus"). Digits are
    # deliberately excluded so numbers such as "100000" are not truncated
    # to "10".
    text = re.sub(r'(\D)\1{2,}', r'\1', text)
    # Punctuation, emoji and other symbols become spaces so that the words
    # around them stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    words = [mapping.get(word, word) for word in text.split()]
    text = ' '.join(words)
    # Replacements may themselves contain spaces; normalize once more.
    return re.sub(r'\s+', ' ', text).strip()


In [9]:
# =========================
# TRANSLATION FUNCTION
# =========================
def translate_to_indonesia(text):
    """Translate ``text`` to Indonesian, best effort.

    A ``GoogleTranslator(source='auto', target='id')`` is built for each
    call and asked to translate the text.

    Args:
        text: The string to translate. Non-string values (None, NaN,
            numbers) and strings shorter than 3 characters are returned
            unchanged without contacting the translator.

    Returns:
        The translated string, or the input unchanged when it was skipped,
        when the translator raised, or when the translator returned an
        empty/None result.
    """
    # len() on a non-string would raise outside the try block below, so
    # anything that is not a str is passed straight through.
    if not isinstance(text, str) or len(text) < 3:
        return text
    try:
        translated = GoogleTranslator(source='auto', target='id').translate(text)
    except Exception as exc:
        # Deliberately best effort: one failed request must not abort a
        # whole batch. The failure is logged so that a run where many rows
        # stayed untranslated does not go unnoticed.
        logging.getLogger(__name__).warning(
            "Translation failed (%s: %s); keeping original text: %r",
            type(exc).__name__, exc, text[:60],
        )
        return text
    # Never hand a None/empty result back to the caller; keep the input so
    # the review is not lost as a missing value.
    return translated if translated else text

In [10]:
# =========================
# PROCESS EVERY FILE IN THE RAW FOLDER
# =========================
# Registers Series.progress_apply so the translation step shows a progress bar.
tqdm.pandas(desc="Menerjemahkan ke Indonesia")

# os.listdir returns entries in arbitrary order; sorting makes every run
# process (and report) the files in the same order.
for filename in sorted(os.listdir(RAW_PATH)):
    if not filename.endswith(".csv"):
        continue

    input_path = os.path.join(RAW_PATH, filename)
    df = pd.read_csv(input_path)

    # Drop rows whose review is missing altogether (NaN).
    df = df.dropna(subset=['Review Text'])

    # Drop rows whose review is only a placeholder such as "N/A", "-" or
    # "null". Values are compared after lower() and strip().
    blacklist = ['N/A','n/a', 'na', 'nan', '-', '', ' ', 'null']
    df = df[~df['Review Text'].astype(str).str.lower().str.strip().isin(blacklist)]

    # Remove columns that are not needed, when they are present.
    df = df.drop(columns=[c for c in ['No', 'Review Time'] if c in df.columns])

    # Clean first, then translate the cleaned text.
    df['Review Text Cleaned'] = df['Review Text'].apply(clean_text_advanced)
    df['Review Text Cleaned'] = df['Review Text Cleaned'].progress_apply(translate_to_indonesia)

    # Swap the processed text in under the original column name.
    df = df.drop(columns=['Review Text'])
    df = df.rename(columns={'Review Text Cleaned': 'Review Text'})
    df = df.dropna(subset=['Review Text'])
    # Cleaning can reduce a review (emoji or punctuation only) to an empty
    # string. dropna does not catch that, and it would be written out as a
    # blank cell, so remove those rows explicitly.
    df = df[df['Review Text'].astype(str).str.strip() != '']
    df = df.drop_duplicates()

    # Save as <OriginalName>_Clean.csv. splitext only touches the real
    # extension, unlike str.replace which rewrites every ".csv" in the name.
    clean_filename = os.path.splitext(filename)[0] + "_Clean.csv"
    output_path = os.path.join(CLEAN_PATH, clean_filename)

    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"File diproses  : {filename}")
    print(f"File disimpan  : {clean_filename}")
    print("-" * 50)

print("Semua file berhasil diproses dan disimpan")

Menerjemahkan ke Indonesia: 100%|██████████| 844/844 [03:20<00:00,  4.20it/s]


File diproses  : AloftJakartaWahidHasyim.csv
File disimpan  : AloftJakartaWahidHasyim_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 821/821 [03:06<00:00,  4.41it/s]


File diproses  : HolidayInnExpressJakartaPluitCityGate.csv
File disimpan  : HolidayInnExpressJakartaPluitCityGate_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 855/855 [03:18<00:00,  4.31it/s]


File diproses  : HolidayInnExpressJakartaWahidhasyim.csv
File disimpan  : HolidayInnExpressJakartaWahidhasyim_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 930/930 [03:10<00:00,  4.87it/s]


File diproses  : HolidayInnExpressSemarangSimpangLima.csv
File disimpan  : HolidayInnExpressSemarangSimpangLima_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 932/932 [03:27<00:00,  4.49it/s]

File diproses  : IbisStylesYogyakarta (1).csv
File disimpan  : IbisStylesYogyakarta (1)_Clean.csv
--------------------------------------------------
Semua file berhasil diproses dan disimpan



