In [8]:
import pandas as pd
from tqdm import tqdm
import re
import os
from deep_translator import GoogleTranslator

# Input and output folders, written as relative paths; they are resolved
# against the current working directory of the kernel.

# Folder that is scanned for the raw review CSV files.
RAW_PATH = "../DatasetHotel/BUMNB3"
# Folder that receives the cleaned CSV files.
CLEAN_PATH = "../DatasetHotelCLEAN/BUMNB3"

# Create the output folder up front; exist_ok=True keeps a re-run of this cell from failing.
os.makedirs(CLEAN_PATH, exist_ok=True)

In [9]:
# KAMUS SLANG

# Informal Indonesian abbreviations and chat spellings, each mapped to the
# word(s) it is expanded to. Lookups are done on whole lower-case tokens.
# NOTE(review): 'in' -> 'masuk' and 'out' -> 'keluar' also rewrite the English
# phrases "check in" / "check out" token by token -- confirm this is intended.
slang_dict = {
    'yg': 'yang',
    'ga': 'tidak',
    'gak': 'tidak',
    'nggak': 'tidak',
    'tp': 'tapi',
    'krn': 'karena',
    'utk': 'untuk',
    'sdh': 'sudah',
    'udh': 'sudah',
    'blm': 'belum',
    'dgn': 'dengan',
    'dlm': 'dalam',
    'bgt': 'banget',
    'tdk': 'tidak',
    'jgn': 'jangan',
    'krg': 'kurang',
    'sy': 'saya',
    'ak': 'aku',
    'kalo': 'kalau',
    'kl': 'kalau',
    'dr': 'dari',
    'bs': 'bisa',
    'kmn': 'kemana',
    'tmn': 'teman',
    'bgs': 'bagus',
    'dtg': 'datang',
    'br': 'baru',
    'ok': 'oke',
    'thx': 'terima kasih',
    'makasih': 'terima kasih',
    'tks': 'terima kasih',
    'min': 'minus',
    'plus': 'tambah',
    'chek': 'check',
    'chekout': 'check out',
    'chekin': 'check in',
    'in': 'masuk',
    'out': 'keluar',
    'pas': 'saat',
    'pd': 'pada',
    'pake': 'pakai',
    'sm': 'sama',
    'lbh': 'lebih',
    'bkn': 'bukan',
    'spt': 'seperti',
    'jd': 'jadi',
    'aja': 'saja',
    'aj': 'saja',
    'kmr': 'kamar',
    'kmar': 'kamar',
    'mnt': 'minta',
    'dl': 'dulu',
    'skrg': 'sekarang',
    'dg': 'dengan',
    'yk': 'yogyakarta',
}

In [10]:
# =========================
# FUNGSI CLEANING
# =========================
def clean_text_advanced(text, slang=None):
    """Normalize a raw review string.

    Steps, in order: lower-case the text; squeeze any non-digit character
    that is repeated three or more times in a row down to one occurrence;
    replace every character that is neither a word character nor whitespace
    with a space; expand whole tokens through the slang lookup; collapse
    whitespace.

    Args:
        text: Raw review text. Any value that is not a ``str`` yields ``""``.
        slang: Optional mapping of token -> replacement. When omitted, the
            module-level ``slang_dict`` is used.

    Returns:
        The cleaned, lower-case, single-spaced string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    if slang is None:
        slang = slang_dict
    text = text.lower()
    # Squeeze elongated spellings ("baguuuus" -> "bagus"). Digits are excluded
    # on purpose: collapsing them would turn "1000000" into "10".
    text = re.sub(r'(\D)\1{2,}', r'\1', text)
    # Punctuation and symbols become spaces so adjacent words stay separated.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Replacement is per whole token; tokens not in the mapping pass through.
    normalized_words = [slang.get(word, word) for word in text.split()]
    text = ' '.join(normalized_words)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [11]:
# =========================
# FUNGSI TRANSLATE
# =========================
def translate_to_indonesia(text):
    """Translate ``text`` to Indonesian on a best-effort basis.

    Args:
        text: The string to translate.

    Returns:
        The translated string. ``text`` is returned unchanged when it is not
        a string, is shorter than three characters, the translator raises,
        or the translator does not return a non-blank string.
    """
    # Non-strings (for example NaN coming out of pandas) have no len(); hand
    # them back instead of raising TypeError before the try block is reached.
    if not isinstance(text, str) or len(text) < 3:
        return text
    try:
        translated = GoogleTranslator(source='auto', target='id').translate(text)
    except Exception:
        # Deliberate best-effort: any translator failure keeps the original
        # text so a long batch run is not interrupted.
        return text
    # Defensive: if the translator ever yields None or a blank string, keep the
    # original text so the caller does not lose the review.
    # NOTE(review): whether the library can return None here is not verified.
    if isinstance(translated, str) and translated.strip():
        return translated
    return text

In [12]:
# =========================
# PROSES SEMUA FILE DI RAW
# =========================
# Registers Series.progress_apply so the translation step shows a progress bar.
tqdm.pandas(desc="Menerjemahkan ke Indonesia")

# Placeholder strings that mean "no review". Values are compared after
# lower-casing and stripping, so only the normalized forms are listed.
blacklist = {'n/a', 'na', 'nan', '-', '', 'null'}

# For every CSV in RAW_PATH: drop unusable rows, clean and translate the
# 'Review Text' column, then write the result to CLEAN_PATH as <name>_Clean.csv.
# sorted(): os.listdir() returns names in arbitrary order; a fixed order keeps
# the log comparable between runs.
for filename in sorted(os.listdir(RAW_PATH)):
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".csv":
        continue

    input_path = os.path.join(RAW_PATH, filename)
    df = pd.read_csv(input_path)

    # A single file without the expected column should not abort the batch.
    if 'Review Text' not in df.columns:
        print(f"File dilewati  : {filename} (kolom 'Review Text' tidak ditemukan)")
        print("-" * 50)
        continue

    # Drop rows whose review is missing (NaN)
    df = df.dropna(subset=['Review Text'])

    # Drop rows whose review is only a placeholder such as "N/A", "-", "null"
    is_placeholder = df['Review Text'].astype(str).str.lower().str.strip().isin(blacklist)
    df = df[~is_placeholder]

    # Drop unneeded columns when they are present
    df = df.drop(columns=[c for c in ['No', 'Review Time'] if c in df.columns])

    # Cleaning
    df['Review Text Cleaned'] = df['Review Text'].apply(clean_text_advanced)

    # Reviews made only of characters the cleaner strips (punctuation, emoji)
    # are now empty strings. dropna() would not catch them, so remove them
    # here -- this also avoids sending them to the translator.
    df = df[df['Review Text Cleaned'] != ""].copy()

    # Translate
    df['Review Text Cleaned'] = df['Review Text Cleaned'].progress_apply(translate_to_indonesia)

    # Finalize: the cleaned column takes over the 'Review Text' name
    df = df.drop(columns=['Review Text'])
    df = df.rename(columns={'Review Text Cleaned': 'Review Text'})
    df = df.drop_duplicates().dropna(subset=['Review Text'])

    # Save as <OriginalName>_Clean.csv; splitext touches only the extension,
    # unlike str.replace which would rewrite every ".csv" inside the name.
    clean_filename = stem + "_Clean.csv"
    output_path = os.path.join(CLEAN_PATH, clean_filename)

    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

    print(f"File diproses  : {filename}")
    print(f"File disimpan  : {clean_filename}")
    print("-" * 50)

print("Semua file berhasil diproses dan disimpan")

Menerjemahkan ke Indonesia: 100%|██████████| 422/422 [01:35<00:00,  4.44it/s]


File diproses  : Banaran9ResortHotel.csv
File disimpan  : Banaran9ResortHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 837/837 [02:18<00:00,  6.02it/s]


File diproses  : BrothersSolo.csv
File disimpan  : BrothersSolo_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 134/134 [00:20<00:00,  6.52it/s]


File diproses  : CodiaBanjarmasin.csv
File disimpan  : CodiaBanjarmasin_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 126/126 [00:17<00:00,  7.37it/s]


File diproses  : CordiaBanjarmasin.csv
File disimpan  : CordiaBanjarmasin_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 869/869 [02:09<00:00,  6.70it/s]


File diproses  : HAKAHotelSemarang.csv
File disimpan  : HAKAHotelSemarang_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 1000/1000 [03:10<00:00,  5.25it/s]


File diproses  : HarperJakartaMTHaryono.csv
File disimpan  : HarperJakartaMTHaryono_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 883/883 [02:26<00:00,  6.02it/s]


File diproses  : HotelGrandSurabaya.csv
File disimpan  : HotelGrandSurabaya_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 935/935 [02:29<00:00,  6.27it/s]


File diproses  : HotelRattanInn.csv
File disimpan  : HotelRattanInn_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 697/697 [02:13<00:00,  5.22it/s]


File diproses  : InnaBaliHeritageHotel.csv
File disimpan  : InnaBaliHeritageHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 757/757 [02:00<00:00,  6.28it/s]


File diproses  : InnaSindhuBeachHotel.csv
File disimpan  : InnaSindhuBeachHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 852/852 [02:12<00:00,  6.45it/s]


File diproses  : InnaTretesHotel.csv
File disimpan  : InnaTretesHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 854/854 [02:24<00:00,  5.89it/s]


File diproses  : KHASGresikHotel.csv
File disimpan  : KHASGresikHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 857/857 [02:24<00:00,  5.92it/s]


File diproses  : KHASMakassarHotel (1).csv
File disimpan  : KHASMakassarHotel (1)_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 949/949 [02:32<00:00,  6.22it/s]


File diproses  : KHASMalioboroHotel (1).csv
File disimpan  : KHASMalioboroHotel (1)_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 328/328 [01:09<00:00,  4.73it/s]


File diproses  : KHASOmbilinHotel.csv
File disimpan  : KHASOmbilinHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 91/91 [00:15<00:00,  6.04it/s]


File diproses  : KHASPaluHotel.csv
File disimpan  : KHASPaluHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 848/848 [02:15<00:00,  6.24it/s]


File diproses  : KHASPekalonganHotel.csv
File disimpan  : KHASPekalonganHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 857/857 [02:24<00:00,  5.93it/s]


File diproses  : KHASPekanbaruRiauHotel.csv
File disimpan  : KHASPekanbaruRiauHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 855/855 [02:44<00:00,  5.18it/s]


File diproses  : KHASPrapat.csv
File disimpan  : KHASPrapat_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 889/889 [02:28<00:00,  5.98it/s]


File diproses  : KHASSemarangHotel.csv
File disimpan  : KHASSemarangHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 851/851 [02:17<00:00,  6.20it/s]


File diproses  : KHASSurabayaHotel.csv
File disimpan  : KHASSurabayaHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 872/872 [02:17<00:00,  6.35it/s]


File diproses  : KHASTegalHotel.csv
File disimpan  : KHASTegalHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 1073/1073 [02:54<00:00,  6.14it/s]


File diproses  : KHASTuguHotel (1).csv
File disimpan  : KHASTuguHotel (1)_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 805/805 [02:18<00:00,  5.81it/s]


File diproses  : KyriadHotelAirportTanggerang.csv
File disimpan  : KyriadHotelAirportTanggerang_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 871/871 [02:21<00:00,  6.17it/s]


File diproses  : LafayetteHotel.csv
File disimpan  : LafayetteHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 865/865 [02:26<00:00,  5.92it/s]


File diproses  : LPPGardenHotelJogjakarta.csv
File disimpan  : LPPGardenHotelJogjakarta_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 720/720 [02:12<00:00,  5.43it/s]


File diproses  : PalmParkSurabaya.csv
File disimpan  : PalmParkSurabaya_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 836/836 [02:24<00:00,  5.80it/s]


File diproses  : ParkHotel.csv
File disimpan  : ParkHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 873/873 [02:19<00:00,  6.26it/s]


File diproses  : PatraAnerHotel.csv
File disimpan  : PatraAnerHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 843/843 [02:21<00:00,  5.96it/s]


File diproses  : PatraBandungHotel.csv
File disimpan  : PatraBandungHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 383/383 [01:02<00:00,  6.08it/s]


File diproses  : PatraDumaiHotel.csv
File disimpan  : PatraDumaiHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 410/410 [01:18<00:00,  5.20it/s]


File diproses  : PatraJakarta.csv
File disimpan  : PatraJakarta_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 721/721 [02:02<00:00,  5.88it/s]


File diproses  : PatraMalioboroHotel.csv
File disimpan  : PatraMalioboroHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 705/705 [02:17<00:00,  5.13it/s]


File diproses  : PatraParapatHotel.csv
File disimpan  : PatraParapatHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 415/415 [01:36<00:00,  4.32it/s]


File diproses  : RollasHotel&Resort.csv
File disimpan  : RollasHotel&Resort_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 903/903 [02:17<00:00,  6.55it/s]


File diproses  : TheManohara.csv
File disimpan  : TheManohara_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 868/868 [02:34<00:00,  5.62it/s]


File diproses  : ThePatraBaliResort&Villlaa.csv
File disimpan  : ThePatraBaliResort&Villlaa_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 554/554 [01:46<00:00,  5.20it/s]


File diproses  : TheSilkArt+DesignHotel.csv
File disimpan  : TheSilkArt+DesignHotel_Clean.csv
--------------------------------------------------


Menerjemahkan ke Indonesia: 100%|██████████| 869/869 [02:08<00:00,  6.77it/s]

File diproses  : Up-PeakSemarang.csv
File disimpan  : Up-PeakSemarang_Clean.csv
--------------------------------------------------
Semua file berhasil diproses dan disimpan



