In [1]:
import pandas as pd
import re
import unicodedata
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
# Load the Excel workbook into `df`. The path is relative, so it is resolved
# against the current working directory of the kernel.
df = pd.read_excel("../dataset/data_test_indobert_if.xlsx")

In [4]:
def _is_period_label(label):
    """Return True when a column label looks like a date/period column.

    A label qualifies when it either contains a '/' or '-' together with at
    least one digit (date or date-range format), or mentions one of the
    period keywords (case-insensitive).
    """
    name = str(label)
    has_separator = '/' in name or '-' in name
    has_digit = any(ch.isdigit() for ch in name)
    if has_separator and has_digit:
        return True
    lowered = name.lower()
    return any(word in lowered for word in ('periode', 'tanggal', 'bulan', 'tahun'))


# Keep the matching columns in their original order.
review_period_columns = [col for col in df.columns if _is_period_label(col)]

print(f"Kolom periode ulasan yang terdeteksi: {review_period_columns}")

Kolom periode ulasan yang terdeteksi: ['06/01/2020 - 12/01/2020', '13/01/2020 - 19/01/2020', '20/01/2020 - 26/01/2020', '27/01/2020 - 02/02/2020', '03/02/2020 - 09/02/2020', '10/02/2020 - 16/02/2020', '17/02/2020 - 23/02/2020', '24/02/2020 - 01/03/2020', '02/03/2020 - 08/03/2020', '09/03/2020 - 15/03/2020', '16/03/2020 - 22/03/2020', '23/03/2020 - 29/03/2020', '30/03/2020 - 05/04/2020', '06/04/2020 - 12/04/2020', '13/04/2020 - 19/04/2020', '20/04/2020 - 26/04/2020', '27/04/2020 - 03/05/2020', '04/05/2020 - 10/05/2020', '11/05/2020 - 17/05/2020', '18/05/2020 - 24/05/2020', '25/05/2020 - 31/05/2020', '01/06/2020 - 07/06/2020', '08/06/2020 - 14/06/2020', '15/06/2020 - 21/06/2020', '22/06/2020 - 28/06/2020', '29/06/2020 - 05/07/2020', '06/07/2020 - 12/07/2020', '13/07/2020 - 19/07/2020', '20/07/2020 - 26/07/2020', '27/07/2020 - 02/08/2020', '03/08/2020 - 09/08/2020', '10/08/2020 - 16/08/2020', '17/08/2020 - 23/08/2020', '24/08/2020 - 30/08/2020', '31/08/2020 - 06/09/2020', '07/09/2020 - 13

DATA INTEGRATION

In [5]:
# Merge product name and description into one "text" column. Missing values
# are substituted with an empty string, so the result never contains NaN.
df["text"] = df["Nama Produk"].str.cat(
    df["Deskripsi Produk"],
    sep=" ",
    na_rep="",
)

# The two source columns are redundant once they have been merged.
df = df.drop(columns=['Nama Produk', 'Deskripsi Produk'])

# Move "text" to the front; every other column keeps its relative order.
remaining_columns = [name for name in df.columns if name != 'text']
kolom_baru = ['text', *remaining_columns]
df = df[kolom_baru]

# df.to_csv("data_integration_test.csv", index=False, encoding='utf-8-sig', sep=';')
# print("File data_integration_test.csv berhasil disimpan")

# Show the merged text next to the first three review-period columns.
print("\nPreview data:")
preview_columns = ["text", *review_period_columns[:3]]
print(df[preview_columns].head())



Preview data:
                                                text  06/01/2020 - 12/01/2020  \
0  OBH Surya Itrasal 100 ml - Obat Batuk Hitam In...                        0   
1  OBH Combi Batuk Berdahak Menthol 100Ml OBH Com...                        0   
2  DEGIROL LOZ STRIP 10 TABLET / OBAT RADANG MULU...                        0   
3  ENT Clear powder Packet (20 sachet) Produk ori...                        0   
4  Degirol Tablet Hisap (10 Strip/1 Box) DEGIROL ...                        0   

   13/01/2020 - 19/01/2020  20/01/2020 - 26/01/2020  
0                        0                        0  
1                        0                        0  
2                        0                        0  
3                        0                        0  
4                        0                        0  


DATA CLEANING

In [6]:
def clean_text(text):
    """Normalize a raw value into a compact, ASCII-only string.

    Steps, in order:
      1. Coerce the input with ``str()`` (so a float NaN becomes ``"nan"``).
      2. NFKD-normalize, then drop combining marks so an accented letter is
         reduced to its base letter (``"cafés"`` -> ``"cafes"``). Without this
         step the mark would be replaced by a space and split the word.
      3. Replace every remaining non-ASCII character with a space.
      4. Replace runs of two or more ``=`` with a space, and runs of two or
         more characters drawn from ``!#*~`` with a space.
      5. Collapse repeated newlines into one newline, then any run of two or
         more whitespace characters into a single space.
      6. Strip leading and trailing whitespace.

    Args:
        text: Any value; it is converted to ``str`` before cleaning.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize("NFKD", str(text))
    # Combining marks are non-ASCII; remove them here instead of letting the
    # next substitution turn them into spaces in the middle of a word.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    text = re.sub(r"[^\x00-\x7F]", " ", text)
    text = re.sub(r"[=]{2,}", " ", text)
    text = re.sub(r"[!#*~]{2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

In [7]:
print("Memulai pembersihan... Menampilkan maksimal 5 baris pertama yang berubah:\n")

preview_count = 0
preview_limit = 5

# Clean the whole column in one pass instead of iterating row by row and
# writing back with .loc. This avoids building a Series per row and does not
# depend on the index labels being unique.
text_before = df['text']
text_after = text_before.map(clean_text)

# Only the first few rows whose text actually changed are previewed.
changed = text_before.astype(str) != text_after
before_preview = text_before[changed].head(preview_limit)
after_preview = text_after[changed].head(preview_limit)

for index, text_original, text_cleaned in zip(before_preview.index, before_preview, after_preview):
    # Characters present before cleaning but entirely absent afterwards.
    removed_chars = set(str(text_original)) - set(text_cleaned)
    # Sorted so the sample shown is the same on every run.
    chars_to_show = sorted(char for char in removed_chars if not char.isspace())

    if chars_to_show:
        sample_chars = chars_to_show[:3]
        print(f"Baris ke-{index}: Dihapus karakter seperti -> {', '.join(sample_chars)}")
    else:
        print(f"Baris ke-{index}: Dihapus spasi/baris baru berlebih")

    preview_count += 1

# Write the cleaned column back only after the preview has read the old values.
df['text'] = text_after

print("\n✅ Proses pembersihan seluruh data selesai.")
print("\nDataFrame setelah dibersihkan:")
print(df)

Memulai pembersihan... Menampilkan maksimal 5 baris pertama yang berubah:

Baris ke-1: Dihapus spasi/baris baru berlebih
Baris ke-3: Dihapus karakter seperti -> !
Baris ke-7: Dihapus spasi/baris baru berlebih
Baris ke-8: Dihapus spasi/baris baru berlebih
Baris ke-9: Dihapus karakter seperti -> !

✅ Proses pembersihan seluruh data selesai.

DataFrame setelah dibersihkan:
                                                   text  \
0     OBH Surya Itrasal 100 ml - Obat Batuk Hitam In...   
1     OBH Combi Batuk Berdahak Menthol 100Ml OBH Com...   
2     DEGIROL LOZ STRIP 10 TABLET / OBAT RADANG MULU...   
3     ENT Clear powder Packet (20 sachet) Produk ori...   
4     Degirol Tablet Hisap (10 Strip/1 Box) DEGIROL ...   
...                                                 ...   
9379  Natur-E White 16 Kapsul - Vitamin Kulit & Kuku...   
9380  Oxyvit ISI 30 - Vitamin C++ USA Grade 500mg, V...   
9381  ALLWISE VITAMIN D3 K2 IU 5000 isi 120 Capsule ...   
9382  Vitacimin C (2pcs/1strip) FREE 

In [8]:
# Remove rows with duplicate text, keeping the first occurrence of each, and
# report the row count before and after.
before = df.shape[0]
df = df.drop_duplicates(subset="text", keep="first")
after = df.shape[0]

print(f"Jumlah data sebelum: {before}, sesudah: {after}")

Jumlah data sebelum: 9384, sesudah: 6605


In [9]:
# Drop rows that carry no usable text. Besides real NaN values and the
# literal string "nan" (what str() produces for a NaN), this also removes
# empty or whitespace-only text: concatenating with na_rep="" turns rows with
# missing name and description into blank strings, which the NaN checks alone
# would never catch.
text_stripped = df['text'].astype(str).str.strip()
has_text = df['text'].notna() & (text_stripped != '') & (text_stripped.str.lower() != 'nan')
# .copy() makes the filtered frame independent, so later column assignments
# do not act on a slice of the previous frame.
df = df[has_text].copy()

# df.to_csv("data_cleaned_test_noduplikat.csv", index=False, encoding='utf-8-sig', sep=';')
# print("File data_cleaned_test_noduplikat.csv berhasil disimpan")

print(f"total data: {len(df)}")

total data: 6605


In [10]:
# Turn every review-period column into integer counts: missing cells and
# values that cannot be parsed as numbers both end up as 0.
for col in review_period_columns:
    numeric = pd.to_numeric(df[col].fillna(0), errors='coerce')
    df[col] = numeric.fillna(0).astype(int)

In [12]:
# Write the frame to a semicolon-separated CSV without the index column.
# The relative path is resolved against the kernel's working directory.
df.to_csv("../dataset/preprocessed_indobert_data_test.csv", index=False, sep=';')