In [1]:
import pandas as pd
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


In [2]:
pip install Sastrawi


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Read the merged review CSV from the output directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../output/kuliner_merged.csv")

# Report (rows, columns), then preview the first five rows.
print((len(df.index), len(df.columns)))
df.head(5)


(45, 4)


Unnamed: 0,restaurant,area,review,sample_reviews
0,Ayam Goreng Jawa Mbah Cemplung,Jogja,Ayam Goreng Jawa Mbah Cemplung terkenal dengan...,"[{""reviewer"": ""Keegantov Antonov"", ""text"": ""Ay..."
1,Bale Raos - The Sultan's Dishes,Jogja,"Pertama kali cobain ke sini, tmptnya luas bang...","[{""reviewer"": ""VD"", ""text"": ""Pertama kali coba..."
2,Boyong Resto,Jogja,"tempatnya bagus bgttt, makanan dan minumannya ...","[{""reviewer"": ""Jiran Utami Trisnawati"", ""text""..."
3,Gudeg Bu Djuminten,Jogja,"Best gudeg for me (I'm not a local, and I alwa...","[{""reviewer"": ""Anjani"", ""text"": ""Best gudeg fo..."
4,Gudeg Sagan,Jogja,"Untuk cita rasa gudeg, gudeg ini menurut saya ...","[{""reviewer"": ""Y. J Sarah"", ""text"": ""Untuk cit..."


In [4]:
# Indonesian stopwords from Sastrawi, stored as a set for fast membership tests
stopword_factory = StopWordRemoverFactory()
stopwords = {*stopword_factory.get_stop_words()}

# Indonesian stemmer built through Sastrawi's factory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()


In [5]:
def preprocess_text(text):
    """Normalize one review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, blank out URLs, blank out every
    character that is not ``a-z`` or whitespace, split on whitespace, drop
    tokens present in the module-level ``stopwords`` set, and stem each
    remaining token with the module-level ``stemmer``.

    Args:
        text: The raw review text. Any value that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as an empty review.

    Returns:
        The surviving stemmed tokens joined by single spaces, or ``""`` when
        no token survives.
    """
    # Non-string values have no .lower(); treat them as "no text" instead of
    # raising, so callers do not have to stringify missing values first.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # remove URLs
    text = re.sub(r"http\S+|www\S+", " ", text)

    # remove digits, punctuation and every other character outside a-z
    text = re.sub(r"[^a-z\s]", " ", text)

    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no separate squeeze/strip step
    # is needed before tokenizing.
    tokens = [t for t in text.split() if t not in stopwords]

    # stemming
    return " ".join(stemmer.stem(t) for t in tokens)


In [6]:
# Replace missing reviews with "" before casting: astype(str) on its own turns
# NaN into the literal text "nan", which would survive cleaning as a token.
df["clean_review"] = df["review"].fillna("").astype(str).apply(preprocess_text)

# Side-by-side preview of the raw and cleaned text.
df[["review", "clean_review"]].head()


Unnamed: 0,review,clean_review
0,Ayam Goreng Jawa Mbah Cemplung terkenal dengan...,ayam goreng jawa mbah cemplung kenal rasa khas...
1,"Pertama kali cobain ke sini, tmptnya luas bang...",pertama kali cobain sini tmptnya luas banget s...
2,"tempatnya bagus bgttt, makanan dan minumannya ...",tempat bagus bgttt makan minum juara parkir lu...
3,"Best gudeg for me (I'm not a local, and I alwa...",best gudeg for me i m not a local and i always...
4,"Untuk cita rasa gudeg, gudeg ini menurut saya ...",cita rasa gudeg gudeg terlalu manis karna gude...


In [7]:
# Keep only the identifying columns and the cleaned text.
df_final = df.loc[:, ["restaurant", "area", "clean_review"]]

# Write the result without the DataFrame index.
df_final.to_csv(path_or_buf="../output/kuliner_preprocessed.csv", index=False)

print("Preprocessing selesai.")
print("Total dokumen:", df_final.shape[0])


Preprocessing selesai.
Total dokumen: 45
