In [2]:
import pandas as pd
import os

# Root folder of the raw data; every immediate sub-directory is treated as one area.
BASE_DIR = "../data_raw"
# Upper bound on the number of reviews kept per restaurant file.
MAX_REVIEW = 100
# Reviews with fewer whitespace-separated words than this are discarded.
MIN_WORDS = 5


In [3]:
# Every sub-directory of BASE_DIR is one area. os.listdir() returns entries in
# arbitrary, filesystem-dependent order, so sort to keep the run reproducible.
areas = sorted(
    entry for entry in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, entry))
)

print("Area ditemukan:", areas)


Area ditemukan: ['Jogja', 'Semarang', 'Surabaya']


In [4]:
import json

# Build one record per restaurant CSV: the area it belongs to, its 5-star
# reviews concatenated into a single string, and the same reviews as a JSON list.
records = []

for area in areas:
    area_path = os.path.join(BASE_DIR, area)

    # Sorted so the order of the resulting records does not depend on the filesystem.
    for file_name in sorted(os.listdir(area_path)):
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(area_path, file_name)

        # Best effort: one unreadable file is reported and skipped, not fatal.
        try:
            raw = pd.read_csv(file_path)
        except Exception as e:
            print(f"Gagal baca {file_name}: {e}")
            continue

        required_cols = {'text', 'stars'}
        if not required_cols.issubset(raw.columns):
            print(f"Kolom tidak sesuai di {file_name}")
            continue

        # Keep 5-star rows only. Rows with missing text are dropped *before* the
        # string cast so NaN never becomes the literal review "nan"; .assign()
        # returns a new frame, which avoids pandas' SettingWithCopyWarning.
        reviews = (
            raw.loc[raw['stars'] == 5]
            .dropna(subset=['text'])
            .assign(text=lambda frame: frame['text'].astype(str))
        )
        # Discard reviews that are too short to be informative.
        reviews = reviews.loc[reviews['text'].str.split().str.len() >= MIN_WORDS]

        if reviews.empty:
            continue

        sample = reviews.head(MAX_REVIEW)

        combined_review = " ".join(sample['text'])

        # A missing reviewer name falls back to "Anonim"; a raw NaN would be
        # serialised by json.dumps as the bare token NaN, which is not valid JSON.
        if "name" in sample.columns:
            reviewers = sample["name"].fillna("Anonim").astype(str)
        else:
            reviewers = ["Anonim"] * len(sample)

        sample_reviews = [
            {"reviewer": reviewer, "text": text, "stars": int(stars)}
            for reviewer, text, stars in zip(reviewers, sample["text"], sample["stars"])
        ]

        records.append({
            # splitext strips only the extension; str.replace would also remove
            # any ".csv" occurring in the middle of the restaurant name.
            "restaurant": os.path.splitext(file_name)[0],
            "area": area,
            "review": combined_review,
            "sample_reviews": json.dumps(sample_reviews, ensure_ascii=False)
        })

In [5]:
# Pin the column layout explicitly so the frame (and any CSV written from it)
# keeps the same header even when no restaurant produced a record.
df_merged = pd.DataFrame(
    records,
    columns=["restaurant", "area", "review", "sample_reviews"],
)

print("Total restoran:", len(df_merged))
df_merged.head()


Total restoran: 45


Unnamed: 0,restaurant,area,review,sample_reviews
0,Ayam Goreng Jawa Mbah Cemplung,Jogja,Ayam Goreng Jawa Mbah Cemplung terkenal dengan...,"[{""reviewer"": ""Keegantov Antonov"", ""text"": ""Ay..."
1,Bale Raos - The Sultan's Dishes,Jogja,"Pertama kali cobain ke sini, tmptnya luas bang...","[{""reviewer"": ""VD"", ""text"": ""Pertama kali coba..."
2,Boyong Resto,Jogja,"tempatnya bagus bgttt, makanan dan minumannya ...","[{""reviewer"": ""Jiran Utami Trisnawati"", ""text""..."
3,Gudeg Bu Djuminten,Jogja,"Best gudeg for me (I'm not a local, and I alwa...","[{""reviewer"": ""Anjani"", ""text"": ""Best gudeg fo..."
4,Gudeg Sagan,Jogja,"Untuk cita rasa gudeg, gudeg ini menurut saya ...","[{""reviewer"": ""Y. J Sarah"", ""text"": ""Untuk cit..."


In [6]:
# Make sure the destination folder exists; re-running the cell is harmless.
os.makedirs(name="../output", exist_ok=True)

# Write the merged table without the positional index column.
df_merged.to_csv(path_or_buf="../output/kuliner_merged.csv", index=False)

print("Berhasil disimpan ke output/kuliner_merged.csv")


Berhasil disimpan ke output/kuliner_merged.csv
