In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
# Load the preprocessed review CSV into the notebook-wide DataFrame.
DATA_PATH = "../output/kuliner_preprocessed.csv"
df = pd.read_csv(DATA_PATH)

# Sanity check: print the dimensions, then preview the first rows.
print(df.shape)
df.head()


(45, 3)


Unnamed: 0,restaurant,area,clean_review
0,Ayam Goreng Jawa Mbah Cemplung,Jogja,ayam goreng jawa mbah cemplung kenal rasa khas...
1,Bale Raos - The Sultan's Dishes,Jogja,pertama kali cobain sini tmptnya luas banget s...
2,Boyong Resto,Jogja,tempat bagus bgttt makan minum juara parkir lu...
3,Gudeg Bu Djuminten,Jogja,best gudeg for me i m not a local and i always...
4,Gudeg Sagan,Jogja,cita rasa gudeg gudeg terlalu manis karna gude...


In [12]:
# TfidfVectorizer is already imported in the first cell; no re-import needed here.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams
    min_df=2,             # drop terms that appear in fewer than 2 documents
    max_df=0.9            # drop terms that appear in more than 90% of documents
)

# An empty review is read back from CSV as NaN, which TfidfVectorizer rejects
# as an invalid document; treat such rows as empty documents instead.
tfidf_matrix = vectorizer.fit_transform(df["clean_review"].fillna(""))

# (n_documents, n_terms) of the fitted TF-IDF matrix.
tfidf_matrix.shape


(45, 16736)

In [13]:
# Terms (unigrams and bigrams) kept by the fitted vectorizer.
features = vectorizer.get_feature_names_out()

# Show up to ten vocabulary terms that contain the phrase "nasi goreng".
matching_terms = [term for term in features if "nasi goreng" in term]
matching_terms[:10]


['nasi goreng']

In [14]:
# cosine_similarity is already imported in the first cell; no re-import needed here.
# Score every row of the TF-IDF matrix against one sample query.
query = "nasi goreng"
query_vec = vectorizer.transform([query])

# The result has one row per query; row 0 holds the score of each document.
scores = cosine_similarity(query_vec, tfidf_matrix)[0]

# Stored on df because the next cell ranks restaurants by this column.
df["cosine_score"] = scores


In [15]:
# Ten best-scoring restaurants for the sample query, highest score first.
(
    df.loc[:, ["restaurant", "area", "cosine_score"]]
    .sort_values(by="cosine_score", ascending=False)
    .head(10)
)


Unnamed: 0,restaurant,area,cosine_score
25,Nasi Goreng Jack Kosem,Semarang,0.268956
44,Wapo Resto,Surabaya,0.24566
21,Kedai Amarta,Semarang,0.075821
40,Soto Ayam Pak Djayus,Surabaya,0.056283
22,Kedai Beringin,Semarang,0.041816
43,Sriwedari,Surabaya,0.039223
36,Mahameru Restaurant,Surabaya,0.034025
9,Pawon Tempuran,Jogja,0.034009
12,Toean Watiman,Jogja,0.029204
14,Waroeng Tengkleng Gadjah,Jogja,0.020608


In [16]:
def search_restaurant(query, area=None, top_n=5, min_score=None):
    """Rank restaurants by TF-IDF cosine similarity to a free-text query.

    Uses the notebook-level ``vectorizer``, ``tfidf_matrix`` and ``df``;
    row ``i`` of ``tfidf_matrix`` is scored and attached to row ``i`` of ``df``.

    Parameters
    ----------
    query : str
        Search text. It is lower-cased before being vectorized.
    area : str, optional
        When given (and non-empty), keep only restaurants whose ``area``
        equals this value, compared case-insensitively.
    top_n : int, default 5
        Maximum number of rows to return.
    min_score : float, optional
        When given, rows with a score below this value are dropped, so weak
        or zero matches are not reported as results. ``None`` (the default)
        applies no threshold.

    Returns
    -------
    pandas.DataFrame
        Columns ``restaurant``, ``area`` and ``score``, sorted by ``score``
        in descending order and keeping the index labels of ``df``.
    """
    query_vec = vectorizer.transform([query.lower()])
    similarities = cosine_similarity(query_vec, tfidf_matrix)[0]

    # Only the returned columns are needed; this avoids copying the full
    # frame (including the review text) on every call.
    df_result = df[["restaurant", "area"]].assign(score=similarities)

    if area:
        df_result = df_result[df_result["area"].str.lower() == area.lower()]

    if min_score is not None:
        df_result = df_result[df_result["score"] >= min_score]

    # Stable sort: restaurants with equal scores keep their dataset order,
    # so repeated calls return ties in a deterministic order.
    df_result = df_result.sort_values("score", ascending=False, kind="mergesort")

    return df_result.head(top_n)


In [18]:
# Example: five best matches for "ayam goreng" among restaurants in Semarang.
search_restaurant("ayam goreng", area="Semarang", top_n=5)


Unnamed: 0,restaurant,area,score
17,Ayam Goreng & Sop Buntut Daging Pak Supar,Semarang,0.397738
26,Pesta Kebeon,Semarang,0.020606
22,Kedai Beringin,Semarang,0.017379
16,Asem-Asem Koh Liem,Semarang,0.016971
23,Nasi Ayam Bu Pini,Semarang,0.015109
