[link text](https://)

# **Scraping data dari Google Maps**

In [None]:
# Scraping data dari ulasan google maps

import json
import os
from urllib.parse import urlsplit, parse_qsl

import pandas as pd
from serpapi import GoogleSearch

# Download every review page of one Google Maps place through SerpApi, then
# print the collected reviews as JSON and save them to data.csv.

# NOTE(review): the literal fallback below is a credential committed to source.
# Rotate it and supply the key only through the SERPAPI_API_KEY environment
# variable; the fallback is kept solely so existing runs keep working.
params = {
    "api_key": os.environ.get(
        "SERPAPI_API_KEY",
        "d12ad9e8a239f30e500a1c2bb18c5256fa13e5966a17c3d39f66ccfadd830085",
    ),
    "engine": "google_maps_reviews",  # SerpApi search engine
    "hl": "id",  # language of the search
    # data id found in the Google Maps place URL, in the `data=` query parameter
    "data_id": "0x2e7a584c3f999547:0xf5f81e9fab2e1dfb",
}

search = GoogleSearch(params)

reviews = []

page_num = 0
while True:
    page_num += 1
    results = search.get_dict()
    if "error" in results:
        print(f"Error dari API: {results['error']}")
        break
    print(f"Extracting reviews from {page_num} page.")

    # An absent "reviews" key yields no rows for this page.
    for result in results.get("reviews", []):
        # Tolerate reviews without a "user" object instead of raising.
        user = result.get("user") or {}
        reviews.append({
            "page": page_num,
            "name": user.get("name"),
            "link": user.get("link"),
            "thumbnail": user.get("thumbnail"),
            "rating": result.get("rating"),
            "date": result.get("date"),
            "snippet": result.get("snippet"),
            "images": result.get("images"),
            "local_guide": user.get("local_guide"),
        })

    # Stop when the response carries no link/token for a following page.
    pagination = results.get("serpapi_pagination") or {}
    if not (pagination.get("next") and pagination.get("next_page_token")):
        break

    # Copy the query parameters of the "next" URL into the search so the
    # following get_dict() call requests the next page.
    search.params_dict.update(dict(parse_qsl(urlsplit(pagination["next"]).query)))


print(json.dumps(reviews, indent=2, ensure_ascii=False))
df = pd.DataFrame(reviews)
df.to_csv("data.csv", index=False)

# **import library**

In [None]:
# Import library yang digunakan

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# **load data**

In [None]:
# Read the data from the CSV file into a DataFrame

df = pd.read_csv('data.csv')

# Evaluate the (rows, columns) shape of the loaded DataFrame.
df.shape

In [None]:
# Count the missing values in each column

df.isnull().sum()

In [None]:
# Print a summary of the DataFrame

df.info()

# **clean text**

In [None]:
# Build the stop-word set used by clean_text

# Download the NLTK stop-word corpus, then keep the Indonesian list as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

def clean_text(text: str) -> str:
    """Normalize a review snippet before vectorization.

    Lowercases the text, replaces every character that is neither a word
    character nor whitespace with a space, and drops the tokens found in the
    module-level ``stop_words`` set.

    Args:
        text: The raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Substitute a space (not an empty string) so that words separated only by
    # punctuation, e.g. "bagus,bersih", remain two separate tokens.
    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Blank out missing snippets first: astype(str) alone would turn NaN into the
# literal word "nan", which would then be counted as a real term.
df['clean_snippet'] = df['snippet'].fillna('').astype(str).apply(clean_text)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

# **TF-IDF**

In [None]:
# TF-IDF: vectorize the cleaned snippets, keeping at most 300 features

tfidf = TfidfVectorizer(max_features=300)
tfidf_matrix = tfidf.fit_transform(df['clean_snippet'])
# Convert the fitted matrix to a dense array for the later DataFrame step.
X_text = tfidf_matrix.toarray()

In [None]:
# Combine the TF-IDF features with the rating

# Start from the text features, one column per TF-IDF term.
X = pd.DataFrame(X_text)

# Min-max scale the rating and append it as one more feature column.
scaler = MinMaxScaler()
rating_scaled = scaler.fit_transform(df[['rating']])
X['rating'] = rating_scaled

In [None]:
# Convert the column labels (not the data) to strings: X mixes the integer
# TF-IDF column labels with the string label 'rating'.
# NOTE(review): presumably needed because scikit-learn expects uniform string
# feature names - confirm.

X.columns = X.columns.astype(str)

# **mencari K dengan Elbow**

In [None]:
# Choose K (number of clusters) with the elbow method

# Fit K-Means for K = 1..10 and record the within-cluster sum of squares.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Draw on a dedicated figure and show it explicitly, like the other plot
# cells, so the curve never lands on a previously opened figure.
plt.figure()
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# **mencari K dengan Silhouette**

In [None]:
# Choose K (number of clusters) with the silhouette score

scores = []
K = range(2, 11)

for k in K:
    # n_init is set explicitly (matching the elbow cell) so the result does
    # not depend on the installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    scores.append(score)
    print(f"Jumlah Klaster = {k}, Silhouette Score = {score:.4f}")

# Plot the score for every candidate K.
plt.figure(figsize=(8,5))
plt.plot(K, scores, marker='o')
plt.title('Silhouette Score untuk berbagai jumlah klaster')
plt.xlabel('Jumlah Klaster (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

# **hitung KMeans**

In [None]:
# Fit K-Means with K=2 and store each review's cluster label

k = 2
# n_init is set explicitly (matching the elbow cell) so the clustering does
# not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

# **Visualisasi**

In [None]:
# PCA visualisation: project the features onto two components and plot them

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
# Store both components next to the cluster labels for plotting.
df['pca1'], df['pca2'] = X_pca[:, 0], X_pca[:, 1]

# Scatter the reviews in PCA space, coloured by cluster label.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='pca1', y='pca2', hue='cluster', palette='Set1', data=df)
plt.title('Visualisasi Klaster (PCA)')
plt.show()