# Import Libraries & Setup

In [1]:
import os

import numpy as np
import pandas as pd
from google_play_scraper import Sort, reviews

# Scrape settings -- everything a reader might want to tune lives here.
APP_ID = "com.shopee.id"  # app identifier handed to reviews() as its first argument
LANG = "id"               # forwarded as reviews(lang=...)
COUNTRY = "id"            # forwarded as reviews(country=...)
COUNT = 20_000            # number of reviews requested via reviews(count=...)

# Scraping
Fetch the newest reviews first (`Sort.NEWEST`) so that the data reflects the most recent app updates.

In [2]:
print(f"Starting to scrape {COUNT} reviews for {APP_ID}...")

# reviews() hands back a pair: the list of review records and a continuation token.
# Only the list is used by the cells visible below; the token is simply kept around.
result, continuation_token = reviews(
    APP_ID,
    count=COUNT,
    sort=Sort.NEWEST,
    lang=LANG,
    country=COUNTRY,
    filter_score_with=None,  # no star filter: keep every rating (1-5 stars)
)

print(f"Successfully retrieved {len(result)} reviews.")

Starting to scrape 20000 reviews for com.shopee.id...
Successfully retrieved 20000 reviews.


# Transform to DataFrame
The scraped data is a list of Python dictionaries (one JSON-like record per review). It is converted to a table (DataFrame) so that it can be processed with pandas.

In [3]:
# Build a table from the scraped records: one row per review, one column per key.
df = pd.DataFrame(data=result)

# Preview the first five rows (rich display, since this is the cell's last expression).
df.head(5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,805f043e-42fc-4cfb-ae27-f13661084723,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,padahal aku nyamaaan banget belaja disini tapi...,2,0,3.62.40,2025-11-27 19:34:34,"Hi kak Wan Rubaiyah, maaf ya sudah buat kmu ga...",2025-11-27 21:45:53,3.62.40
1,60189ea2-d603-43e0-81e4-415ff621dce2,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,bagus bangat,5,0,3.62.38,2025-11-27 19:34:17,"Hi kak Jejen Suhaimi, makasih banget yaa buat ...",2025-11-27 21:42:28,3.62.38
2,4a9d6bf6-dfcd-4b3b-8bd5-88705985fd61,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"semua oke sih, cuma saat saya mau ngasih ulasa...",5,0,,2025-11-27 19:34:10,"Hi kak Zukatoji Kun, maaf ya sudah buat kmu ga...",2025-11-27 21:45:15,
3,3c985e0f-60a8-403e-823d-b28794d6b579,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mudah simpel cepat,5,0,,2025-11-27 19:34:05,"Hi kak akung sakiena sakiena, makasih banget y...",2025-11-27 21:40:02,
4,6928cd79-bb45-42dc-964c-c0e305a262b8,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"maaf untuk sementara saya kasih bintang 1, kar...",1,0,3.62.38,2025-11-27 19:33:30,"Hi there, thank you for the 5 star review, sta...",2024-10-07 21:10:22,3.62.38


# Column Filter & Data Preview
Select the relevant columns:
- 'content': Review text
- 'score': Star rating
- 'at': Review date

In [4]:
# Keep only the three columns used from here on and give them
# self-explanatory names; rename() returns a new DataFrame.
df_filtered = df[['content', 'score', 'at']].rename(columns={
    'content': 'review_text',
    'score': 'rating',
    'at': 'review_date',
})

# Check data types and missing values.
# info() writes its report straight to stdout and returns None, so it is called
# bare: wrapping it in print() appends a stray "None" line to the cell output.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_text  20000 non-null  object        
 1   rating       20000 non-null  int64         
 2   review_date  20000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 468.9+ KB
None


# Save to CSV (Raw Data)

In [5]:
# Destination of the raw export, relative to the notebook's working directory.
output_path = '../data/raw/shopee_reviews_raw.csv'

# Make sure the target folder exists before writing.
# os.path.dirname() is '' for a bare filename and os.makedirs('') raises,
# so a directory is only created when the path actually contains one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# save to .csv without the index column
df_filtered.to_csv(output_path, index=False)

print(f"Data saved to: {output_path}")

Data saved to: ../data/raw/shopee_reviews_raw.csv
