# Scraping Data Ulasan Mobile Legends

## Import Library yang dibutuhkan

In [1]:
# Mengimpor pustaka google_play_scraper untuk mengakses ulasan dan informasi aplikasi dari Google Play Store.
from google_play_scraper import app, reviews, Sort
import time
import pandas as pd  # Pandas untuk manipulasi dan analisis data
pd.options.mode.chained_assignment = None  # Menonaktifkan peringatan chaining

## Scraping Data

In [3]:
# Collect up to `total_target` reviews for the app 'com.mobile.legends' from the
# Google Play Store, requesting `batch_size` reviews per call.
# Scraping may take a while depending on how many reviews are requested.
all_reviews = []
continuation_token = None  # Pagination state handed back by the previous request
total_target = 15000  # Number of reviews to collect
batch_size = 200      # Reviews requested per call

print("Mulai scraping...")

while len(all_reviews) < total_target:
    print(f"Batch {len(all_reviews)//batch_size + 1} - Total: {len(all_reviews)} ulasan")

    result, continuation_token = reviews(
        app_id='com.mobile.legends',    # Application ID on Google Play
        lang='en',                      # English-language reviews
        country='us',                   # US storefront
        sort=Sort.MOST_RELEVANT,        # Order reviews by relevance
        count=batch_size,               # Reviews requested in this batch
        continuation_token=continuation_token,  # Resume where the previous batch ended
        filter_score_with=None          # None keeps reviews of every star rating
    )

    all_reviews.extend(result)

    # An empty page means no progress was made, so looping again could never
    # reach the target. NOTE(review): google_play_scraper appears to return a
    # token object (not None) even when the feed is exhausted, so the None
    # check alone would leave this loop spinning forever - confirm against the
    # installed library version.
    if not result or continuation_token is None:
        print("Tidak ada ulasan lagi yang bisa diambil.")
        break

    time.sleep(1)  # Throttle so requests are not fired back-to-back

# Build the DataFrame, capped at `total_target` rows in case the last batch
# overshot the target, and persist the raw scrape to CSV.
df = pd.DataFrame(all_reviews[:total_target])
df.to_csv('ulasan_mobile_legends_15k.csv', index=False, encoding='utf-8-sig')
print(f"Scraping selesai! Total ulasan yang didapat: {len(df)}")


Mulai scraping...
Batch 1 - Total: 0 ulasan
Batch 2 - Total: 200 ulasan
Batch 3 - Total: 400 ulasan
Batch 4 - Total: 600 ulasan
Batch 5 - Total: 800 ulasan
Batch 6 - Total: 1000 ulasan
Batch 7 - Total: 1200 ulasan
Batch 8 - Total: 1400 ulasan
Batch 9 - Total: 1600 ulasan
Batch 10 - Total: 1800 ulasan
Batch 11 - Total: 2000 ulasan
Batch 12 - Total: 2200 ulasan
Batch 13 - Total: 2400 ulasan
Batch 14 - Total: 2600 ulasan
Batch 15 - Total: 2800 ulasan
Batch 16 - Total: 3000 ulasan
Batch 17 - Total: 3200 ulasan
Batch 18 - Total: 3400 ulasan
Batch 19 - Total: 3600 ulasan
Batch 20 - Total: 3800 ulasan
Batch 21 - Total: 4000 ulasan
Batch 22 - Total: 4200 ulasan
Batch 23 - Total: 4400 ulasan
Batch 24 - Total: 4600 ulasan
Batch 25 - Total: 4800 ulasan
Batch 26 - Total: 5000 ulasan
Batch 27 - Total: 5200 ulasan
Batch 28 - Total: 5400 ulasan
Batch 29 - Total: 5600 ulasan
Batch 30 - Total: 5800 ulasan
Batch 31 - Total: 6000 ulasan
Batch 32 - Total: 6200 ulasan
Batch 33 - Total: 6400 ulasan
Batch 34

In [4]:
# Summarize the scraped DataFrame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              15000 non-null  object        
 1   userName              15000 non-null  object        
 2   userImage             15000 non-null  object        
 3   content               15000 non-null  object        
 4   score                 15000 non-null  int64         
 5   thumbsUpCount         15000 non-null  int64         
 6   reviewCreatedVersion  12722 non-null  object        
 7   at                    15000 non-null  datetime64[ns]
 8   replyContent          353 non-null    object        
 9   repliedAt             353 non-null    datetime64[ns]
 10  appVersion            12722 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 1.3+ MB


In [5]:
# Preview the first rows of the scraped reviews.
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,7c6c4516-bb9a-4106-a518-ebdceb36dade,Phillip Fiorentino,https://play-lh.googleusercontent.com/a-/ALV-U...,Basically a better and more intuitive version ...,2,5,1.9.64.10601,2025-04-16 11:10:47,,NaT,1.9.64.10601
1,fd69039f-0df1-4fd1-9a7c-efaae32c7f1d,DarkPot8o,https://play-lh.googleusercontent.com/a-/ALV-U...,I like the game. It's programmed well and bugs...,2,214,1.9.64.10601,2025-04-08 21:10:01,,NaT,1.9.64.10601
2,05575c33-897d-4338-b874-c83ac2238e72,Julianna Vang,https://play-lh.googleusercontent.com/a-/ALV-U...,FIX MATCHMAKING. I cannot stand the one-sided ...,1,225,1.9.64.10601,2025-03-26 11:51:50,,NaT,1.9.64.10601
3,fe0aeedd-8fc7-45ac-9ce1-18cb3860f57d,Usoro A.,https://play-lh.googleusercontent.com/a-/ALV-U...,The old Legend skins are completely outdated a...,2,1143,1.9.64.10601,2025-04-01 14:11:28,,NaT,1.9.64.10601
4,8d7bc23f-d7b4-45b0-9809-ade1446d4f71,Joey San Miguel,https://play-lh.googleusercontent.com/a-/ALV-U...,"During matches, I consistently encounter a ""re...",1,1156,1.9.47.10372,2025-04-11 11:12:36,,NaT,1.9.47.10372


In [6]:
# Discard every scraped field that is not needed downstream, leaving only the
# review text ('content') and its star rating ('score').
df = df.drop(
    [
        'reviewId',
        'userName',
        'userImage',
        'replyContent',
        'repliedAt',
        'reviewCreatedVersion',
        'appVersion',
        'thumbsUpCount',
        'at',
    ],
    axis='columns',
)

In [7]:
# Preview the first rows after the unused columns were dropped.
df.head()

Unnamed: 0,content,score
0,Basically a better and more intuitive version ...,2
1,I like the game. It's programmed well and bugs...,2
2,FIX MATCHMAKING. I cannot stand the one-sided ...,1
3,The old Legend skins are completely outdated a...,2
4,"During matches, I consistently encounter a ""re...",1


In [8]:
# Confirm the remaining columns, their non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  15000 non-null  object
 1   score    15000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 234.5+ KB


In [9]:
import os

# Create the 'data' output directory when it does not exist yet
# (a no-op if it is already there).
os.makedirs(name='data', exist_ok=True)

# Write the trimmed dataset to CSV without the index column; 'utf-8-sig'
# prepends a BOM to the file.
df.to_csv(path_or_buf='data/data.csv', encoding='utf-8-sig', index=False)
print("Kolom yang tidak diperlukan sudah di-drop! Dataset disimpan sebagai 'data/data.csv'")

Kolom yang tidak diperlukan sudah di-drop! Dataset disimpan sebagai 'data/data.csv'
