In [None]:
!pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
import google_play_scraper

In [None]:
# Identifier of the Google Play app whose reviews are scraped below
app_id = 'blibli.mobile.commerce'

In [None]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [None]:
# Upper bound on the number of reviews requested in a single fetch
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """Request one page of reviews and return ``(review_items, next_token)``.

    Args:
        url: Reviews endpoint URL.
        app_id: Identifier of the app to query.
        sort: Numeric sort value placed in the request body.
        count: Number of reviews to ask for in this request.
        filter_score_with: Score filter, or None (sent as the literal "null").
        filter_device_with: Device filter, or None (sent as the literal "null").
        pagination_token: Token of the page to fetch, or None.

    Returns:
        A 2-tuple: element ``[0]`` of the decoded payload (the review items)
        and element ``[-2][-1]`` of the same payload (the next pagination token).

    Raises:
        IndexError, TypeError: when the response does not have the expected
            shape; ``reviews`` catches both.
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # Decode the embedded JSON payload once; both return values come from it.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: int = None,
    filter_device_with: int = None,
    continuation_token: _ContinuationToken = None,
    max_retries: int = 3,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews for ``app_id``, optionally resuming a previous call.

    Args:
        app_id: Identifier of the app to query.
        lang: Review language code.
        country: Country code.
        sort: Sort order (a ``Sort`` member; its ``.value`` is sent).
        count: Number of reviews wanted from this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by an earlier call. When given, its
            stored lang/country/sort/count/filters override the arguments above.
        max_retries: How many consecutive malformed responses (TypeError /
            IndexError while parsing) are retried before giving up.

    Returns:
        ``(reviews, continuation_token)``. The returned token's ``token``
        attribute is None when no further page is available.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            # Pagination already exhausted: nothing more to fetch.
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    _fetch_count = count

    result = []
    consecutive_failures = 0

    while True:
        # <= 0 also stops the loop if a page returned more items than requested.
        if _fetch_count <= 0:
            break

        if _fetch_count > MAX_COUNT_EACH_FETCH:
            _fetch_count = MAX_COUNT_EACH_FETCH

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The response could not be parsed. `token` was not reassigned, so a
            # retry asks for the same page again. This must not read
            # `continuation_token.token`: continuation_token is None on a first call.
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Still unparsable after the retries: report the end of pagination
                # instead of looping forever.
                token = None
                break
            continue

        consecutive_failures = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        _fetch_count = count - len(result)

        # A list in place of the token marks the last page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every available review for ``app_id`` by paging through ``reviews``.

    ``count`` and ``continuation_token`` are managed internally, so any values
    supplied for them in ``kwargs`` are discarded. When ``sleep_milliseconds``
    is non-zero the function pauses that long between consecutive pages.
    """
    for managed_key in ("count", "continuation_token"):
        kwargs.pop(managed_key, None)

    collected = []
    page_token = None

    while True:
        page, page_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=page_token,
            **kwargs
        )
        collected.extend(page)

        # A token of None means there is no next page.
        if page_token.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [None]:
# Target number of reviews for the scraping loop
reviews_count = 10000

In [None]:
result = []
continuation_token = None

# Page through the reviews until reviews_count have been collected or no more are returned.
with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='id',  # review language
            country='id',  # country to scrape
            sort=Sort.NEWEST,
            filter_score_with=None,
            count=MAX_COUNT_EACH_FETCH,  # one full page per call
        )
        if not new_result:
            break
        # Pages arrive in fixed-size batches; trim the last one so the total
        # stops at exactly reviews_count and the progress bar does not overflow.
        batch = new_result[: reviews_count - len(result)]
        result.extend(batch)
        pbar.update(len(batch))

10149it [00:09, 1047.31it/s]


In [None]:
import numpy as np  # no longer needed by this cell; kept in case other cells rely on `np`

# Each scraped review is a dict, so pandas can expand the list of dicts
# straight into one column per key.
df = pd.DataFrame(result)

df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,6be275d9-9d29-42d3-a807-4ba33a618289,Suwarno Suwarno,https://play-lh.googleusercontent.com/a/ACg8oc...,"Aplikasi mudah di hack, saya kena dgn pembelia...",1,0,11.2.1,2024-06-14 15:44:33,Halo kak Suwarno mohon maaf terkait Kendalanya...,2024-06-14 15:51:09,11.2.1
1,f80086ea-6342-4d73-91d3-5a6f011b478b,WAHYU AMIJAYA,https://play-lh.googleusercontent.com/a-/ALV-U...,top,5,0,11.2.1,2024-06-14 14:50:18,Halo kak Wahyu mantap! Makasih banyak buat rev...,2024-06-14 15:25:24,11.2.1
2,1e28c3f9-8a4a-4d70-b6c2-64f45b124a57,sri agung,https://play-lh.googleusercontent.com/a-/ALV-U...,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5,123,11.2.1,2024-06-14 13:56:20,Yeay mantul banget kan kak sri belanja di Blib...,2021-01-08 16:39:12,11.2.1
3,8efe7503-235f-441c-bd5e-d67da2cfa30c,Setio Nggoro Wati,https://play-lh.googleusercontent.com/a-/ALV-U...,is the best,5,0,11.2.1,2024-06-14 12:44:25,Halo kak Setio Nggoro Wati mantap! Makasih ban...,2024-06-14 14:58:57,11.2.1
4,7e2a2f07-0db7-4c07-b625-bc504e13fd66,Prapti Lestari,https://play-lh.googleusercontent.com/a-/ALV-U...,Paling favorit untuk belanja elektronik online...,5,0,7.6.6,2024-06-14 12:34:29,Halo kak Prapti Lestari mantap! Makasih banyak...,2024-06-14 15:01:02,7.6.6


In [None]:
# Number of rows (scraped reviews) in the frame
len(df)

10149

In [None]:
# The scrape returns many columns; preview only userName, score, at and content.
df[['userName', 'score','at', 'content']].head()

Unnamed: 0,userName,score,at,content
0,Suwarno Suwarno,1,2024-06-14 15:44:33,"Aplikasi mudah di hack, saya kena dgn pembelia..."
1,WAHYU AMIJAYA,5,2024-06-14 14:50:18,top
2,sri agung,5,2024-06-14 13:56:20,1 dekade bersama BLIBLI. Terpercaya dan semaki...
3,Setio Nggoro Wati,5,2024-06-14 12:44:25,is the best
4,Prapti Lestari,5,2024-06-14 12:34:29,Paling favorit untuk belanja elektronik online...


In [None]:
# Only the review text and its score are needed from here on.
# .copy() makes an independent frame, so adding columns during preprocessing
# does not raise pandas' SettingWithCopyWarning.
df = df[['content', 'score']].copy()
df.head()

Unnamed: 0,content,score
0,"Aplikasi mudah di hack, saya kena dgn pembelia...",1
1,top,5
2,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5
3,is the best,5
4,Paling favorit untuk belanja elektronik online...,5


In [None]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10149 entries, 0 to 10148
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  10149 non-null  object
 1   score    10149 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 158.7+ KB


In [None]:
# Element-wise mask of missing values
df.isna()

Unnamed: 0,content,score
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
10144,False,False
10145,False,False
10146,False,False
10147,False,False


In [None]:
# Summary statistics of the numeric column(s)
df.describe()

Unnamed: 0,score
count,10149.0
mean,3.922061
std,1.644359
min,1.0
25%,3.0
50%,5.0
75%,5.0
max,5.0


In [None]:
# Number of missing values per column
df.isnull().sum()

content    0
score      0
dtype: int64

In [None]:
df.to_csv("hasil-scrap-blibli.csv", index = False) # save the scraped reviews to CSV

# **PREPROCESSING**

Case Folding and Text Cleaning

In [None]:
import re
def clean_text(df, text_field, new_text_field_name):
    """Write a normalised copy of ``df[text_field]`` into ``df[new_text_field_name]``.

    The text is lowercased; @mentions, URLs, a leading "rt" and every character
    outside ``[0-9A-Za-z \\t]`` are replaced by a space; digits are deleted;
    runs of whitespace are collapsed and the ends are stripped.

    Noise is replaced by a space rather than deleted so that words separated
    only by punctuation or a newline ("ribet.dan") do not merge into one token.
    Missing values stay missing instead of raising.

    Args:
        df: DataFrame to update (modified in place).
        text_field: Name of the column holding the raw text.
        new_text_field_name: Name of the column that receives the cleaned text.

    Returns:
        The same ``df`` object, with the new column added or overwritten.
    """
    noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"

    cleaned = df[text_field].str.lower()
    cleaned = cleaned.str.replace(noise_pattern, " ", regex=True)
    # remove numbers
    cleaned = cleaned.str.replace(r"\d+", "", regex=True)
    # tidy the gaps left behind by the replacements above
    cleaned = cleaned.str.replace(r"\s+", " ", regex=True).str.strip()

    df[new_text_field_name] = cleaned
    return df

In [None]:
# clean_text lowercases the text itself and adds the 'text_clean' column to df,
# so no separate lowercasing step is needed beforehand.
data_clean = clean_text(df, 'content', 'text_clean')
data_clean.head(10)

Unnamed: 0,content,score,text_clean
0,"Aplikasi mudah di hack, saya kena dgn pembelia...",1,aplikasi mudah di hack saya kena dgn pembelian...
1,top,5,top
2,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5,dekade bersama blibli terpercaya dan semakin ...
3,is the best,5,is the best
4,Paling favorit untuk belanja elektronik online...,5,paling favorit untuk belanja elektronik online...
5,sangat membantu memenuhi kebutuhan rumah,4,sangat membantu memenuhi kebutuhan rumah
6,Sdh di instal ulang masih juga gak bisa di buk...,3,sdh di instal ulang masih juga gak bisa di buk...
7,semoga selalu sukses,4,semoga selalu sukses
8,Semoga Blibli tambah maju,5,semoga blibli tambah maju
9,Ngak suka aja pakainya.,1,ngak suka aja pakainya


Stopword Removal

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('indonesian')
# A set gives constant-time membership tests; scanning the list for every token is much slower.
stop_words = set(stop)
# Drop stop words and re-join the remaining words with single spaces
data_clean['text_StopWord'] = data_clean['text_clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)
data_clean.head(50)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,content,score,text_clean,text_StopWord
0,"Aplikasi mudah di hack, saya kena dgn pembelia...",1,aplikasi mudah di hack saya kena dgn pembelian...,aplikasi mudah hack kena dgn pembelian dicance...
1,top,5,top,top
2,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5,dekade bersama blibli terpercaya dan semakin ...,dekade blibli terpercaya terdepan pengiriman f...
3,is the best,5,is the best,is the best
4,Paling favorit untuk belanja elektronik online...,5,paling favorit untuk belanja elektronik online...,favorit belanja elektronik online layanan ok b...
5,sangat membantu memenuhi kebutuhan rumah,4,sangat membantu memenuhi kebutuhan rumah,membantu memenuhi kebutuhan rumah
6,Sdh di instal ulang masih juga gak bisa di buk...,3,sdh di instal ulang masih juga gak bisa di buk...,sdh instal ulang gak buka aplikasinya tolong p...
7,semoga selalu sukses,4,semoga selalu sukses,semoga sukses
8,Semoga Blibli tambah maju,5,semoga blibli tambah maju,semoga blibli maju
9,Ngak suka aja pakainya.,1,ngak suka aja pakainya,ngak suka aja pakainya


**Tokenizing**


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
# Turn each stop-word-filtered review into a list of word tokens
data_clean['text_tokens'] = data_clean['text_StopWord'].apply(word_tokenize)
data_clean.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens
0,"Aplikasi mudah di hack, saya kena dgn pembelia...",1,aplikasi mudah di hack saya kena dgn pembelian...,aplikasi mudah hack kena dgn pembelian dicance...,"[aplikasi, mudah, hack, kena, dgn, pembelian, ..."
1,top,5,top,top,[top]
2,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5,dekade bersama blibli terpercaya dan semakin ...,dekade blibli terpercaya terdepan pengiriman f...,"[dekade, blibli, terpercaya, terdepan, pengiri..."
3,is the best,5,is the best,is the best,"[is, the, best]"
4,Paling favorit untuk belanja elektronik online...,5,paling favorit untuk belanja elektronik online...,favorit belanja elektronik online layanan ok b...,"[favorit, belanja, elektronik, online, layanan..."


**Stemming**

In [None]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/209.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# Build the Sastrawi stemmer once; the stemming step calls stemmer.stem(term) on each term
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
#-----------------STEMMING -----------------
# `stemmer` is the Sastrawi stemmer created in the previous cell, so it is not rebuilt here.


def stemmed_wrapper(term):
    """Return the stem of a single term."""
    return stemmer.stem(term)


# Unique terms in first-seen order, so each distinct word is stemmed only once.
vocabulary = dict.fromkeys(
    term for document in data_clean['text_tokens'] for term in document
)
print(len(vocabulary))

# A progress bar replaces the one-line-per-term printout, which flooded the cell
# output beyond the notebook's display limit.
term_dict = {
    term: stemmed_wrapper(term) for term in tqdm(vocabulary, desc="stemming")
}


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Map every token of a document to its stem via term_dict."""
    return [term_dict[term] for term in document]


# This step can be run on its own once term_dict has been built.
data_clean['text_steamindo'] = data_clean['text_tokens'].apply(lambda x:' '.join(get_stemmed_term(x)))
data_clean.head(20)

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
4325 : diekse : diekse
4326 : kusinya : kus
4327 : mantapcara : mantapcara
4328 : bertransaksiy : bertransaksiy
4329 : pakek : pakek
4330 : inibarangnya : inibarangnya
4331 : kualitasnya : kualitas
4332 : pkonya : pkonya
4333 : bangget : bangget
4334 : investor : investor
4335 : originalitas : originalitas
4336 : buktiin : buktiin
4337 : kembangin : kembangin
4338 : pembaruan : baru
4339 : ribetdan : ribetdan
4340 : burukpengiriman : burukpengiriman
4341 : molorfitur : molorfitur
4342 : aplikasirendahmbeda : aplikasirendahmbeda
4343 : inisebagai : inisebagai
4344 : dirugikantidak : dirugikantidak
4345 : selayaknya : layak
4346 : lainyg : lainyg
4347 : efisienjangan : efisienjangan
4348 : salahkan : salah
4349 : forever : forever
4350 : cepatblibli : cepatblibli
4351 : okl : okl
4352 : promonyaaa : promonyaaa
4353 : makasii : makasii
4354 : bisanya : bisa
4355 : diurut : urut
4356 : disusun : susun
4357 : loss : lo

Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens,text_steamindo
0,"Aplikasi mudah di hack, saya kena dgn pembelia...",1,aplikasi mudah di hack saya kena dgn pembelian...,aplikasi mudah hack kena dgn pembelian dicance...,"[aplikasi, mudah, hack, kena, dgn, pembelian, ...",aplikasi mudah hack kena dgn beli dicancel hac...
1,top,5,top,top,[top],top
2,1 dekade bersama BLIBLI. Terpercaya dan semaki...,5,dekade bersama blibli terpercaya dan semakin ...,dekade blibli terpercaya terdepan pengiriman f...,"[dekade, blibli, terpercaya, terdepan, pengiri...",dekade blibli percaya depan kirim free ongkirn...
3,is the best,5,is the best,is the best,"[is, the, best]",is the best
4,Paling favorit untuk belanja elektronik online...,5,paling favorit untuk belanja elektronik online...,favorit belanja elektronik online layanan ok b...,"[favorit, belanja, elektronik, online, layanan...",favorit belanja elektronik online layan ok ban...
5,sangat membantu memenuhi kebutuhan rumah,4,sangat membantu memenuhi kebutuhan rumah,membantu memenuhi kebutuhan rumah,"[membantu, memenuhi, kebutuhan, rumah]",bantu penuh butuh rumah
6,Sdh di instal ulang masih juga gak bisa di buk...,3,sdh di instal ulang masih juga gak bisa di buk...,sdh instal ulang gak buka aplikasinya tolong p...,"[sdh, instal, ulang, gak, buka, aplikasinya, t...",sdh instal ulang gak buka aplikasi tolong baik
7,semoga selalu sukses,4,semoga selalu sukses,semoga sukses,"[semoga, sukses]",moga sukses
8,Semoga Blibli tambah maju,5,semoga blibli tambah maju,semoga blibli maju,"[semoga, blibli, maju]",moga blibli maju
9,Ngak suka aja pakainya.,1,ngak suka aja pakainya,ngak suka aja pakainya,"[ngak, suka, aja, pakainya]",ngak suka aja pakai


In [None]:
data_clean.to_csv('blibli.csv', index= False) # save the text-preprocessing result to a CSV file