In [None]:
# Pinned: this notebook imports internal modules of the package
# (constants.*, utils.request), which may change between releases.
# %pip installs into the environment of the running kernel.
%pip install -q google-play-scraper==1.2.7

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
import google_play_scraper

In [None]:
# Identifier of the app whose reviews are scraped; passed to reviews() below.
app_id = 'com.shopee.id'

In [None]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [None]:
# Upper bound on the number of reviews requested per call to
# _fetch_review_items(); reviews() clamps every request to this value.
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """POST one reviews request and return the decoded page.

    Args:
        url: Endpoint to post to.
        app_id: Identifier of the app whose reviews are requested.
        sort: Sort order value forwarded to the request body.
        count: Number of reviews requested.
        filter_score_with: Score filter; the literal string "null" is sent
            when this is None.
        filter_device_with: Device filter; the literal string "null" is sent
            when this is None.
        pagination_token: Token of the page to fetch, or None.

    Returns:
        A 2-tuple ``(payload[0], payload[-2][-1])`` taken from the decoded
        response. reviews() uses the first element as the list of raw
        review items and the second as the token for the next page.

    Raises:
        IndexError, TypeError: when the response does not have the expected
            structure (no regex match, or missing nested elements).
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The inner payload is itself a JSON string; decode it once and reuse it.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: int = None,
    filter_device_with: int = None,
    continuation_token: _ContinuationToken = None,
    max_retries: int = 5,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews for ``app_id``.

    Args:
        app_id: Identifier of the app whose reviews are requested.
        lang: Language used to build the request URL.
        country: Country used to build the request URL.
        sort: Sort order; its ``.value`` is sent with the request.
        count: Maximum number of reviews to return from this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given,
            its stored lang/country/sort/count/filters override the
            corresponding arguments of this call.
        max_retries: Number of consecutive failed fetches (TypeError or
            IndexError while parsing the response) that are retried before
            giving up on the current page.

    Returns:
        ``(reviews, token)`` where ``reviews`` is a list of dicts built from
        ``ElementSpecs.Review`` and ``token`` is a ``_ContinuationToken``.
        ``token.token`` is None when there is nothing more to fetch, or when
        a page could not be parsed after ``max_retries`` retries.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            # A previous call already reached the end.
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    result = []
    consecutive_failures = 0

    while len(result) < count:
        _fetch_count = min(count - len(result), MAX_COUNT_EACH_FETCH)

        try:
            review_items, next_token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The response could not be parsed. `token` still points at the
            # page being requested, so retrying asks for the same page
            # again. This works without a continuation_token and never
            # rewinds to an earlier page.
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Treat a persistently unparsable page as the end of the
                # stream instead of looping forever.
                token = None
                break
            continue

        consecutive_failures = 0
        token = next_token

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        # A list in the token position means there is no next page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every review of ``app_id`` by paging through reviews().

    Args:
        app_id: Identifier of the app whose reviews are requested.
        sleep_milliseconds: Pause between consecutive pages, in
            milliseconds; 0 disables the pause.
        **kwargs: Forwarded to reviews(). ``count`` and
            ``continuation_token`` are discarded because paging is
            controlled here.

    Returns:
        A list with the reviews of all pages, in the order fetched.
    """
    for managed_key in ("count", "continuation_token"):
        kwargs.pop(managed_key, None)

    collected = []
    page_token = None

    while True:
        page, page_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=page_token,
            **kwargs
        )
        collected += page

        # A token of None marks the last page.
        if page_token.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [None]:
# Target number of reviews; the scraping loop keeps fetching pages until at
# least this many have been collected (or a page comes back empty).
reviews_count = 10000

In [None]:
# Accumulated reviews and the paging state carried between calls.
result = []
continuation_token = None

# Request settings shared by every page request.
fetch_options = dict(
    lang='id',  # The language of review
    country='id',  # Country for which you want to scrape
    sort=Sort.NEWEST,
    filter_score_with=None,
    count=199,  # No need to change this
)

with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    # Fetch page after page until the target is reached or a page is empty.
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            **fetch_options
        )
        if not new_result:
            break
        result += new_result
        pbar.update(len(new_result))

10149it [00:12, 783.57it/s]


In [None]:
import numpy as np

# Every scraped review is already a dict of fields, so the list converts
# directly into a frame with one row per review and one column per field.
# No intermediate object array or pop/join round-trip is needed.
df = pd.DataFrame(result)

df.head(10)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,10d15ae1-8d47-444a-a5bc-970293635873,Smile Desune,https://play-lh.googleusercontent.com/a-/ALV-U...,Sekarang nyari pods vape roko elektrik ga bisa...,1,0,,2024-06-14 15:46:08,Hi kak. Mohon maaf ya perihal produk Vape / Ro...,2024-06-14 16:16:17,
1,6ea63b22-cdbd-4807-bb19-970ac4b9b190,Ahmad Alkahfi,https://play-lh.googleusercontent.com/a-/ALV-U...,Keren,5,0,3.27.09,2024-06-14 15:43:56,Hi kak. Mimin seneng bgt nih dapet feedback po...,2024-06-14 16:12:02,3.27.09
2,71e9f2c7-4a72-4c38-b3cc-844808b3e34e,Akbar Fadilah,https://play-lh.googleusercontent.com/a-/ALV-U...,shopi mantul...,5,0,3.27.09,2024-06-14 15:42:49,Hi kak. Wah makasih banget ya kak buat ulasann...,2024-06-14 16:09:00,3.27.09
3,51580ece-fa77-468b-8800-2db817bd270f,yumna zkyh,https://play-lh.googleusercontent.com/a/ACg8oc...,gampang dalam berbelanja dengan metode apapun ...,5,0,3.27.09,2024-06-14 15:42:24,"Yuhuu mantulll, mimin ikut seneng kalau kamu p...",2024-06-14 16:16:36,3.27.09
4,4cbe17c5-9fed-4e14-94f4-7a192336cf72,messa pada,https://play-lh.googleusercontent.com/a/ACg8oc...,Online Shop 👍💯,5,0,3.27.09,2024-06-14 15:42:07,"Hi kak, makasih ya buat full bintangnya. Moga ...",2024-06-14 16:19:02,3.27.09
5,5d6787e9-3b57-492c-94a2-6661d1bd3946,Jamal Noor,https://play-lh.googleusercontent.com/a-/ALV-U...,Shopee mantul pokonamah,5,0,3.27.09,2024-06-14 15:40:34,"Hi kak, makasih ya buat full bintangnya. Moga ...",2024-06-14 16:24:27,3.27.09
6,2c862376-7ed0-4386-9abf-3873362f7c7c,Sarining Notopambudi,https://play-lh.googleusercontent.com/a/ACg8oc...,"Mudah,terjangkau",5,0,3.27.09,2024-06-14 15:37:49,"Hi kak, makasih ya buat full bintangnya. Moga ...",2024-06-14 16:22:23,3.27.09
7,95ba1c5f-5e1d-4e3f-993c-faa12bab98b1,Yanti HW,https://play-lh.googleusercontent.com/a-/ALV-U...,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5,0,3.27.09,2024-06-14 15:36:50,,NaT,3.27.09
8,f8b96945-273c-4c17-92c1-8038e43e47c8,Hastina Zakaria,https://play-lh.googleusercontent.com/a/ACg8oc...,Good,5,0,,2024-06-14 15:34:29,"Hi kak, makasih ya buat full bintangnya. Moga ...",2024-06-14 16:28:49,
9,910b790e-aae1-42a2-916d-5bd63f71ca54,Arsya Phonsel,https://play-lh.googleusercontent.com/a-/ALV-U...,Ayo download shopeee sebelum menyesal.. by Jac...,5,0,,2024-06-14 15:34:20,"Hi kak, makasih ya buat full bintangnya. Moga ...",2024-06-14 16:23:47,


In [None]:
# Number of rows in the scraped DataFrame.
len(df)

10149

In [None]:
# The scrape yields many columns; preview only the userName, score, at and
# content columns.
df[['userName', 'score','at', 'content']].head(20)

Unnamed: 0,userName,score,at,content
0,Smile Desune,1,2024-06-14 15:46:08,Sekarang nyari pods vape roko elektrik ga bisa...
1,Ahmad Alkahfi,5,2024-06-14 15:43:56,Keren
2,Akbar Fadilah,5,2024-06-14 15:42:49,shopi mantul...
3,yumna zkyh,5,2024-06-14 15:42:24,gampang dalam berbelanja dengan metode apapun ...
4,messa pada,5,2024-06-14 15:42:07,Online Shop 👍💯
5,Jamal Noor,5,2024-06-14 15:40:34,Shopee mantul pokonamah
6,Sarining Notopambudi,5,2024-06-14 15:37:49,"Mudah,terjangkau"
7,Yanti HW,5,2024-06-14 15:36:50,Hai ulasan sy revisi yaa Dr awal akun Shopee a...
8,Hastina Zakaria,5,2024-06-14 15:34:29,Good
9,Arsya Phonsel,5,2024-06-14 15:34:20,Ayo download shopeee sebelum menyesal.. by Jac...


In [None]:
# Only the content and score columns are needed from here on. Take an
# explicit copy so that columns added to df later are written to an
# independent frame and do not raise SettingWithCopyWarning.
df = df[['content', 'score']].copy()
df.head(20)

Unnamed: 0,content,score
0,Sekarang nyari pods vape roko elektrik ga bisa...,1
1,Keren,5
2,shopi mantul...,5
3,gampang dalam berbelanja dengan metode apapun ...,5
4,Online Shop 👍💯,5
5,Shopee mantul pokonamah,5
6,"Mudah,terjangkau",5
7,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5
8,Good,5
9,Ayo download shopeee sebelum menyesal.. by Jac...,5


In [None]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10149 entries, 0 to 10148
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  10149 non-null  object
 1   score    10149 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 158.7+ KB


In [None]:
# Boolean mask marking missing values cell by cell.
df.isna()

Unnamed: 0,content,score
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
10144,False,False
10145,False,False
10146,False,False
10147,False,False


In [None]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,score
count,10149.0
mean,4.173416
std,1.461865
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [None]:
# Number of missing values per column.
df.isnull().sum()

content    0
score      0
dtype: int64

In [None]:
df.to_csv("hasil-scrap-shopee.csv", index = False) # save the scraped data to CSV

# **PREPROCESSING**

Case Folding and Text Cleaning

In [None]:
import re
def clean_text(df, text_field, new_text_field_name):
    """Lower-case a text column and strip noise from it.

    The cleaned text is written to ``df[new_text_field_name]``; ``df`` is
    modified in place and also returned.

    Cleaning steps, in order:
      1. missing values become empty strings and everything is cast to str;
      2. the text is lower-cased;
      3. @mentions, URLs, a leading "rt" and every character other than
         ASCII letters, digits, space and tab are replaced by a space;
      4. runs of digits are replaced by a space;
      5. whitespace runs are collapsed to a single space and the ends are
         stripped.

    Noise is replaced by a space rather than deleted so that words that
    were separated only by punctuation (e.g. "Mudah,terjangkau") stay
    separate tokens instead of being glued together.

    Args:
        df: DataFrame holding the text column.
        text_field: Name of the column to clean.
        new_text_field_name: Name of the column receiving the cleaned text.

    Returns:
        The same ``df`` object, with the new column set.
    """
    noise_pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"

    raw_text = df[text_field].astype(object)
    # Missing values would otherwise break the string operations below.
    raw_text = raw_text.where(raw_text.notna(), "")

    df[new_text_field_name] = (
        raw_text.astype(str)
        .str.lower()
        .str.replace(noise_pattern, " ", regex=True)
        # remove numbers
        .str.replace(r"\d+", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    return df

In [None]:
# clean_text() lower-cases the text itself and writes the 'text_clean'
# column on df, so no separate lower-casing step is needed beforehand.
# It returns the frame it was given, so data_clean refers to the same
# object as df.
data_clean = clean_text(df, 'content', 'text_clean')
data_clean.head(20)

Unnamed: 0,content,score,text_clean,text_StopWord
0,Sekarang nyari pods vape roko elektrik ga bisa...,1,sekarang nyari pods vape roko elektrik ga bisa...,nyari pods vape roko elektrik ga stress
1,Keren,5,keren,keren
2,shopi mantul...,5,shopi mantul,shopi mantul
3,gampang dalam berbelanja dengan metode apapun ...,5,gampang dalam berbelanja dengan metode apapun ...,gampang berbelanja metode apapun ongkirnya lhoo
4,Online Shop 👍💯,5,online shop,online shop
5,Shopee mantul pokonamah,5,shopee mantul pokonamah,shopee mantul pokonamah
6,"Mudah,terjangkau",5,mudahterjangkau,mudahterjangkau
7,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5,hai ulasan sy revisi yaa dr awal akun shopee a...,hai ulasan sy revisi yaa dr akun shopee adaudh...
8,Good,5,good,good
9,Ayo download shopeee sebelum menyesal.. by Jac...,5,ayo download shopeee sebelum menyesal by jack ...,ayo download shopeee menyesal by jack bayeun aceh


Stopword Removal

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('indonesian')
# A set makes each membership test constant-time; testing every word
# against the list scans the whole list each time.
stop_set = set(stop)

# Drop every whitespace-separated word that is an Indonesian stop word.
data_clean['text_StopWord'] = data_clean['text_clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)
data_clean.head(20)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,score,text_clean,text_StopWord
0,Sekarang nyari pods vape roko elektrik ga bisa...,1,sekarang nyari pods vape roko elektrik ga bisa...,nyari pods vape roko elektrik ga stress
1,Keren,5,keren,keren
2,shopi mantul...,5,shopi mantul,shopi mantul
3,gampang dalam berbelanja dengan metode apapun ...,5,gampang dalam berbelanja dengan metode apapun ...,gampang berbelanja metode apapun ongkirnya lhoo
4,Online Shop 👍💯,5,online shop,online shop
5,Shopee mantul pokonamah,5,shopee mantul pokonamah,shopee mantul pokonamah
6,"Mudah,terjangkau",5,mudahterjangkau,mudahterjangkau
7,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5,hai ulasan sy revisi yaa dr awal akun shopee a...,hai ulasan sy revisi yaa dr akun shopee adaudh...
8,Good,5,good,good
9,Ayo download shopeee sebelum menyesal.. by Jac...,5,ayo download shopeee sebelum menyesal by jack ...,ayo download shopeee menyesal by jack bayeun aceh


**Tokenizing**


In [None]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Split every stop-word-filtered review into a list of word tokens.
data_clean['text_tokens'] = data_clean['text_StopWord'].apply(word_tokenize)
data_clean.head(20)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens
0,Sekarang nyari pods vape roko elektrik ga bisa...,1,sekarang nyari pods vape roko elektrik ga bisa...,nyari pods vape roko elektrik ga stress,"[nyari, pods, vape, roko, elektrik, ga, stress]"
1,Keren,5,keren,keren,[keren]
2,shopi mantul...,5,shopi mantul,shopi mantul,"[shopi, mantul]"
3,gampang dalam berbelanja dengan metode apapun ...,5,gampang dalam berbelanja dengan metode apapun ...,gampang berbelanja metode apapun ongkirnya lhoo,"[gampang, berbelanja, metode, apapun, ongkirny..."
4,Online Shop 👍💯,5,online shop,online shop,"[online, shop]"
5,Shopee mantul pokonamah,5,shopee mantul pokonamah,shopee mantul pokonamah,"[shopee, mantul, pokonamah]"
6,"Mudah,terjangkau",5,mudahterjangkau,mudahterjangkau,[mudahterjangkau]
7,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5,hai ulasan sy revisi yaa dr awal akun shopee a...,hai ulasan sy revisi yaa dr akun shopee adaudh...,"[hai, ulasan, sy, revisi, yaa, dr, akun, shope..."
8,Good,5,good,good,[good]
9,Ayo download shopeee sebelum menyesal.. by Jac...,5,ayo download shopeee sebelum menyesal by jack ...,ayo download shopeee menyesal by jack bayeun aceh,"[ayo, download, shopeee, menyesal, by, jack, b..."


**Stemming**

In [None]:
# Pinned for reproducibility; %pip installs into the running kernel's environment.
%pip install -q Sastrawi==1.0.1

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# Build a Sastrawi stemmer instance through its factory.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
#-----------------STEMMING -----------------
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemmed_wrapper(term):
    """Return the Sastrawi stem of a single token."""
    return stemmer.stem(term)


# Unique tokens across all documents, in first-seen order. Each distinct
# token is stemmed only once, however often it occurs.
unique_terms = dict.fromkeys(
    term for document in data_clean['text_tokens'] for term in document
)
print(len(unique_terms))

# Map token -> stem. A progress bar replaces the per-term printout, which
# produced thousands of output lines.
term_dict = {
    term: stemmed_wrapper(term)
    for term in tqdm(unique_terms, desc="stemming")
}
hitung = len(term_dict)  # number of stemmed terms; name kept for later cells


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Return the stems of all tokens in ``document`` (a list of tokens)."""
    return [term_dict[term] for term in document]


# This step can be run separately once the term dictionary has been built.
data_clean['text_steamindo'] = data_clean['text_tokens'].apply(lambda x:' '.join(get_stemmed_term(x)))
data_clean.head(20)

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
3928 : hasi : has
3929 : kepuasan : puas
3930 : spylater : spylater
3931 : pulau : pulau
3932 : tengkyu : tengkyu
3933 : rebahan : rebah
3934 : puasjadi : puasjadi
3935 : deskripsian : deskripsi
3936 : pengantarannya : antar
3937 : bagusbiasa : bagusbiasa
3938 : hrsekarang : hrsekarang
3939 : hr : hr
3940 : nemberi : nemberi
3941 : lemotdi : lemotdi
3942 : refreshing : refreshing
3943 : bagushapus : bagushapus
3944 : jugamembuat : jugamembuat
3945 : wawwww : wawwww
3946 : sea : sea
3947 : sebelumnyaaplikasi : sebelumnyaaplikasi
3948 : elu : elu
3949 : ngapa : ngapa
3950 : jengkel : jengkel
3951 : na : na
3952 : sellau : sellau
3953 : penggunaanyaharga : penggunaanyaharga
3954 : murahsayang : murahsayang
3955 : kyak : kyak
3956 : goib : goib
3957 : tolonglh : tolonglh
3958 : instruktur : instruktur
3959 : shopeesaya : shopeesaya
3960 : kelar : kelar
3961 : dapetin : dapetin
3962 : barubagaimana : barubagaimana
3963

Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens,text_steamindo
0,Sekarang nyari pods vape roko elektrik ga bisa...,1,sekarang nyari pods vape roko elektrik ga bisa...,nyari pods vape roko elektrik ga stress,"[nyari, pods, vape, roko, elektrik, ga, stress]",nyari pods vape roko elektrik ga stress
1,Keren,5,keren,keren,[keren],keren
2,shopi mantul...,5,shopi mantul,shopi mantul,"[shopi, mantul]",shopi mantul
3,gampang dalam berbelanja dengan metode apapun ...,5,gampang dalam berbelanja dengan metode apapun ...,gampang berbelanja metode apapun ongkirnya lhoo,"[gampang, berbelanja, metode, apapun, ongkirny...",gampang belanja metode apa ongkirnya lhoo
4,Online Shop 👍💯,5,online shop,online shop,"[online, shop]",online shop
5,Shopee mantul pokonamah,5,shopee mantul pokonamah,shopee mantul pokonamah,"[shopee, mantul, pokonamah]",shopee mantul pokonamah
6,"Mudah,terjangkau",5,mudahterjangkau,mudahterjangkau,[mudahterjangkau],mudahterjangkau
7,Hai ulasan sy revisi yaa Dr awal akun Shopee a...,5,hai ulasan sy revisi yaa dr awal akun shopee a...,hai ulasan sy revisi yaa dr akun shopee adaudh...,"[hai, ulasan, sy, revisi, yaa, dr, akun, shope...",hai ulas sy revisi yaa dr akun shopee adaudh j...
8,Good,5,good,good,[good],good
9,Ayo download shopeee sebelum menyesal.. by Jac...,5,ayo download shopeee sebelum menyesal by jack ...,ayo download shopeee menyesal by jack bayeun aceh,"[ayo, download, shopeee, menyesal, by, jack, b...",ayo download shopeee sesal by jack bayeun aceh


In [None]:
# Show only the stemmed-text column.
data_clean[['text_steamindo']]

Unnamed: 0,text_steamindo
0,nyari pods vape roko elektrik ga stress
1,keren
2,shopi mantul
3,gampang belanja metode apa ongkirnya lhoo
4,online shop
...,...
10144,kak nilai bintang karna belanja online voucher
10145,iklan nya gk hapus
10146,bagus
10147,apk burik


In [None]:
data_clean.to_csv('shopee.csv', index= False) # then save the text preprocessing result to a CSV file