In [None]:
!pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
import google_play_scraper

In [None]:
# Google Play package id of the app whose reviews are scraped below.
app_id = 'com.tokopedia.tkpd'

In [None]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [None]:
# Upper bound on the number of reviews requested in a single fetch;
# reviews() clamps every request to this value.
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: Optional[int] = None,
    filter_device_with: Optional[int] = None,
    continuation_token: Optional[_ContinuationToken] = None,
    max_retries: int = 10,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews of ``app_id`` and return them with a resume token.

    Args:
        app_id: Package id of the app.
        lang: Language passed to the request URL builder.
        country: Country passed to the request URL builder.
        sort: Sort order; its ``.value`` is sent in the request.
        count: Number of reviews to collect in this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given, its
            stored lang, country, sort, count and filters override the
            corresponding arguments of this call.
        max_retries: How many consecutive failed fetches (TypeError or
            IndexError while decoding a response) are retried before giving up.

    Returns:
        ``(result, token)`` where ``result`` is a list of review dicts and
        ``token`` is a ``_ContinuationToken`` for the next call. Its ``token``
        attribute is ``None`` when pagination ended or when the retry budget
        was exhausted.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            # Nothing left to fetch for this token.
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    result = []
    failed_attempts = 0

    while True:
        # Ask only for what is still missing, clamped to the per-request cap.
        _fetch_count = min(count - len(result), MAX_COUNT_EACH_FETCH)

        if _fetch_count <= 0:
            break

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # A failed fetch leaves `token` untouched, so retrying with it
            # repeats exactly the page that failed. This works when no
            # continuation token was passed in and does not re-append pages
            # that were already collected during this call.
            failed_attempts += 1
            if failed_attempts > max_retries:
                # Give up instead of looping forever; a None token tells
                # callers that there is nothing more to fetch.
                token = None
                break
            continue

        failed_attempts = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        # A list in place of a token is treated as the end of pagination.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every available review of ``app_id`` by paging through reviews().

    Args:
        app_id: Package id of the app.
        sleep_milliseconds: Pause between successive pages, in milliseconds;
            a falsy value disables the pause.
        **kwargs: Forwarded to reviews(). ``count`` and ``continuation_token``
            are discarded because paging is driven from here.

    Returns:
        A single list containing the reviews of all pages.
    """
    for reserved in ("count", "continuation_token"):
        kwargs.pop(reserved, None)

    collected = []
    page_token = None

    while True:
        page, page_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=page_token,
            **kwargs
        )
        collected.extend(page)

        # A token without a value marks the last page.
        if page_token.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [None]:
# Target number of reviews: the scraping loop keeps fetching until this many
# have been collected or no more reviews are returned.
reviews_count = 10000

In [None]:
result = []
continuation_token = None


with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='id',  # language of the reviews
            country='id',  # country to scrape
            sort=Sort.NEWEST,
            filter_score_with=None,
            # Page size. Only the first call uses it: later calls take the
            # count stored in the continuation token.
            count=MAX_COUNT_EACH_FETCH,
        )
        if not new_result:
            break
        # The last page can exceed the target, so keep only what is still
        # missing. This stops the collection (and the progress bar) from
        # overshooting reviews_count.
        new_result = new_result[:reviews_count - len(result)]
        result.extend(new_result)
        pbar.update(len(new_result))

10149it [00:12, 837.70it/s]


In [None]:
import numpy as np
# Every scraped review is already a dict, so the list converts directly into a
# DataFrame with one column per review field.
df = pd.DataFrame(result)

df.head(20)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,29c522fc-5fb1-4aec-9685-545b54d32ccf,Asep Ridwan,https://play-lh.googleusercontent.com/a-/ALV-U...,aplikasi paling complete nyaman cepat terperca...,5,0,3.267,2024-06-14 15:37:09,"Toppers, terima kasih atas rating yang kamu be...",2024-06-14 15:40:03,3.267
1,8e58ec7e-1558-4bdc-a5a1-183c982a2314,Marini Putri,https://play-lh.googleusercontent.com/a-/ALV-U...,"Lemot bgt, pas dicancel malahan vouchernya ang...",1,0,3.267,2024-06-14 15:35:51,"Hi Toppers, silakan sampaikan kritik/saran ata...",2024-06-14 15:40:07,3.267
2,042688db-fa7c-4b54-8213-a92698236b04,Andik Widjaya,https://play-lh.googleusercontent.com/a-/ALV-U...,Apk sampah loading Mulu waktu mau pembayaran,1,0,3.267,2024-06-14 15:33:31,"Toppers, mohon maaf atas kendala yang kamu ala...",2024-06-14 15:40:04,3.267
3,51759b1b-8861-47b3-ab30-b4c36935b2e6,Divv,https://play-lh.googleusercontent.com/a/ACg8oc...,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1,0,3.267,2024-06-14 15:06:53,"Halo Toppers, terima kasih atas rating yang ka...",2024-06-14 15:20:05,3.267
4,4a09ebcf-644e-413c-8213-90c9274050cb,Bella Cantika,https://play-lh.googleusercontent.com/a/ACg8oc...,Tokopedia bagus,5,0,,2024-06-14 14:57:40,"Hi Toppers, terima kasih untuk rating dan ulas...",2024-06-14 15:00:04,
5,309ee5c7-ecfb-4a5d-bb7f-7abebe94e93c,Muhammad Fauzan,https://play-lh.googleusercontent.com/a-/ALV-U...,"Sejak ada kurir rekomendasi, pengiriman malah ...",3,1,3.247.1,2024-06-14 14:53:05,"Terima kasih atas rating yang kamu berikan, To...",2024-06-14 15:00:03,3.247.1
6,b7721003-fca1-4d19-bdb3-36c0fce8042c,Umar Alfaruq,https://play-lh.googleusercontent.com/a/ACg8oc...,makasih mantap aplikasinya👍👍👍,5,0,3.266,2024-06-14 14:49:35,"Hi Toppers, terima kasih untuk rating dan ulas...",2024-06-14 15:00:05,3.266
7,d29d7d14-bd12-4f82-b653-094159a3365a,Imam Bahrain,https://play-lh.googleusercontent.com/a-/ALV-U...,keren,5,0,3.267,2024-06-14 14:29:37,"Toppers, terima kasih atas rating yang kamu be...",2024-06-14 14:40:02,3.267
8,579bd221-e3e2-45ed-b7c2-6e0e93bd3ee1,Riza Fikriyah,https://play-lh.googleusercontent.com/a/ACg8oc...,"bagusss, tmbhn diskonnya yaaa.....",5,0,3.267,2024-06-14 14:26:56,"Toppers, terima kasih atas rating yang kamu be...",2024-06-14 14:40:05,3.267
9,3cd73b81-4cbd-4ae6-8033-e8555cbe4d90,riris yuli endah lestari,https://play-lh.googleusercontent.com/a/ACg8oc...,sebernya suka cuma kenapa dana ferund di bawah...,3,0,3.267,2024-06-14 14:13:36,"Terima kasih atas rating yang kamu berikan, To...",2024-06-14 14:20:15,3.267


In [None]:
len(df)

10149

In [None]:
# The scrape returns many columns; show only userName, score, at and content.
df[['userName', 'score','at', 'content']].head(20)

Unnamed: 0,userName,score,at,content
0,Asep Ridwan,5,2024-06-14 15:37:09,aplikasi paling complete nyaman cepat terperca...
1,Marini Putri,1,2024-06-14 15:35:51,"Lemot bgt, pas dicancel malahan vouchernya ang..."
2,Andik Widjaya,1,2024-06-14 15:33:31,Apk sampah loading Mulu waktu mau pembayaran
3,Divv,1,2024-06-14 15:06:53,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU..."
4,Bella Cantika,5,2024-06-14 14:57:40,Tokopedia bagus
5,Muhammad Fauzan,3,2024-06-14 14:53:05,"Sejak ada kurir rekomendasi, pengiriman malah ..."
6,Umar Alfaruq,5,2024-06-14 14:49:35,makasih mantap aplikasinya👍👍👍
7,Imam Bahrain,5,2024-06-14 14:29:37,keren
8,Riza Fikriyah,5,2024-06-14 14:26:56,"bagusss, tmbhn diskonnya yaaa....."
9,riris yuli endah lestari,3,2024-06-14 14:13:36,sebernya suka cuma kenapa dana ferund di bawah...


In [None]:
# Only the content and score columns are needed, so filter the columns again.
# An explicit copy is taken so that adding columns later does not write into a
# slice of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df[['content', 'score']].copy()
df.head(20)

Unnamed: 0,content,score
0,aplikasi paling complete nyaman cepat terperca...,5
1,"Lemot bgt, pas dicancel malahan vouchernya ang...",1
2,Apk sampah loading Mulu waktu mau pembayaran,1
3,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1
4,Tokopedia bagus,5
5,"Sejak ada kurir rekomendasi, pengiriman malah ...",3
6,makasih mantap aplikasinya👍👍👍,5
7,keren,5
8,"bagusss, tmbhn diskonnya yaaa.....",5
9,sebernya suka cuma kenapa dana ferund di bawah...,3


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10149 entries, 0 to 10148
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  10149 non-null  object
 1   score    10149 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 158.7+ KB


In [None]:
df.isna()

Unnamed: 0,content,score
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
10144,False,False
10145,False,False
10146,False,False
10147,False,False


In [None]:
df.describe()

Unnamed: 0,score
count,10149.0
mean,3.168292
std,1.869157
min,1.0
25%,1.0
50%,4.0
75%,5.0
max,5.0


In [None]:
df.isnull().sum()

content    0
score      0
dtype: int64

In [None]:
# Save the content/score frame to CSV.
df.to_csv("hasil-scrap-tokopedia.csv", index = False)

# **PREPROCESSING**

Case Folding

In [None]:
import re
def clean_text(df, text_field, new_text_field_name):
    """Write a normalised copy of ``df[text_field]`` into ``df[new_text_field_name]``.

    The text is lower-cased, then mentions, URLs, a leading "rt" and every
    character other than ASCII letters, digits, space and tab are replaced by
    a space. Digits are removed and runs of whitespace are collapsed to a
    single space.

    Noise is replaced by a space rather than deleted so that words separated
    only by punctuation (e.g. "ekspedisi,padahal") stay separate tokens
    instead of being fused together.

    Args:
        df: DataFrame to update; the new column is added to this frame.
        text_field: Name of the column holding the raw text.
        new_text_field_name: Name of the column that receives the cleaned text.

    Returns:
        The same ``df`` object, with the cleaned column added or overwritten.
        Missing values in ``text_field`` become empty strings.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
    digits = re.compile(r"\d+")

    def _normalise(text):
        text = noise.sub(" ", text)
        # remove numbers
        text = digits.sub("", text)
        # collapse the gaps left behind by the substitutions above
        return " ".join(text.split())

    # Missing values would otherwise reach the regex as floats and raise.
    lowered = df[text_field].fillna("").astype(str).str.lower()
    df[new_text_field_name] = lowered.map(_normalise)
    return df

In [None]:
# clean_text lower-cases the text itself, so no separate case-folding pass is
# needed before calling it.
data_clean = clean_text(df, 'content', 'text_clean')
data_clean.head(20)

Unnamed: 0,content,score,text_clean
0,aplikasi paling complete nyaman cepat terperca...,5,aplikasi paling complete nyaman cepat terperca...
1,"Lemot bgt, pas dicancel malahan vouchernya ang...",1,lemot bgt pas dicancel malahan vouchernya angu...
2,Apk sampah loading Mulu waktu mau pembayaran,1,apk sampah loading mulu waktu mau pembayaran
3,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1,apk gajelass anjjj baru pertama kali buat akun...
4,Tokopedia bagus,5,tokopedia bagus
5,"Sejak ada kurir rekomendasi, pengiriman malah ...",3,sejak ada kurir rekomendasi pengiriman malah j...
6,makasih mantap aplikasinya👍👍👍,5,makasih mantap aplikasinya
7,keren,5,keren
8,"bagusss, tmbhn diskonnya yaaa.....",5,bagusss tmbhn diskonnya yaaa
9,sebernya suka cuma kenapa dana ferund di bawah...,3,sebernya suka cuma kenapa dana ferund di bawah...


Stopword Removal

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('indonesian')
# Membership is tested once per word of every review, so look words up in a
# set instead of scanning the list each time.
stop_words = set(stop)
data_clean['text_StopWord'] = data_clean['text_clean'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)
data_clean.head(50)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,content,score,text_clean,text_StopWord
0,aplikasi paling complete nyaman cepat terperca...,5,aplikasi paling complete nyaman cepat terperca...,aplikasi complete nyaman cepat terpercaya teri...
1,"Lemot bgt, pas dicancel malahan vouchernya ang...",1,lemot bgt pas dicancel malahan vouchernya angu...,lemot bgt pas dicancel vouchernya angus fak
2,Apk sampah loading Mulu waktu mau pembayaran,1,apk sampah loading mulu waktu mau pembayaran,apk sampah loading mulu pembayaran
3,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1,apk gajelass anjjj baru pertama kali buat akun...,apk gajelass anjjj kali akun pesanan dibatalka...
4,Tokopedia bagus,5,tokopedia bagus,tokopedia bagus
5,"Sejak ada kurir rekomendasi, pengiriman malah ...",3,sejak ada kurir rekomendasi pengiriman malah j...,kurir rekomendasi pengiriman jarak kota berdek...
6,makasih mantap aplikasinya👍👍👍,5,makasih mantap aplikasinya,makasih mantap aplikasinya
7,keren,5,keren,keren
8,"bagusss, tmbhn diskonnya yaaa.....",5,bagusss tmbhn diskonnya yaaa,bagusss tmbhn diskonnya yaaa
9,sebernya suka cuma kenapa dana ferund di bawah...,3,sebernya suka cuma kenapa dana ferund di bawah...,sebernya suka dana ferund ngk ditarik belanjakan


**Tokenizing**


In [None]:
nltk.download('punkt')
# Newer NLTK releases (3.8.2 and later) load the tokenizer tables from the
# separate 'punkt_tab' resource, so fetch it as well.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
data_clean['text_tokens'] = data_clean['text_StopWord'].apply(word_tokenize)
data_clean.head(20)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens
0,aplikasi paling complete nyaman cepat terperca...,5,aplikasi paling complete nyaman cepat terperca...,aplikasi complete nyaman cepat terpercaya teri...,"[aplikasi, complete, nyaman, cepat, terpercaya..."
1,"Lemot bgt, pas dicancel malahan vouchernya ang...",1,lemot bgt pas dicancel malahan vouchernya angu...,lemot bgt pas dicancel vouchernya angus fak,"[lemot, bgt, pas, dicancel, vouchernya, angus,..."
2,Apk sampah loading Mulu waktu mau pembayaran,1,apk sampah loading mulu waktu mau pembayaran,apk sampah loading mulu pembayaran,"[apk, sampah, loading, mulu, pembayaran]"
3,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1,apk gajelass anjjj baru pertama kali buat akun...,apk gajelass anjjj kali akun pesanan dibatalka...,"[apk, gajelass, anjjj, kali, akun, pesanan, di..."
4,Tokopedia bagus,5,tokopedia bagus,tokopedia bagus,"[tokopedia, bagus]"
5,"Sejak ada kurir rekomendasi, pengiriman malah ...",3,sejak ada kurir rekomendasi pengiriman malah j...,kurir rekomendasi pengiriman jarak kota berdek...,"[kurir, rekomendasi, pengiriman, jarak, kota, ..."
6,makasih mantap aplikasinya👍👍👍,5,makasih mantap aplikasinya,makasih mantap aplikasinya,"[makasih, mantap, aplikasinya]"
7,keren,5,keren,keren,[keren]
8,"bagusss, tmbhn diskonnya yaaa.....",5,bagusss tmbhn diskonnya yaaa,bagusss tmbhn diskonnya yaaa,"[bagusss, tmbhn, diskonnya, yaaa]"
9,sebernya suka cuma kenapa dana ferund di bawah...,3,sebernya suka cuma kenapa dana ferund di bawah...,sebernya suka dana ferund ngk ditarik belanjakan,"[sebernya, suka, dana, ferund, ngk, ditarik, b..."


**Stemming**

In [None]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# Create a Sastrawi stemmer instance.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
#----------------- STEMMING -----------------
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the Sastrawi stemmer.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemmed_wrapper(term):
    """Return the stem of a single term."""
    return stemmer.stem(term)


# Collect every distinct token once (in first-seen order) so that each one is
# stemmed a single time, however often it occurs in the reviews.
term_dict = dict.fromkeys(
    term for document in data_clean['text_tokens'] for term in document
)
print(len(term_dict))

# A progress bar replaces the per-term printout, which produced one output
# line per vocabulary entry plus a dump of the whole dictionary.
for term in tqdm(term_dict, desc="stemming"):
    term_dict[term] = stemmed_wrapper(term)


def get_stemmed_term(document):
    """Map every token of a document to its stem stored in term_dict."""
    return [term_dict[term] for term in document]


# This step can be run separately once term_dict has been built.
data_clean['text_steamindo'] = data_clean['text_tokens'].apply(lambda x: ' '.join(get_stemmed_term(x)))
data_clean.head(20)

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
6296 : roda : roda
6297 : ekspedisipadahal : ekspedisipadahal
6298 : nyangeri : nyangeri
6299 : mesi : mes
6300 : merusak : rusak
6301 : atuh : atuh
6302 : gratisan : gratis
6303 : ecommere : ecommere
6304 : dimalingin : dimalingin
6305 : aset : aset
6306 : amankan : aman
6307 : don : don
6308 : katepe : katepe
6309 : semuah : semuah
6310 : kuasai : kuasa
6311 : olek : olek
6312 : mendapak : mendapak
6313 : ida : ida
6314 : iklimah : iklimah
6315 : minjam : minjam
6316 : pemilikkarena : pemilikkarena
6317 : siiip : siiip
6318 : searchinghasil : searchinghasil
6319 : mengutamakan : utama
6320 : prosedur : prosedur
6321 : dibatlkan : dibatlkan
6322 : sragen : sragen
6323 : membatasi : batas
6324 : seringsering : seringsering
6325 : chattingan : chattingan
6326 : menghabiskan : habis
6327 : membahas : bahas
6328 : saatbikin : saatbikin
6329 : sekarangg : sekarangg
6330 : apasihhhhhh : apasihhhhhh
6331 : nalar : nalar

Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens,text_steamindo
0,aplikasi paling complete nyaman cepat terperca...,5,aplikasi paling complete nyaman cepat terperca...,aplikasi complete nyaman cepat terpercaya teri...,"[aplikasi, complete, nyaman, cepat, terpercaya...",aplikasi complete nyaman cepat percaya terimka...
1,"Lemot bgt, pas dicancel malahan vouchernya ang...",1,lemot bgt pas dicancel malahan vouchernya angu...,lemot bgt pas dicancel vouchernya angus fak,"[lemot, bgt, pas, dicancel, vouchernya, angus,...",lot bgt pas dicancel vouchernya angus fak
2,Apk sampah loading Mulu waktu mau pembayaran,1,apk sampah loading mulu waktu mau pembayaran,apk sampah loading mulu pembayaran,"[apk, sampah, loading, mulu, pembayaran]",apk sampah loading mulu bayar
3,"APK GAJELASS ANJJJ, BARU PERTAMA KALI BUAT AKU...",1,apk gajelass anjjj baru pertama kali buat akun...,apk gajelass anjjj kali akun pesanan dibatalka...,"[apk, gajelass, anjjj, kali, akun, pesanan, di...",apk gajelass anjjj kali akun pesan batal dgn a...
4,Tokopedia bagus,5,tokopedia bagus,tokopedia bagus,"[tokopedia, bagus]",tokopedia bagus
5,"Sejak ada kurir rekomendasi, pengiriman malah ...",3,sejak ada kurir rekomendasi pengiriman malah j...,kurir rekomendasi pengiriman jarak kota berdek...,"[kurir, rekomendasi, pengiriman, jarak, kota, ...",kurir rekomendasi kirim jarak kota dekat anter...
6,makasih mantap aplikasinya👍👍👍,5,makasih mantap aplikasinya,makasih mantap aplikasinya,"[makasih, mantap, aplikasinya]",makasih mantap aplikasi
7,keren,5,keren,keren,[keren],keren
8,"bagusss, tmbhn diskonnya yaaa.....",5,bagusss tmbhn diskonnya yaaa,bagusss tmbhn diskonnya yaaa,"[bagusss, tmbhn, diskonnya, yaaa]",bagusss tmbhn diskon yaaa
9,sebernya suka cuma kenapa dana ferund di bawah...,3,sebernya suka cuma kenapa dana ferund di bawah...,sebernya suka dana ferund ngk ditarik belanjakan,"[sebernya, suka, dana, ferund, ngk, ditarik, b...",sebernya suka dana ferund ngk tarik belanja


In [None]:
# Then save the text-preprocessing result to a CSV file.
data_clean.to_csv('tokopedia.csv', index= False)