In [None]:
!pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
import google_play_scraper

In [None]:
# Identifier of the app whose reviews are scraped; passed to reviews() in the scraping cell.
app_id = 'com.lazada.android'

In [None]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [None]:
# Largest review count requested in a single fetch; reviews() clamps each request to this value.
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    """POST one reviews request and return ``(review_items, next_token)``.

    Args:
        url: Endpoint to post to (built by the caller via ``Formats.Reviews.build``).
        app_id: App identifier placed in the request body.
        sort: Sort value placed in the request body.
        count: Number of reviews to request.
        filter_score_with: Score filter, or ``None`` (sent as the string ``"null"``).
        filter_device_with: Device filter, or ``None`` (sent as the string ``"null"``).
        pagination_token: Token of the page to fetch, or ``None``.

    Returns:
        A 2-tuple: the first element of the decoded payload (the review items)
        and the last element of its second-to-last entry (used by the caller as
        the next pagination token).

    Raises:
        IndexError, TypeError: when the response does not have the expected
            shape (e.g. the regex finds nothing); ``reviews`` catches both.
    """
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The payload is itself a JSON string; decode it once and index the result
    # twice instead of parsing the same string two times.
    payload = json.loads(match[0][2])

    return payload[0], payload[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: Optional[int] = None,
    filter_device_with: Optional[int] = None,
    continuation_token: Optional[_ContinuationToken] = None,
    max_retries: int = 5,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews of ``app_id``, optionally resuming a previous call.

    Args:
        app_id: App identifier to fetch reviews for.
        lang: Language passed to ``Formats.Reviews.build``.
        country: Country passed to ``Formats.Reviews.build``.
        sort: Sort order; its ``.value`` is sent with the request.
        count: Number of reviews wanted from this call.
        filter_score_with: Optional score filter.
        filter_device_with: Optional device filter.
        continuation_token: Token returned by a previous call. When given, its
            stored lang/country/sort/count/filters replace the arguments above.
        max_retries: How many times in a row a page whose response cannot be
            parsed is requested again before the listing is treated as
            finished. ``0`` disables retrying.

    Returns:
        ``(reviews, token)`` where ``reviews`` is a list of dicts (one per
        review, keyed by ``ElementSpecs.Review``) and ``token`` is a
        ``_ContinuationToken`` for the next call. ``token.token`` is ``None``
        once no further page is available.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        # A token object without a page token marks an exhausted listing.
        if token is None:
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    _fetch_count = count

    result = []

    consecutive_failures = 0

    while True:
        # ``<=`` also stops if more items than requested were returned.
        if _fetch_count <= 0:
            break

        if _fetch_count > MAX_COUNT_EACH_FETCH:
            _fetch_count = MAX_COUNT_EACH_FETCH

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The failed call never rebinds ``token``, so retrying re-requests
            # the same page. (Previously this read ``continuation_token.token``,
            # which crashed on a first call where continuation_token is None,
            # rewound to the call's starting page, and retried without limit.)
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Persistently unparsable response: treat it as end of listing.
                token = None
                break
            continue

        consecutive_failures = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        _fetch_count = count - len(result)

        # A list in the token position means there is no further page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every review of ``app_id`` by paging through ``reviews``.

    Args:
        app_id: App identifier to fetch reviews for.
        sleep_milliseconds: Pause between consecutive pages; ``0`` means no pause.
        **kwargs: Forwarded to ``reviews``. ``count`` and ``continuation_token``
            are discarded because this function manages both itself.

    Returns:
        A single list with the reviews of all pages, in fetch order.
    """
    for managed_key in ("count", "continuation_token"):
        kwargs.pop(managed_key, None)

    collected = []
    page_token = None

    while True:
        page, page_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=page_token,
            **kwargs
        )
        collected.extend(page)

        # ``reviews`` reports the end of the listing with an empty page token.
        if page_token.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [None]:
# Target number of reviews; the scraping loop below stops once at least this many are collected.
reviews_count = 10000

In [None]:
result = []
continuation_token = None

# Pages are appended whole, so the final length can exceed reviews_count by up
# to one page. The loop also ends early when a call returns no reviews.
with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='id',  # language of the reviews
            country='id',  # country to scrape
            sort=Sort.NEWEST,
            filter_score_with=None,
            # Use the shared page-size constant instead of repeating the literal 199.
            count=MAX_COUNT_EACH_FETCH,
        )
        if not new_result:
            break
        result.extend(new_result)
        pbar.update(len(new_result))

10149it [00:13, 742.59it/s]


In [None]:
import numpy as np  # no longer needed here; kept in case later cells rely on `np`

# `result` is a list of flat dicts, so pandas builds one column per key directly.
# This yields the same frame as wrapping the dicts in a one-column frame and
# expanding that column again.
df = pd.DataFrame(result)

df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,08b0e3d0-5802-4466-92db-46b736b2ec0b,Rendy Wahyu,https://play-lh.googleusercontent.com/a/ACg8oc...,bagus,5,0,7.52.0,2024-06-14 16:18:31,,NaT,7.52.0
1,c4a41ad2-178d-494c-8ea8-1622441c4ea3,tiara virgina,https://play-lh.googleusercontent.com/a/ACg8oc...,barangnya bagus 👍,5,0,7.52.0,2024-06-14 16:17:59,,NaT,7.52.0
2,e9ae32ff-1e09-4ec0-8183-70804812f017,Satria Minang,https://play-lh.googleusercontent.com/a/ACg8oc...,kenapa gak bisa cod lagi ya,4,0,,2024-06-14 16:16:12,,NaT,
3,ecd87c43-4d41-4522-a7ae-a32e412a4e62,Firdza Firdza,https://play-lh.googleusercontent.com/a-/ALV-U...,good,5,0,7.52.0,2024-06-14 16:13:51,,NaT,7.52.0
4,7b1d0a47-6b50-4093-8dab-cefb33cab028,Triyanto Banjar,https://play-lh.googleusercontent.com/a-/ALV-U...,mntapp,5,0,7.51.0,2024-06-14 16:12:17,,NaT,7.51.0


In [None]:
# Number of rows, i.e. scraped reviews, in the frame.
len(df)

10149

In [None]:
# The scrape returns many columns; preview only userName, score, at and content.
df[['userName', 'score','at', 'content']].head()

Unnamed: 0,userName,score,at,content
0,Rendy Wahyu,5,2024-06-14 16:18:31,bagus
1,tiara virgina,5,2024-06-14 16:17:59,barangnya bagus 👍
2,Satria Minang,4,2024-06-14 16:16:12,kenapa gak bisa cod lagi ya
3,Firdza Firdza,5,2024-06-14 16:13:51,good
4,Triyanto Banjar,5,2024-06-14 16:12:17,mntapp


In [None]:
# Only the review text and its score are needed from here on.
# .copy() makes df an independent frame, so the column assignments in the
# preprocessing cells do not operate on a slice (avoids SettingWithCopyWarning).
df = df[['content', 'score']].copy()
df.head()

Unnamed: 0,content,score
0,bagus,5
1,barangnya bagus 👍,5
2,kenapa gak bisa cod lagi ya,4
3,good,5
4,mntapp,5


In [None]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10149 entries, 0 to 10148
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  10149 non-null  object
 1   score    10149 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 158.7+ KB


In [None]:
# Element-wise missing-value mask; the per-column totals are computed in a later cell.
df.isna()

Unnamed: 0,content,score
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
10144,False,False
10145,False,False
10146,False,False
10147,False,False


In [None]:
# Summary statistics of the numeric column(s).
df.describe()

Unnamed: 0,score
count,10149.0
mean,4.395408
std,1.255819
min,1.0
25%,5.0
50%,5.0
75%,5.0
max,5.0


In [None]:
# Count of missing values per column.
df.isnull().sum()

content    0
score      0
dtype: int64

In [None]:
df.to_csv("hasil-scrap-lazada.csv", index = False) # save the scraped reviews as CSV

# **PREPROCESSING**

Case Folding and Text Cleaning

In [None]:
import re


def clean_text(df, text_field, new_text_field_name):
    """Add a lower-cased, letters-only version of a text column to ``df``.

    Per value, in order:
      1. lower-case the text (missing values are treated as empty strings);
      2. drop @mentions, ``scheme://`` URLs, a leading "rt" and "http" fragments;
      3. replace every remaining character that is not a letter, digit, space
         or tab with a space, so words joined only by punctuation stay separate
         ("lazada,barangnya" -> "lazada barangnya");
      4. delete digits;
      5. collapse runs of whitespace and strip the ends.

    Args:
        df: DataFrame to modify in place.
        text_field: Name of the column holding the raw text.
        new_text_field_name: Name of the column to write the cleaned text to.

    Returns:
        The same ``df`` object, with ``new_text_field_name`` added or overwritten.
    """

    def _normalize(text):
        # Replace with a space rather than "" so neighbouring words do not fuse.
        text = re.sub(r"(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|^rt|http.+?", " ", text)
        text = re.sub(r"[^0-9A-Za-z \t]", " ", text)
        # remove numbers
        text = re.sub(r"\d+", "", text)
        return " ".join(text.split())

    df[new_text_field_name] = df[text_field].fillna("").str.lower().map(_normalize)
    return df

In [None]:
# clean_text lower-cases the text itself, so no separate pre-lowercasing step is needed.
# It writes 'text_clean' into df in place and returns that same frame:
# data_clean and df refer to one object.
data_clean = clean_text(df, 'content', 'text_clean')
data_clean.head(10)

Unnamed: 0,content,score,text_clean
0,bagus,5,bagus
1,barangnya bagus 👍,5,barangnya bagus
2,kenapa gak bisa cod lagi ya,4,kenapa gak bisa cod lagi ya
3,good,5,good
4,mntapp,5,mntapp
5,banyak diskon pengiriman aman pengemasan baik,5,banyak diskon pengiriman aman pengemasan baik
6,memudahkan ibu2 yg bekerja,5,memudahkan ibu yg bekerja
7,semoga cepat sampai dan tidak mengecewakan ..,5,semoga cepat sampai dan tidak mengecewakan
8,"saya puas blanja di lazada,barangnya bagus den...",5,saya puas blanja di lazadabarangnya bagus deng...
9,dryio,5,dryio


Stopword Removal

In [None]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = stopwords.words('indonesian')
# A set gives constant-time membership tests; the list above is scanned linearly for every word.
stop_set = set(stop)

# NOTE(review): the recorded output shows negations being removed
# ("tidak mengecewakan" -> "mengecewakan"), which can invert the meaning of a
# review. Consider excluding negation words from the stop list.
data_clean['text_StopWord'] = data_clean['text_clean'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_set])
)
data_clean.head(50)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,content,score,text_clean,text_StopWord
0,bagus,5,bagus,bagus
1,barangnya bagus 👍,5,barangnya bagus,barangnya bagus
2,kenapa gak bisa cod lagi ya,4,kenapa gak bisa cod lagi ya,gak cod ya
3,good,5,good,good
4,mntapp,5,mntapp,mntapp
5,banyak diskon pengiriman aman pengemasan baik,5,banyak diskon pengiriman aman pengemasan baik,diskon pengiriman aman pengemasan
6,memudahkan ibu2 yg bekerja,5,memudahkan ibu yg bekerja,memudahkan yg
7,semoga cepat sampai dan tidak mengecewakan ..,5,semoga cepat sampai dan tidak mengecewakan,semoga cepat mengecewakan
8,"saya puas blanja di lazada,barangnya bagus den...",5,saya puas blanja di lazadabarangnya bagus deng...,puas blanja lazadabarangnya bagus harga yg ter...
9,dryio,5,dryio,dryio


**Tokenizing**


In [None]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Split each stopword-filtered text into a list of word tokens.
data_clean['text_tokens'] = data_clean['text_StopWord'].apply(word_tokenize)
data_clean.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens
0,bagus,5,bagus,bagus,[bagus]
1,barangnya bagus 👍,5,barangnya bagus,barangnya bagus,"[barangnya, bagus]"
2,kenapa gak bisa cod lagi ya,4,kenapa gak bisa cod lagi ya,gak cod ya,"[gak, cod, ya]"
3,good,5,good,good,[good]
4,mntapp,5,mntapp,mntapp,[mntapp]


**Stemming**

In [None]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Build the Sastrawi stemmer; `stemmer.stem(term)` is what the stemming cell calls per term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
#-----------------STEMMING -----------------
# `stemmer` is created in the previous cell, so it is not built a second time here.


def stemmed_wrapper(term):
    """Return the stem of a single term using the shared Sastrawi stemmer."""
    return stemmer.stem(term)


# Distinct tokens in first-seen order. Each distinct term is stemmed once and
# the result is reused for every occurrence.
unique_terms = dict.fromkeys(
    term for document in data_clean['text_tokens'] for term in document
)

# A progress bar replaces one printed line per term, which flooded the cell
# output, and the full dictionary is no longer printed.
term_dict = {
    term: stemmed_wrapper(term)
    for term in tqdm(unique_terms, desc="stemming terms")
}

hitung = len(term_dict)  # number of distinct terms; name kept from the original cell
print(hitung, "distinct terms stemmed")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Map every token of ``document`` to its stem from ``term_dict``."""
    return [term_dict[term] for term in document]


# This step can be re-run separately once term_dict has been built.
data_clean['text_steamindo'] = data_clean['text_tokens'].apply(lambda x:' '.join(get_stemmed_term(x)))
data_clean.head(20)

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
2300 : borong : borong
2301 : sua : sua
2302 : berkurang : kurang
2303 : apik : apik
2304 : menjual : jual
2305 : photo : photo
2306 : dikantong : kantong
2307 : loginpdhal : loginpdhal
2308 : wifi : wifi
2309 : bermutu : mutu
2310 : trms : trms
2311 : permasalahan : masalah
2312 : optimalkan : optimal
2313 : mantapp : mantapp
2314 : okee : okee
2315 : lohhh : lohhh
2316 : knapa : knapa
2317 : mantabb : mantabb
2318 : man : man
2319 : faat : faat
2320 : pokonyah : pokonyah
2321 : tamba : tamba
2322 : melebihi : lebih
2323 : batas : batas
2324 : ditunggu : tunggu
2325 : kuwalitas : kuwalitas
2326 : makasi : makas
2327 : membntuuntuk : membntuuntuk
2328 : maci : maci
2329 : blaja : blaja
2330 : nati : nati
2331 : hamba : hamba
2332 : urusan : urus
2333 : gajelasmain : gajelasmain
2334 : nolkecewa : nolkecewa
2335 : eww : eww
2336 : pass : pass
2337 : dibuka : buka
2338 : barangyang : barangyang
2339 : mudahmurah : m

Unnamed: 0,content,score,text_clean,text_StopWord,text_tokens,text_steamindo
0,bagus,5,bagus,bagus,[bagus],bagus
1,barangnya bagus 👍,5,barangnya bagus,barangnya bagus,"[barangnya, bagus]",barang bagus
2,kenapa gak bisa cod lagi ya,4,kenapa gak bisa cod lagi ya,gak cod ya,"[gak, cod, ya]",gak cod ya
3,good,5,good,good,[good],good
4,mntapp,5,mntapp,mntapp,[mntapp],mntapp
5,banyak diskon pengiriman aman pengemasan baik,5,banyak diskon pengiriman aman pengemasan baik,diskon pengiriman aman pengemasan,"[diskon, pengiriman, aman, pengemasan]",diskon kirim aman emas
6,memudahkan ibu2 yg bekerja,5,memudahkan ibu yg bekerja,memudahkan yg,"[memudahkan, yg]",mudah yg
7,semoga cepat sampai dan tidak mengecewakan ..,5,semoga cepat sampai dan tidak mengecewakan,semoga cepat mengecewakan,"[semoga, cepat, mengecewakan]",moga cepat kecewa
8,"saya puas blanja di lazada,barangnya bagus den...",5,saya puas blanja di lazadabarangnya bagus deng...,puas blanja lazadabarangnya bagus harga yg ter...,"[puas, blanja, lazadabarangnya, bagus, harga, ...",puas blanja lazadabarangnya bagus harga yg jan...
9,dryio,5,dryio,dryio,[dryio],dryio


In [None]:
data_clean.to_csv('lazada.csv', index= False) # save the text-preprocessing result to a CSV file