In [242]:
# Import the required libraries
# webdriver is a library that allows to control a web browser
from selenium import webdriver

# WebDriverWait is a library that allows to wait for a certain condition to be met
from selenium.webdriver.support.ui import WebDriverWait

# By is a library that allows to select elements on a web page
from selenium.webdriver.common.by import By

# expected_conditions is a library to specify the condition that needs to be met
from selenium.webdriver.support import expected_conditions as EC

# BeautifulSoup is a library that makes it easy to scrape information from web pages
from bs4 import BeautifulSoup

# time is a library that allows to pause the program for a certain amount of time
import time

In [243]:
# url is a variable that contains the URL of the web page that the program will scrape
#
# Replace the value of the variable with the URL of the web page that you want to scrape
# from product reviews
url: str = (
    "https://www.tokopedia.com/digitechmall/apple-iphone-15-pro-max-promax-128gb-256gb-512gb-resmi-apple-indonesia-15-promax-1tb-natural-0f1a0/review"
)

In [244]:
# page_total_to_scrape is a variable that contains the number of pages that the program will scrape
#
# Replace the value of the variable with the number of pages that you want to scrape
page_total_to_scrape: int = 2

In [245]:
# rows_to_scrolls is a variable that contains the number of rows that the program will scroll
#
# Replace the value of the variable with the number of rows that you want to scroll
rows_to_scrolls: int = 35

In [246]:
# timeout is a variable that contains the number of seconds to wait for a response from the server
# This prevent the program from getting blocked from the server
timeout: int = 10

In [None]:
# configparser is a module to work with configuration files
import configparser

In [None]:
# config is an object of RawConfigParser class
config = configparser.RawConfigParser()

# Read the configuration file
config.read("app.properties")

In [247]:
# csv_file_path is a variable that contains the path of the CSV file that the program will create
# to store the result of scraped data
#
# To update the path of the CSV file path, update the value of csv.path in app.properties file
csv_file_path: str = config.get("CSVSection", "csv.path")

In [248]:
# options is a variable that contains the options to start the web browser
# with certain arguments
#
# Make sure to replace the option method with the web browser that you are using
# For example, if you are using Google Chrome, replace the method with ChromeOptions
options = webdriver.FirefoxOptions()

# Add the argument to start the web browser in a minimized window
options.add_argument("--start-minimized")

In [249]:
# driver is a variable that contains the web browser that the program will control
# In this case, the program will open the web browser automatically
#
# Make sure to replace the web browser with the web browser that
# you have installed on your computer
driver = webdriver.Firefox()

In [250]:
# Controls the web browser to open the URL
driver.get(url)

In [251]:
# Parse the HTML content of the web page using BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Print the HTML content of the web page
print(soup)

<html data-rh="lang,translate" lang="id" translate="no"><head>
<title>Review APPLE iPhone 15 Pro Max Promax 128GB 256GB 512GB Resmi APPLE Indonesia - 15 Promax 1TB, Natural | Tokopedia</title>
<meta charset="utf-8"/>
<meta content="initial-scale=1, minimum-scale=1, maximum-scale=5, user-scalable=yes, width=device-width" name="viewport"/>
<link crossorigin="" href="https://assets.tokopedia.net" rel="preconnect"/><link href="https://assets.tokopedia.net" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://gql.tokopedia.com" rel="preconnect"/><link href="https://gql.tokopedia.com" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://googleads.g.doubleclick.net" rel="preconnect"/><link href="https://googleads.g.doubleclick.net" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://www.google-analytics.com" rel="preconnect"/><link href="https://www.google-analytics.com" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://images.tokopedia.net"

In [252]:
# data is a variable that contains the result of the scraping process
data: list = []

In [253]:
def extract_data(review) -> list:
    """
    extract_item_data is a function that extracts the data of an item on the web page

    Parameters
    ----------
    - item: The item that the function will extract the data from

    Returns
    -------
    A list that contains the data of the item
    """
    
    # review_rating is a variable that contains the rating of the review
    review_message = review.find("span", attrs={"data-testid": "lblItemUlasan"})

    # Check if review_message is None
    if review_message is None:
        review_message = ""
    else:
        # Extract the text of the review_message
        review_message = review_message.text

    return [review_message]

In [254]:
# Loop through the number of pages that the program will scrape
for page in range(0, page_total_to_scrape - 1):
    # Waits for the web page to load completely for a certain amount of time
    # The web page is considered loaded when the element with the class name "zeus-root" is present
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "#zeus-root"))
    )

    # Wait for a few seconds before scrapping data from web page
    time.sleep(timeout)

    # Loop through the number of rows that the program will scrape
    for rows in range(rows_to_scrolls):
        # Execute JavaScript to scroll to the bottom of the page
        driver.execute_script("window.scrollBy(0, 250);")

        # Wait for a few seconds before continue to next page
        time.sleep(1)

    driver.execute_script("window.scrollBy(50, 0);")

    # Wait for a few seconds before continue to next step
    time.sleep(1)

    # Parse the HTML content of the web page using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # reviews is a variable that contains the product reviews
    reviews = soup.findAll("article", class_="css-72zbc4")

    # Loop through the product reviews
    for review in reviews:
        # There is a possibility that the program will encounter an error
        # for example when the program cannot find the element that it is looking for
        # especially when the product review is not available
        try:
            # Extract the data of the product review
            extracted_data = extract_data(review)

            # Append the extracted data to the data variable
            data.append(extracted_data)
        except Exception as e:
            continue

    # Wait for a few seconds before continue to next page
    time.sleep(2)
    
    # Procceed to the next page by clicking the next page button
    driver.find_element(
        By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']"
    ).click()

    # Wait for a few seconds before contining proccess in the next page
    time.sleep(3)

In [255]:
# pandas is a library that allows to work with data in a tabular format
import pandas as pd

In [256]:
# df is a variable that contains the result of the scraping process in a tabular format
# The tabular format makes it easier to analyze the data
df = pd.DataFrame(
    data,
    columns=[
        "Review",
    ],
)

# Print the result of the scraping process
df

Unnamed: 0,Review
0,Barang Ori & cepat prosesnya
1,"Packing aman, dan unit kondisi baik."
2,
3,👍👍👍👍👍👍👍
4,Bagus
5,"Penjual responsif, produk yang diterima sesuai..."
6,Barang original. Bagus. Sampai cepat. Rekomend...
7,"Packingnya aman, barangnya juga masih segel, d..."
8,Barangnya mulus mendarat
9,ok


In [257]:
# re is a library that allows to work with regular expressions
import re

# emoji is a library that allows to work with emojis
import emoji

# string is a library that allows to work with strings
import string

In [258]:
# nltk.tokenize is a library that allows to tokenize the text
# word_tokenize is a function that tokenizes the text into words
from nltk.tokenize import word_tokenize

# nltk.corpus is a library that allows to work with the corpus
# stopwords is a list of common words that are not useful for analysis
from nltk.corpus import stopwords

# nltk is a library that allows to work with natural language processing
import nltk

In [259]:
# nltk.download is a function that downloads the required data for natural language processing
nltk.download("stopwords")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [260]:
def tokenize(reviews) -> pd.DataFrame:
    """
    tokenize is a function that tokenizes the text into words

    Parameters
    ----------
    - df: The data that the function will tokenize

    Returns
    -------
    The tokenized data
    """

    # Tokenize the word
    tokens = word_tokenize(reviews)

    # Remove stop words
    stop_words = set(stopwords.words("indonesian"))

    # Remove the stop words from the tokens
    tokens_no_stopwords = [word for word in tokens if word not in stop_words]

    return tokens_no_stopwords

In [261]:
# Sastrawi is a library that allows to stem the words
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [262]:
def stemming(reviews):
    """
    stemming is a function that stems the words

    Parameters
    ----------
    - df: The data that the function will stem

    Returns
    -------
    The stemmed data
    """

    # factory is a variable that contains the factory to create a stemmer
    factory = StemmerFactory()

    # stemmer is a variable that contains the stemmer
    stemmer = factory.create_stemmer()
    
    # Stem the words in dataframe
    return [stemmer.stem(word) for word in reviews]

In [263]:
def pre_process(df: pd.DataFrame) -> pd.DataFrame:
    """'
    pre_process is a function that preprocesses the data before saving it to a CSV file

    Parameters
    ----------
    - df: The data that the function will preprocess

    Returns
    -------
    The preprocessed data
    """

    # Convert the "Review" column to lowercase and remove the leading and trailing whitespaces
    df["Review"] = df["Review"].apply(lambda x: x.lower().strip())

    # Remove the extra whitespaces from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: re.sub("\s+", " ", x))

    # Remove the emojis from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: emoji.replace_emoji(x, replace=""))

    # Remove the punctuation from the "Review" column
    df["Review"] = df["Review"].apply(
        lambda x: x.translate(str.maketrans("", "", string.punctuation))
    )

    # Remove the numbers from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: re.sub(r"\d+", "", x))

    # Call the tokenize function to tokenzie the words
    df["Review"] = df["Review"].apply(lambda x: tokenize(x))

    # Call the stemming function to stem the words
    df["Review"] = df["Review"].apply(lambda x: stemming(x))

    # Join the words in the "Review" column
    df["Review"] = df["Review"].apply(lambda x: " ".join(x))

    # Drop the rows with empty string
    df = df[df["Review"] != ""]

    return df

In [264]:
# Call the pre_process function to preprocess the data
df = pre_process(df=df)

# Print the result of the preprocessing process
df

Unnamed: 0,Review
0,barang ori cepat proses
1,packing aman unit kondisi
4,bagus
5,jual responsif produk terima sesuai pesan reco...
6,barang original bagus cepat rekomendasi
7,packingnya aman barang segel dapet diskon juta...
8,barang mulus darat
9,ok
10,barang terima normal ios update moga lancar te...
11,barang sdh tp test diupdate seller responsif k...


In [265]:
# nltk.sentiment is a library that allows to work with sentiment analysis
# SentimentIntensityAnalyzer is a class that analyzes the sentiment of the text
from nltk.sentiment import SentimentIntensityAnalyzer

In [266]:
# Download the required data for sentiment analysis
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...


True

In [267]:
# analyzer is a variable that contains the sentiment analyzer object
analyzer = SentimentIntensityAnalyzer()

In [268]:
def label_review(review) -> str:
    """'
    label_review is a function that labels the sentiment of the review

    Parameters
    ----------
    - df: The data that the function will label

    Returns
    -------
    The labeled data
    """

    # Perform sentiment analysis on the "Review" column
    # sentiment_score is a variable that contains the sentiment score of the review
    sentiment_score = analyzer.polarity_scores(review)

    # Check if the sentiment score is positive
    # compound_score is a variable that contains the compound score of the review
    compound_score = sentiment_score["compound"]

    # Return the sentiment label based on the compound score
    if compound_score > 0:
        return "positive"
    elif compound_score < 0:
        return "negative"

    return "neutral"

In [269]:
# Apply label review function to the data
df["Label"] = df["Review"].apply(lambda x: label_review(x))

# Print the result of the labeling process
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Label"] = df["Review"].apply(lambda x: label_review(x))


Unnamed: 0,Review,Label
0,barang ori cepat proses,neutral
1,packing aman unit kondisi,neutral
4,bagus,neutral
5,jual responsif produk terima sesuai pesan reco...,positive
6,barang original bagus cepat rekomendasi,positive
7,packingnya aman barang segel dapet diskon juta...,neutral
8,barang mulus darat,neutral
9,ok,positive
10,barang terima normal ios update moga lancar te...,neutral
11,barang sdh tp test diupdate seller responsif k...,neutral


In [270]:
# Save the result of the scraping process to a CSV file
df.to_csv(path_or_buf=csv_file_path, index=False)

In [271]:
# Close the web browser after the scraping process is completed
driver.close()