In [14]:
# Import the required libraries
# webdriver is a library that allows to control a web browser
from selenium import webdriver

# WebDriverWait is a library that allows to wait for a certain condition to be met
from selenium.webdriver.support.ui import WebDriverWait

# By is a library that allows to select elements on a web page
from selenium.webdriver.common.by import By

# expected_conditions is a library to specify the condition that needs to be met
from selenium.webdriver.support import expected_conditions as EC

# pandas is a library that allows to work with data in a tabular format
import pandas as pd

# BeautifulSoup is a library that makes it easy to scrape information from web pages
from bs4 import BeautifulSoup

# time is a library that allows to pause the program for a certain amount of time
import time

In [15]:
# Search-results page that the scraper opens first.
#
# Point this at any other Tokopedia search URL to scrape a different query.
# The string is split per query parameter purely for readability; Python
# joins adjacent string literals into a single value.
url: str = (
    "https://www.tokopedia.com/search"
    "?q=kabel+lan"
    "&source=universe"
    "&st=product"
    "&navsource=home"
    "&srp_component_id=02.02.01.02"
)

In [16]:
# Number of result pages to scrape, starting from the page that `url` opens.
#
# Increase or decrease this value to scrape more or fewer pages.
page_total_to_scrape: int = 3

In [17]:
# Number of scroll steps performed on every page before its HTML is parsed.
# Each step scrolls the window down by 250 pixels and then pauses for one second.
#
# Increase this value if the products near the bottom of a page are missing from the result.
rows_to_scrolls: int = 21

In [18]:
# Number of seconds the program pauses (time.sleep) on each page before it starts scrolling.
# NOTE(review): the pause is presumably meant to keep the request rate low so the
# server does not block the scraper - confirm the intent.
timeout: int = 10

In [19]:
# Path of the CSV file that receives the preprocessed scraping result.
#
# Replace the value with the location where the CSV file should be written.
csv_file_path: str = "tokopedia_kabel_lan.csv"

In [20]:
# Start a Firefox session controlled by Selenium.
# Running this cell opens a browser window automatically.
#
# Swap webdriver.Firefox() for the driver class of a browser that is
# installed on your computer if Firefox is not available.
driver = webdriver.Firefox()

In [21]:
# Navigate the controlled browser to the search-results URL
driver.get(url)

In [22]:
# Parse the HTML content of the web page using BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Show only the beginning of the document as a sanity check that the page loaded.
# Printing the whole page source would flood the notebook output and bloat the file.
print(str(soup)[:1000])

<html data-rh="lang,translate" lang="id" translate="no"><head>
<title>Jual kabel lan | Tokopedia</title>
<meta charset="utf-8"/>
<meta content="initial-scale=1, minimum-scale=1, maximum-scale=5, user-scalable=yes, width=device-width" name="viewport"/>
<link crossorigin="" href="https://assets.tokopedia.net" rel="preconnect"/><link href="https://assets.tokopedia.net" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://gql.tokopedia.com" rel="preconnect"/><link href="https://gql.tokopedia.com" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://googleads.g.doubleclick.net" rel="preconnect"/><link href="https://googleads.g.doubleclick.net" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://www.google-analytics.com" rel="preconnect"/><link href="https://www.google-analytics.com" nonce="" rel="dns-prefetch"/>
<link crossorigin="" href="https://images.tokopedia.net" rel="preconnect"/><link href="https://images.tokopedia.net" nonce="" rel="dns-prefetch"

In [23]:
# Accumulates the scraped rows; the scraping loop appends one list per product
# (the value returned by extract_item_data)
data: list = []

In [24]:
def scrape_data(item, htmlEle: str, className: str, className2: str = "") -> str:
    """
    scrape_data is a function that returns the text of the first element inside
    `item` that has the given tag name and class name

    Parameters
    ----------
    - item: The parsed element (product card) to search in; it must provide a
      BeautifulSoup-style `find(name, class_=...)` method
    - htmlEle: The tag name of the element that holds the data, e.g. "span"
    - className: The class name to look for first
    - className2: Optional fallback class name, tried only when `className`
      matches nothing. An empty string (the default) disables the fallback

    Returns
    -------
    The text of the matching element, or an empty string if nothing matches
    """

    element = item.find(htmlEle, class_=className)

    # Only run the fallback lookup when a fallback class was actually supplied.
    # Searching with class_="" is not a no-op: depending on the BeautifulSoup
    # version it can match a tag that has no class attribute at all, which would
    # return the text of an unrelated element.
    if element is None and className2:
        element = item.find(htmlEle, class_=className2)

    if element is None:
        return ""

    return element.text

In [25]:
def extract_item_data(item) -> list:
    """
    extract_item_data is a function that extracts the data of one product card

    Parameters
    ----------
    - item: The parsed product card (BeautifulSoup element) to extract the data from

    Returns
    -------
    A list with seven strings, in this order: product name, price, discount,
    rating, sold, supplier name, supplier location. The first five are empty
    strings when the corresponding element is missing from the card.

    Raises
    ------
    IndexError if the card contains fewer than two supplier elements
    (no supplier name and/or no supplier location)
    """

    # product_name is a variable that contains the name of the product
    product_name = scrape_data(
        item=item, htmlEle="span", className="_0T8-iGxMpV6NEsYEhwkqEg=="
    )

    # product_price is a variable that contains the price of the product
    product_price = scrape_data(
        item=item, htmlEle="div", className="_67d6E1xDKIzw+i2D2L0tjw=="
    )

    # product_discount is a variable that contains the discount of the product.
    # The discount is looked up under two class names; the second is the fallback.
    product_discount = scrape_data(
        item=item,
        htmlEle="span",
        className="vRrrC5GSv6FRRkbCqM7QcQ==",
        className2="_4bmCSEkQ972HG5f0btO-HQ==",
    )

    # product_rating is a variable that contains the rating of the product
    product_rating = scrape_data(
        item=item, htmlEle="span", className="_9jWGz3C-GX7Myq-32zWG9w=="
    )

    # product_sold is a variable that contains the number of products sold
    product_sold = scrape_data(
        item=item,
        htmlEle="span",
        className="se8WAnkjbVXZNA8mT+Veuw==",
    )

    # The supplier name and the supplier location share the same class name on
    # the same kind of element, so they can only be told apart by position:
    # the first match is the supplier name, the second is the supplier location.
    #
    # The elements are queried once and reused for both values.
    supplier_elements = item.find_all("span", class_="pC8DMVkBZGW7-egObcWMFQ==")

    # Indexing raises IndexError when a card lacks these elements; the caller
    # relies on that exception to skip such cards.
    supplier_name = supplier_elements[0].text
    supplier_location = supplier_elements[1].text

    return [
        product_name,
        product_price,
        product_discount,
        product_rating,
        product_sold,
        supplier_name,
        supplier_location,
    ]

In [26]:
# Loop through the number of pages that the program will scrape
for page in range(page_total_to_scrape):
    # Wait (up to 5 seconds) until the element with the id "zeus-root" is present,
    # which is treated as the signal that the page has been loaded
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#zeus-root"))
    )

    # Pause for `timeout` seconds before scraping data from the web page
    time.sleep(timeout)

    # Scroll down step by step so that the product cards further down the page are rendered
    for scroll_step in range(rows_to_scrolls):
        # Execute JavaScript to scroll the window down by 250 pixels
        driver.execute_script("window.scrollBy(0, 250);")

        # Wait for a second before the next scroll step
        time.sleep(1)

    # Scroll the window 50 pixels to the right
    driver.execute_script("window.scrollBy(50, 0);")

    # Wait for a second before continuing to the next step
    time.sleep(1)

    # Parse the HTML content of the web page using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Loop through the product cards on the web page
    for item in soup.find_all("div", class_="bYD8FcVCFyOBiVyITwDj1Q=="):
        # Some cards (e.g. recommended products) have no supplier name and
        # location. extract_item_data raises IndexError for those cards, so
        # they are skipped. Only IndexError is caught on purpose: any other
        # exception points to a real problem and should not be hidden.
        try:
            data.append(extract_item_data(item))
        except IndexError:
            continue

    # The last requested page has been scraped: do not load another page
    if page == page_total_to_scrape - 1:
        break

    # Wait for a few seconds before continuing to the next page
    time.sleep(2)

    # Look up the next page button. find_elements returns an empty list
    # (instead of raising) when the button does not exist, e.g. on the final
    # page of the search result.
    next_page_buttons = driver.find_elements(
        By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']"
    )

    # Stop early, keeping the data collected so far, when there is no next page
    if not next_page_buttons:
        break

    # Proceed to the next page by clicking the next page button
    next_page_buttons[0].click()

    # Wait for a few seconds before continuing the process on the next page
    time.sleep(3)

In [27]:
# Build a DataFrame from the scraped rows.
# The column order matches the order of the values returned by extract_item_data.
df = pd.DataFrame(
    data,
    columns=[
        "Product Name",
        "Price",
        "Discount",
        "Rating",
        "Sold",
        "Supplier Name",
        "Location",
    ],
)

# Display the raw (not yet preprocessed) result as the cell output
df

Unnamed: 0,Product Name,Price,Discount,Rating,Sold,Supplier Name,Location
0,VENTION Kabel LAN RJ45 Cat.6 Gigabit Ethernet ...,Rp12.600,10%,5.0,10rb+ terjual,Vention Authorized Store,Jakarta Barat
1,DT - Vention Kabel LAN RJ45 Cat.6 Gigabit Ethe...,Rp44.730,10%,5.0,7rb+ terjual,Vention Authorized Store,Jakarta Barat
2,Vention Kabel LAN Cat6 - UTP Gigabit Ethernet ...,Rp14.600,,5.0,100+ terjual,idos thing,Jakarta Timur
3,BELDEN 1583A Kabel LAN UTP Cat-5E ORIGINAL 305...,Rp1.100.000,,4.9,3rb+ terjual,Pusat Jaringan Nusantara,Jakarta Pusat
4,BELDEN 7814A Kabel LAN UTP Cat. 6 Roll/Box/300M,Rp2.090.000,,5.0,3rb+ terjual,SANJAYA COMTRONIX,Jakarta Pusat
...,...,...,...,...,...,...,...
250,[15M - VPC7] Vention Kabel LAN RJ45 Cat.7 SSTP...,Rp217.200,,5.0,250+ terjual,Inprotek,Jakarta Utara
251,C620W - PATCH CORD 20 METER UTP CAT 6 KABEL LA...,Rp142.500,25%,5.0,14 terjual,Nusa Rackindo,Jakarta Pusat
252,UGREEN Barel RJ45 Sambungan Kabel Lan Cat5 Cat...,Rp40.000,60%,5.0,500+ terjual,UGREEN Official Store,Jakarta Pusat
253,C63B - PATCH CORD 3 METER UTP CAT 6 KABEL LAN ...,Rp25.500,25%,5.0,1rb+ terjual,Nusa Rackindo,Jakarta Pusat


In [28]:
def pre_process_price(row) -> int:
    """
    pre_process_price is a function that converts the scraped price text of a
    product into an integer

    Parameters
    ----------
    - row: The row that holds the scraped price text under the "Price" key,
      e.g. "Rp1.100.000"

    Returns
    -------
    The price as an integer, e.g. 1100000, or 0 when the price text is empty
    (the scraper stores an empty string when the price element is missing)
    """

    # Remove the currency symbol and the "." thousands separators from the price
    digits = row["Price"].replace("Rp", "").replace(".", "").strip()

    # An empty string cannot be converted with int(); treat it as "no price",
    # the same way pre_process_sold treats an empty value
    if not digits:
        return 0

    return int(digits)

In [29]:
def pre_process_discount(row) -> int:
    """
    pre_process_discount is a function that converts the scraped discount text
    of a product into an integer

    Parameters
    ----------
    - row: The row that holds the discount text under the "Discount" key and the
      already preprocessed, numeric price under the "Price" key

    Returns
    -------
    - For a percentage discount (e.g. "10%"): the price before the discount,
      computed as Price / (1 - percentage / 100) and rounded to the nearest
      integer. 0 is returned for a percentage of 100 or more, because the
      original price cannot be derived in that case.
    - For an amount discount (e.g. "Diskon 5rb"): the discount amount, with
      "rb" expanded to thousands and "jt" to millions.
    - 0 when there is no discount.

    NOTE(review): the two branches return different kinds of values (original
    price versus discount amount). This is the behavior of the original code;
    confirm which meaning the "Discount" column is supposed to have.
    """

    discount = row["Discount"].strip()

    # Check if the discount is in percentage
    if "%" in discount:
        percentage = int(discount.replace("%", ""))

        # Guard against a division by zero (100%) and meaningless negative results
        if percentage >= 100:
            return 0

        # Scale to whole numbers first and round instead of truncating, so that
        # floating point error cannot make the result one too low
        return int(round(row["Price"] * 100 / (100 - percentage)))
    elif "Diskon " in discount:
        # Remove the "Diskon " text (and a currency symbol, if any)
        amount = discount.replace("Diskon ", "").replace("Rp", "").strip()

        # "rb" (ribu) means thousand and "jt" (juta) means million
        multiplier = 1
        for suffix, factor in (("rb", 1_000), ("jt", 1_000_000)):
            if amount.endswith(suffix):
                multiplier = factor
                amount = amount[: -len(suffix)].strip()
                break

        # Indonesian number format: "." separates thousands, "," is the decimal mark
        number = float(amount.replace(".", "").replace(",", "."))

        return int(round(number * multiplier))

    return 0

In [30]:
def pre_process_sold(row) -> int:
    """
    pre_process_sold is a function that converts the scraped "sold" text of a
    product into an integer

    Parameters
    ----------
    - row: The row that holds the sold text under the "Sold" key,
      e.g. "10rb+ terjual", "250+ terjual" or "14 terjual"

    Returns
    -------
    The number of products sold, e.g. 10000 for "10rb+ terjual".
    0 is returned when the sold text is empty.
    """

    # Remove the "terjual" text and the "+" sign
    sold = row["Sold"].lower().replace("terjual", "").replace("+", "").strip()

    # An empty value means that no sold information was scraped
    if sold == "":
        return 0

    # "rb" (ribu) means thousand and "jt" (juta) means million
    multiplier = 1
    for suffix, factor in (("rb", 1_000), ("jt", 1_000_000)):
        if sold.endswith(suffix):
            multiplier = factor
            sold = sold[: -len(suffix)].strip()
            break

    # Indonesian number format: "." separates thousands, "," is the decimal mark
    number = float(sold.replace(".", "").replace(",", "."))

    return int(round(number * multiplier))

In [31]:
def pre_process(frame: pd.DataFrame) -> pd.DataFrame:
    """
    pre_process is a function that preprocesses the data before saving it to a CSV file

    The "Price", "Discount" and "Sold" columns of `frame` are replaced in place
    with their numeric values. "Price" is converted first because the discount
    preprocessing reads the numeric price.

    Parameters
    ----------
    - frame: The data frame that the function will preprocess. It must hold
      the raw scraped strings, so it cannot be preprocessed twice.

    Returns
    -------
    The same data frame that was passed in, after preprocessing
    """

    # Preprocess the "Price" column
    frame["Price"] = frame.apply(pre_process_price, axis=1)

    # Preprocess the "Discount" column (uses the numeric "Price" column)
    frame["Discount"] = frame.apply(pre_process_discount, axis=1)

    # Preprocess the "Sold" column
    frame["Sold"] = frame.apply(pre_process_sold, axis=1)

    # Return the frame that was passed in, not the notebook-level `df` variable
    return frame

In [32]:
# Call the pre_process function to preprocess the data
#
# Run this cell only once per scrape: the preprocessing helpers expect the raw
# scraped strings and fail on the numeric values this cell leaves behind.
df = pre_process(frame=df)

In [33]:
# Write the preprocessed data to the CSV file, without the DataFrame index column
df.to_csv(csv_file_path, index=False)

In [34]:
# Shut the browser down after the scraping process is completed.
# quit() closes every window and also ends the WebDriver session and its
# driver process; close() would only close the current window.
driver.quit()