In [37]:
# Import the required libraries
# webdriver is a library that allows to control a web browser
from selenium import webdriver

# BeautifulSoup is a library that makes it easy to scrape information from web pages
from bs4 import BeautifulSoup

# time is a library that allows to pause the program for a certain amount of time
import time

# configparser is a module to work with configuration files
import configparser

In [38]:
# config is an object of RawConfigParser class
config = configparser.RawConfigParser()

# Read the configuration file
config.read("app.properties")

['app.properties']

In [39]:
# url is a variable that contains the URL of the web page that the program will scrape
#
# To update the path of the CSV file path, update the value of csv.path in app.properties file
# specificly in the scrapper.url at ScrapperSection section
url: str = config.get("ScrapperSection", "scrapper.url")

In [40]:
# rows_to_scrolls is a variable that contains the number of rows that the program will scroll
#
# To update the path of the CSV file path, update the value of csv.path in app.properties file
# specificly in the scrapper.rows_to_scroll at ScrapperSection section
rows_to_scrolls: int = int(config.get("ScrapperSection", "scrapper.rows_to_scroll"))

In [41]:
# csv_file_path is a variable that contains the path of the CSV file that the program will create
# to store the result of scraped data
#
# To update the path of the CSV file path, update the value of csv.path in app.properties file
# specificly in the csv.path at CSVSection section
csv_file_path: str = config.get("CSVSection", "csv.path")

In [42]:
# driver is a variable that contains the web browser that the program will control
# In this case, the program will open the web browser automatically
#
# Make sure to replace the web browser with the web browser that
# you have installed on your computer
driver = webdriver.Firefox()

In [43]:
# Controls the web browser to open the URL
driver.get(url)

In [44]:
# Parse the HTML content of the web page using BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Print the HTML content of the web page
print(soup)

<html class="responsive" lang="id"><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="#171a21" name="theme-color"/>
<title>Komunitas Steam :: Apex Legends</title>
<link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="https://community.fastly.steamstatic.com/public/shared/css/motiva_sans.css?v=-yZgCk0Nu7kH&amp;l=indonesian&amp;_cdn=fastly" rel="stylesheet" type="text/css"/>
<link href="https://community.fastly.steamstatic.com/public/shared/css/buttons.css?v=qhQgyjWi6LgJ&amp;l=indonesian&amp;_cdn=fastly" rel="stylesheet" type="text/css"/>
<link href="https://community.fastly.steamstatic.com/public/shared/css/shared_global.css?v=wuA4X_n5-mo0&amp;l=indonesian&amp;_cdn=fastly" rel="stylesheet" type="text/css"/>
<link href="https://community.fastly.steamstatic.com/public/css/globalv2.css?v=hzEgqbtRcI5V&amp;l=indonesian&amp;_cdn=fastly" rel="stylesheet" type="

In [45]:
# data is a variable that contains the result of the scraping process
data: list = []

In [46]:
def extract_data(review) -> list:
    """
    extract_item_data is a function that extracts the data of an item on the web page

    Parameters
    ----------
    - item: The item that the function will extract the data from

    Returns
    -------
    A list that contains the data of the item
    """

    # review_recommended is a variable that contains the recommendation of the review
    review_recommended = review.find("div", class_="title").text

    # review_date is a variable that contains the date of the review
    review_date = review.find("div", class_="date_posted").text

    # review_rating is a variable that contains the rating of the review
    review_message = review.find("div", class_="apphub_CardTextContent").text

    return [
        review_message.replace(review_date, ""),
        (
            "positive"
            if review_recommended == "Recommended"
            or review_recommended == "Direkomendasikan"
            else "negative"
        ),
    ]

In [47]:
# Loop through the number of rows that the program will scrape
for rows in range(rows_to_scrolls):
    # Execute JavaScript to scroll to the bottom of the page
    driver.execute_script("window.scrollBy(0, 250);")

    # Wait for a few seconds before continue to next page
    time.sleep(3)

# Parse the HTML content of the web page using BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# reviews is a variable that contains the product reviews
reviews = soup.findAll("div", class_="apphub_CardContentMain")

# Loop through the product reviews
for review in reviews:
    # There is a possibility that the program will encounter an error
    # for example when the program cannot find the element that it is looking for
    # especially when the product review is not available
    try:
        # Extract the data of the product review
        extracted_data = extract_data(review)

        # Append the extracted data to the data variable
        data.append(extracted_data)
    except Exception as e:
        continue

In [48]:
# Close the web browser after the scraping process is completed
driver.close()

In [49]:
# pandas is a library that allows to work with data in a tabular format
import pandas as pd

In [50]:
# df is a variable that contains the result of the scraping process in a tabular format
# The tabular format makes it easier to analyze the data
df = pd.DataFrame(
    data,
    columns=[
        "Review",
        "Label"
    ],
)

# Print the result of the scraping process
df

Unnamed: 0,Review,Label
0,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t2 10 dolla...,negative
1,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tNow with t...,negative
2,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAt least y...,positive
3,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFinally I ...,positive
4,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFinally he...,positive
...,...,...
1015,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAfter play...,positive
1016,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tI have no ...,negative
1017,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tHot new em...,positive
1018,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tthese scum...,negative


In [51]:
# Save the result of the scraping process to a CSV file
df.to_csv(path_or_buf=csv_file_path, index=False)