In [1]:
!pip install pandas
!pip install selenium
!pip install urllib3

Collecting selenium
  Downloading selenium-4.10.0-py3-none-any.whl (6.7 MB)
     ---------------------------------------- 6.7/6.7 MB 1.1 MB/s eta 0:00:00
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
     -------------------------------------- 384.9/384.9 kB 1.0 MB/s eta 0:00:00
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.10.3-py3-none-any.whl (17 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.1-py3-none-any.whl (14 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.3/58.3 kB 3.0 MB/s eta 0:00:00
Installing collected packages: outcome, h11, exceptiongroup, async-generator, wsproto, trio, trio-webs

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
import urllib.parse
import re
import time
import pandas as pd

In [4]:
class Tokopedia:
    def __init__(self, headless=True) -> None:
        self.driver = self.setup(headless)
        self.data = []

    def setup(self, headless):
        opt = webdriver.ChromeOptions()
        opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        if headless:
            opt.add_argument("--headless")
            opt.add_argument("--window-size=2560,1440")
            opt.add_argument('--ignore-certificate-errors')
            opt.add_argument('--allow-running-insecure-content')
            opt.add_argument("--disable-extensions")
            opt.add_argument("--proxy-server='direct://'")
            opt.add_argument("--proxy-bypass-list=*")
            opt.add_argument("--start-maximized")
            opt.add_argument('--headless')
            opt.add_argument('--disable-gpu')
            opt.add_argument('--disable-dev-shm-usage')
            opt.add_argument('--no-sandbox')
            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
            opt.add_argument(f'user-agent={user_agent}')

        return webdriver.Chrome(options=opt)

    def get_details(self, detail_container, category, rank):
        detail = dict()
        detail['rank'] = rank
        detail['category'] = category
        # Name
        try:
            name = detail_container.find_element(
                By.XPATH, ".//div[@data-testid='spnSRPProdName']").get_attribute("innerHTML")
            detail['name'] = name
        except Exception as e:
            detail['name'] = None

        # Price
        try:
            price = detail_container.find_element(
                By.XPATH, ".//div[@data-testid='spnSRPProdPrice']").get_attribute("innerHTML")
            price = float(re.sub('[^0-9]', '', price))
            detail['price'] = price
        except Exception as e:
            ...

        # Location
        try:
            location = detail_container.find_element(
                By.XPATH, ".//span[@data-testid='spnSRPProdTabShopLoc']").get_attribute("innerHTML")
            detail['location'] = location
        except Exception as e:
            detail['location'] = None

        # Rating
        try:
            rating = detail_container.find_element(By.XPATH, ".//*[contains(text(),'Terjual')]").find_element(
                By.XPATH, "preceding-sibling::span[2]").get_attribute("innerHTML")
            rating = float(rating)
            detail['rating'] = rating
        except Exception as e:
            detail['rating'] = None

        # Sold
        try:
            sold = detail_container.find_element(
                By.XPATH, ".//span[contains(text(),'Terjual')]").get_attribute("innerHTML")
            if ("rb" in sold):
                sold = int(re.sub('[^0-9]', '', sold))
                sold = sold * 1000
            else:
                sold = int(re.sub('[^0-9]', '', sold))
            detail['sold'] = sold
        except Exception as e:
            detail['sold'] = None
        return detail

    def search(self, cat):
        self.data = []

        url_safe_cat = urllib.parse.quote(cat)
        url = f"https://www.tokopedia.com/search?st=product&q={url_safe_cat}"
        print(f'Proses Scraping Sedang Berjalan Untuk Produk: {cat}..')
        self.driver.get(url)

        for i in range(2):
            time.sleep(1)
            containers = WebDriverWait(self.driver, 100).until(EC.presence_of_all_elements_located(
                (By.XPATH, "//div[@data-testid='master-product-card']")))

            for index, container in enumerate(containers):
                detail_container = container.find_element(By.TAG_NAME, "div").find_element(
                    By.TAG_NAME, "div").find_elements(By.XPATH, "./div")[1].find_element(By.TAG_NAME, "a")
                details = self.get_details(detail_container, cat, index)
                try:
                    links = container.find_element(
                        By.XPATH, './/a[contains(@href, "tokopedia.com")]')
                    url = links.get_attribute("href")
                    decoded_uri = urllib.parse.unquote(
                        url).split("?")[0]
                    if "ta.tokopedia.com" in decoded_uri:
                         details['url'] = 'Produk tidak dapat diklik karena Iklan / ADS'
                    else:
                        details['url'] = decoded_uri
                    self.data.append(details)
                except Exception:
                    details['url'] = None
                    
                try:
                    image = container.find_element(
                        By.XPATH, './/img[contains(@src, "images.tokopedia")]')
                    details['image'] = image.get_attribute("src")
                except:
                    details['image'] = None
                    print("here")
                self.driver.execute_script("window.scrollTo(0, 1000);")

        self.data = [dict(t) for t in {tuple(d.items())
                                       for d in self.data} if 'name' in dict(t)]

        return self.data

    def close_connection(self):
        self.driver.close()

In [9]:
if __name__ == '__main__':
    search_query = input("Masukkan kata kunci pencarian: ")
    
    print("Memulai pencarian...")
    tokopedia = Tokopedia(headless=True)
    
    items = tokopedia.search(search_query)
    df = pd.DataFrame(items)  
    df.to_excel(search_query+"hasil.xlsx", index=False)
    
    df = pd.DataFrame(items)  
    df.to_excel(search_query+"scrapping.xlsx", index=False)
    print("Pencarian selesai.")

Masukkan kata kunci pencarian: laptop
Memulai pencarian...
Proses Scraping Sedang Berjalan Untuk Produk: laptop..
Pencarian selesai.


In [10]:
df

Unnamed: 0,rank,category,name,price,location,rating,sold,url,image
0,80,laptop,CORE i7 GEN 8 TERMURAH ! LENOVO THINKPAD X280 ...,3100000.0,Jakarta Barat,4.9,250.0,Produk tidak dapat diklik karena Iklan / ADS,https://images.tokopedia.net/img/cache/200-squ...
1,3,laptop,LAPTOP HP ENVY X360 15 TOUCH CORE I5 1235 RAM ...,9699000.0,Jakarta Utara,5.0,9.0,Produk tidak dapat diklik karena Iklan / ADS,https://images.tokopedia.net/img/cache/200-squ...
2,8,laptop,ASUS VIVOBOOK 14X M1403QA RYZEN 5 5600H 8GB 51...,7989000.0,Jakarta Utara,4.9,1000.0,https://www.tokopedia.com/amd-id/asus-vivobook...,https://images.tokopedia.net/img/cache/200-squ...
3,71,laptop,"ADVAN Soulmate Laptop 14"" Intel N4020 4GB 128G...",2084000.0,Jakarta Barat,4.8,100.0,https://www.tokopedia.com/advan-official/advan...,https://images.tokopedia.net/img/cache/200-squ...
4,37,laptop,Asus Vivobook 14X M1403QA - Ryzen 5 5600H 16GB...,8189000.0,Jakarta Utara,5.0,250.0,https://www.tokopedia.com/toptech/asus-vivoboo...,https://images.tokopedia.net/img/cache/200-squ...
...,...,...,...,...,...,...,...,...,...
77,7,laptop,LAPTOP XIAOMI Redmi Book 15 - Core i3-1115G4 8...,5029000.0,Jakarta Pusat,4.9,500.0,https://www.tokopedia.com/protechcom/laptop-xi...,https://images.tokopedia.net/img/cache/200-squ...
78,75,laptop,LAPTOP HP CORE I5 6300 - RAM 8 GB - SSD M2 128...,2200000.0,Bekasi,5.0,70.0,https://www.tokopedia.com/deparicomputer/lapto...,https://images.tokopedia.net/img/cache/200-squ...
79,62,laptop,ACER Nitro 5 AN515-58 i5-12500H 8/16G 512G RTX...,13219000.0,Jakarta Pusat,4.9,250.0,https://www.tokopedia.com/collinsofficial/acer...,https://images.tokopedia.net/img/cache/200-squ...
80,70,laptop,ASUS VivoBook Ultra 15 OLED K513EA I5-1135G7 8...,10699000.0,Jakarta Pusat,4.9,100.0,https://www.tokopedia.com/collinsofficial/asus...,https://images.tokopedia.net/img/cache/200-squ...
