# Web Scraping - Shopee Seller Identity

---

For introduction of Selenium, please be kind to open [**this site**](https://www.scrapingbee.com/blog/selenium-python/)

## Import modules

In [None]:
%pip freeze > requirements.txt

In [12]:
# Modules for web scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# URL encoding
from requests.utils import requote_uri
# Module for data manipulation
import pandas as pd
# Module for regular expression
import re
# Module for file management
import os
# Module for timing
import time

## Load the Chromedriver

Read how to download webdriver for Chrome [**here**]('https://chromedriver.chromium.org/downloads')

**Options**

In [13]:
# Options for Chrome windows
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('window-size=2560,1440')

**Chromedriver path**

In [14]:
DRIVER_PATH = '../bin/chromedriver'
driver = webdriver.Chrome(executable_path = DRIVER_PATH, options = options)

## Core Procedure

**URL and Query**

In [143]:
# Main URL
url = 'https://shopee.co.id/search?keyword='
# Keyword search
seller = 'headset'
# Concat the keyword into main URL
query_encode = requote_uri(seller.strip('').lower())
url_query = url + query_encode

In [144]:
# Full URL with keyword
url_query

'https://shopee.co.id/search?keyword=headset'

In [145]:
# Root directory
dir_path = os.getcwd()

**Access website**

In [11]:
driver.get(url_query)
driver.implicitly_wait(20)

NameError: name 'driver' is not defined

In [147]:
# Document scrollHeight
scroll_height = driver.execute_script('return document.documentElement.scrollHeight;')
scroll_height

656

In [148]:
# Scroll down slowly
scroll_height = driver.execute_script('return document.documentElement.scrollHeight;')
for iters in range(0, scroll_height, 200):
    driver.execute_script('window.scrollTo(0, {});'.format(iters))
    time.sleep(1)

In [149]:
# Get the div of URL
product_list = driver.find_elements_by_class_name('row.shopee-search-item-result__items')

In [150]:
product_list[0]

<selenium.webdriver.remote.webelement.WebElement (session="7632babd2d34e0109a50c47d03ae43c1", element="87783a2e-93fb-43ca-bd5b-2a7c530bcf35")>

In [96]:
# Get the URL by looping the elements
product_links = []
links = product_list[0].find_elements_by_tag_name('a')
for elem in links:
    link = elem.get_attribute('href')
    product_links.append(link)

In [97]:
product_links

['https://shopee.co.id/Oraimo-Headset-In-Ear-Earphone-Handsfree-dengan-Mic-Universal-3.5mm-IOS-Android-OEP-E21N-i.148920182.5046658694',
 'https://shopee.co.id/Headset-Handsfree-U19-macaron-Earphone-Macaroon-Mate-Color-Hifi-Extra-Bass-i.27806949.6846699359',
 'https://shopee.co.id/Handsfree-VMT-108-Sound-Earphone-i.163773779.4249635263',
 'https://shopee.co.id/HK-_-Headset-Philips-AT-036-Handsfree-Earphone-Philips-AT036-Extra-Bass-i.234553399.8442711535',
 'https://shopee.co.id/Oraimo-Sport-Bluetooth-Wireless-Headset-Sweatproof-Earphone-Handsfree-OEB-E60DN-i.148920182.7745495457',
 'https://shopee.co.id/U19-MACARON-handsfree-headset-earphone-macaron-hifi-extra-bass-matte-colour-i.26479047.6251981164',
 'https://shopee.co.id/HANSFREE-WARNA-WARNI-JACK-3.5-HEADSET-MACARON-EXTRA-BASS-MR.ACC-MA10-BC-i.37847740.7455891086',
 'https://shopee.co.id/Headset-Macaron-U19-Stereo-Handsfree-Earphone-Macaron-U19-Colorfull-Warna-Warni-Pastel-Original-i.5613134.5866006438',
 'https://shopee.co.id/Heads

In [98]:
len(product_links)

50

In [99]:
page = 1
while len(product_links) < 100:
    url_loops = url_query + '&page={}'.format(page)
    # Access to the URL
    driver.get(url_loops)
    driver.implicitly_wait(20)
    # Scroll down the page
    scroll_height = driver.execute_script('return document.documentElement.scrollHeight;') # Document scrollHeight
    # Scroll down slowly
    scroll_height = driver.execute_script('return document.documentElement.scrollHeight;')
    for iters in range(0, scroll_height, 200):
        driver.execute_script('window.scrollTo(0, {});'.format(iters))
        time.sleep(1)
    # Get the div of URL
    product_list = driver.find_elements_by_class_name('row.shopee-search-item-result__items')
    # Get the URL by looping the elements
    links = product_list[0].find_elements_by_tag_name('a')
    for elem in links:
        link = elem.get_attribute('href')
        product_links.append(link)
    # Add the page
    page += 1

In [100]:
len(product_links)

115

**Seller identity**
- **Rating** (seller page)
  - *Numeric*
- **Seller category** (product page)
  - *Star seller*
  - *Mall*
- **Product category** (product page)
  - *e.g., toys, food, architecture*
- **Seller's Address** (product page)
  - *e.g., Jakarta Pusat*
- **Length of stay** (product page)
  - *In year or month
- **Chat response** (product page)
  - *Percentage*
- **Number of followers** (product page)
  - *Numeric*
- **Number of product** (product page)
  - *Numeric*

In [29]:
# Loop the URL to get the seller identity
driver.get('https://shopee.co.id/%E2%98%85%E2%98%85%E2%98%85Laptop-i5-RAM-8GB-500GB-1TB-SSD-256GB-SSD-512GB-14-DVD-Laptop-Dell-Core-i5-Bekas-i.20507921.1213381268')
driver.implicitly_wait(20)

In [34]:
driver.execute_script('window.scrollTo(0, {});'.format(1000))

In [16]:
# Username
driver.find_element_by_class_name('_3uf2ae').text

'twentycell'

In [17]:
# Number of reviewers
driver.find_elements_by_class_name('ssFdmZ')[0].find_elements_by_tag_name('span')[0].text

'7,7RB'

In [18]:
# Number of products
driver.find_elements_by_class_name('ssFdmZ')[0].find_elements_by_tag_name('span')[1].text

'44'

In [19]:
# Chat response - percentage
driver.find_elements_by_class_name('ssFdmZ')[1].find_elements_by_tag_name('span')[0].text

'3%'

In [20]:
# Chat response - category
driver.find_elements_by_class_name('ssFdmZ')[1].find_elements_by_tag_name('span')[1].text

'hitungan jam'

In [21]:
# Length of stay
driver.find_elements_by_class_name('ssFdmZ')[2].find_elements_by_tag_name('span')[0].text

'4 tahun lalu'

In [22]:
# Number of followers
driver.find_elements_by_class_name('ssFdmZ')[2].find_elements_by_tag_name('span')[1].text

'66,5RB'

In [23]:
# Seller's category
driver.find_elements_by_class_name('aPKXeO')[0].find_elements_by_tag_name('div')[0].find_elements_by_tag_name('a')[1].text

'Komputer & Aksesoris'

In [24]:
# Seller's address
length_element = len(driver.find_elements_by_class_name('aPKXeO'))
driver.find_elements_by_class_name('aPKXeO')[length_element - 1].find_elements_by_tag_name('div')[0].text

'KOTA JAKARTA UTARA - PADEMANGAN, DKI JAKARTA, ID'

In [25]:
# Seller's status
try:
    stat = driver.find_element_by_class_name('SK--cp').find_elements_by_tag_name('div')[0].text
    if stat == '':
        stat = driver.find_element_by_class_name('SK--cp').find_elements_by_class_name('official-shop-new-badge')[0]
        if stat is not None:
            print('Shopee Mall')
except:
    print('Regular')

Regular


In [26]:
# URL into seller page
seller_page = driver.find_element_by_class_name('btn.btn-light.btn--s.btn--inline.btn-light--link._3IQTrY')
seller_page.get_attribute('href')

'https://shopee.co.id/twentycell?categoryId=134&itemId=1213381268'

**New Tab**

In [135]:
# Determine the main window
main_window = driver.current_window_handle

In [136]:
driver.execute_script('window.open();')

In [137]:
driver.switch_to.window(driver.window_handles[1])

In [27]:
driver.get('https://shopee.co.id/twentycell?categoryId=134&itemId=1213381268')

In [28]:
# Seller's rating
driver.find_elements_by_class_name('section-seller-overview__item-text-value')[8].text

'4.7 (7,7RB Penilaian)'

In [140]:
driver.close()

In [39]:
# Back to the main window
driver.switch_to.window(main_window)
driver.get(url_query)

**Keyword Data**

In [77]:
df = pd.read_csv('../data/raw/keyword_list.csv', sep = ';')

In [78]:
df.head()

Unnamed: 0,keyword,length
0,headset,20
1,laptop,30
2,baju,50


In [57]:
df['keyword'][0]

'headset'

In [58]:
df['length'][0]

20

**Check the Username**

In [63]:
from bisect import bisect_left
import bisect

In [61]:
def BinSearch(a, x):
    elem = bisect_left(a, x)
    if elem != len(a) and a[elem] == x:
        return True
    else:
        return False

In [59]:
usernameList = []
usernameSeller = 'username'

In [62]:
BinSearch(usernameList, usernameSeller)

False

In [68]:
bisect.insort(usernameList, usernameSeller)

In [69]:
usernameList

['username', 'username0', 'username1']

In [118]:
pd.DataFrame({'A': [1], 'B': [2], 'C': ['elem']})

Unnamed: 0,A,B,C
0,1,2,elem


**Looping mechanism**

In [21]:
index = 0
while (lengthData + index) < 3:
    index += 1
    print(index)

1
2
3


In [10]:
index = 0
indexURL = 0
while index < 10:
    index += 1
    indexURL += 1
    print(index)

1
2
3
4
5
6
7
8
9
10


In [4]:
lengthData = 0
while lengthData < 3:
    linkLoop = []
    while (lengthData + len(linkLoop)) < 3:
        linkLoop.append(str((lengthData + len(linkLoop))))
        # index += 1
        print(index)
    lengthData = (lengthData + index)
    print(lengthData)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


KeyboardInterrupt: 

In [31]:
lengthData

3