In [48]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import requests
import numpy as np
import pandas as pd

## Using Selenium with Chrome 

In [21]:
# Configure Chrome to run without a visible window.
chrome_opts = Options()
# Pass the flag explicitly instead of assigning `chrome_opts.headless`: the
# property setter is deprecated in Selenium 4 and absent from later 4.x
# releases, while the command-line argument works on every version.
chrome_opts.add_argument('--headless')

# Shared browser session used by the crawl functions below.
# NOTE(review): `executable_path` is the Selenium 3 / early Selenium 4 way of
# pointing at chromedriver; newer Selenium releases expect a `Service` object
# instead — confirm the pinned Selenium version before upgrading.
driver = webdriver.Chrome(
    executable_path='./driver/chromedriver',
    options=chrome_opts
)

## DEFINE CONSTANTS

In [6]:
# Category listing pages on phongvu.vn that are crawled below.
# Office laptops ("van phong").
url_van_phong = "https://phongvu.vn/laptop-van-phong-718.cat"
# Gaming laptops ("choi game").
url_gaming = "https://phongvu.vn/laptop-choi-game-716.cat"
# Graphics / design laptops ("do hoa").
url_do_hoa = "https://phongvu.vn/laptop-do-hoa-1876.cat"

## CRAWL FUNCTION

In [19]:
def crawl_meta(url: str, category: str) -> list:
    """Collect basic listing metadata for every laptop in one category.

    Walks the paginated category listing with the module-level Selenium
    ``driver``, starting at page 1 and continuing up to the last page number
    shown by the pagination widget.

    Args:
        url: Category listing URL; the page number is appended as ``?p=<n>``.
        category: Label stored under the ``'category'`` key of every record.

    Returns:
        A list of dicts with the keys ``'id'`` (the card's
        ``data-content-piece`` attribute), ``'link'`` (its ``href``),
        ``'name'`` (stripped text of its ``.product-name`` child) and
        ``'category'``. Product cards without a ``.product-name`` element are
        skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: If the ``.pagination``
            element does not appear within 30 seconds of loading a page.
    """
    page = 1
    max_page = 1
    laptops = []

    while page <= max_page:
        driver.get(f"{url}?p={page}")
        # The listing is rendered client-side; wait until the pagination
        # widget is present before reading the page.
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.pagination'))
        )

        # `find_elements` returns an empty list when nothing matches, whereas
        # the singular lookup raises NoSuchElementException, so a missing
        # element can be handled with a plain truthiness check.
        last_page_items = driver.find_elements(
            By.CSS_SELECTOR, '.pagination .v-pagination__item:last-child'
        )
        if last_page_items:
            last_page_label = last_page_items[0].text.strip()
            # Only trust the label when it is a number; anything else (empty
            # text, an arrow, an ellipsis) keeps the current page limit.
            if last_page_label.isdecimal():
                max_page = int(last_page_label)

        for el in driver.find_elements(By.CSS_SELECTOR, ".product-item"):
            name_els = el.find_elements(By.CSS_SELECTOR, '.product-name')
            if not name_els:
                # Card without a name element: skip it and keep crawling.
                continue

            laptops.append({
                'id': el.get_attribute('data-content-piece'),
                'link': el.get_attribute('href'),
                'name': name_els[0].text.strip(),
                'category': category
            })

        page += 1

    return laptops

Crawl product details using the PhongVu API: `https://listing.services.teko.vn/api/products/{id}?channel=pv_online&terminal=phongvu`

In [42]:
# Maps the attribute codes returned by the product API to the field names
# used in the crawled records. Attributes with any other code are ignored.
_ATTRIBUTE_FIELDS = {
    "laptop_dophangiaimanhinh": "resolution",
    "laptop_tencpu": "cpu",
    "laptop_bonhofilter": "ram",
    "laptop_chipdohoaroi": "graphic",
    "laptop_dungluonghddfilter": "disk",
    "laptop_dungluongssdfilter": "ssd",
    "laptop_dungluongpin": "battery",
    "laptop_khoiluong": "weight",
    "laptop_den": "keyboard_light"
}


def _parse_attributes(attributes: list) -> dict:
    """Extract the fields listed in ``_ATTRIBUTE_FIELDS`` from an attribute list.

    Each attribute contributes the ``'value'`` of its first entry in
    ``'values'``, or ``None`` when it has no values. The CPU attribute is
    split on ``'-'`` into ``'cpu_name'`` and ``'cpu_type'``.
    """
    parsed = {}
    for attr in attributes:
        field = _ATTRIBUTE_FIELDS.get(attr['code'])
        if field is None:
            continue

        values = attr.get('values') or []
        value = values[0]['value'] if values else None

        if field != "cpu":
            parsed[field] = value
        elif value is None:
            # No CPU value reported: record it as missing, like every other
            # field, instead of calling `.split` on None.
            parsed['cpu_name'] = None
            parsed['cpu_type'] = None
        else:
            cpu_split = value.split('-')
            parsed['cpu_name'] = cpu_split[0]
            parsed['cpu_type'] = cpu_split[1] if len(cpu_split) > 1 else ''
    return parsed


def crawl_detail(meta: dict) -> dict:
    """Fetch one product's specification from the listing API.

    Args:
        meta: Listing record; its ``'id'`` is used to build the API URL and
            all of its keys are copied into the result.

    Returns:
        A dict holding the product ``'name'``, the fields extracted from the
        product's attributes (``resolution``, ``cpu_name``, ``cpu_type``,
        ``ram``, ``graphic``, ``disk``, ``ssd``, ``battery``, ``weight``,
        ``keyboard_light`` — only those present in the response), and every
        key of ``meta``. Keys of ``meta`` win on conflict, so a ``'name'`` in
        ``meta`` replaces the API's name.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within 30 seconds.
        KeyError: If the response lacks the expected ``result.product``
            structure.
    """
    product_id = meta.get('id')
    # A timeout keeps a stalled connection from hanging the worker forever.
    resp = requests.get(
        f"https://listing.services.teko.vn/api/products/{product_id}?channel=pv_online&terminal=phongvu",
        timeout=30,
    )
    resp.raise_for_status()
    product = resp.json()['result']['product']

    detail = {"name": product['name']}
    detail.update(_parse_attributes(product['attributes']))
    # Applied last so the listing metadata overrides same-named API fields.
    detail.update(meta)
    return detail

## BẮT ĐẦU CRAWL

In [22]:
# Listing metadata for the gaming category.
gaming_m = crawl_meta(url=url_gaming, category='gaming')

In [23]:
# Listing metadata for the graphics ("do hoa") category.
do_hoa_m = crawl_meta(url=url_do_hoa, category='do-hoa')

In [24]:
# Listing metadata for the office ("van phong") category.
van_phong_m = crawl_meta(url_van_phong, 'van-phong')

# This is the last category crawled with the browser: shut the headless
# Chrome session down so the browser and chromedriver processes do not linger
# for the rest of the notebook. Re-run the driver setup cell before crawling
# again.
driver.quit()

In [25]:
# Merge the three category crawls into one flat list of records. Plain list
# concatenation keeps the records in an ordinary Python list instead of
# wrapping dicts in a NumPy object array, which offers no numeric benefit.
laptop_m = gaming_m + do_hoa_m + van_phong_m

In [33]:
# Number of listing records collected across the three categories.
len(laptop_m)

60

In [30]:
from multiprocessing import Pool

In [43]:
# Fetch the product details in parallel with 8 worker processes. The context
# manager shuts the pool down once `map` has returned, so the worker
# processes are not left running for the rest of the notebook session.
with Pool(processes=8) as pool:
    detail_list = pool.map(crawl_detail, laptop_m)

In [47]:
# Number of detail records returned by the parallel crawl.
len(detail_list)

60

## SAVE DATA

In [55]:
# Build the final table from the crawled records:
#   * "KHT" cells are blanked out (presumably a "not supported" placeholder
#     used by the API — TODO confirm);
#   * `keyboard_light` becomes a 0/1 flag that is 1 only for 'RGB' or 'LED'.
df: pd.DataFrame = (
    pd.DataFrame(detail_list)
    .replace(to_replace="KHT", value="")
    .assign(
        keyboard_light=lambda frame: frame['keyboard_light']
        .isin(['RGB', 'LED'])
        .astype(int)
    )
)

# Persist the result without the index column, then preview the first rows.
df.to_csv('phong_vu.csv', index=False)
df.head()

Unnamed: 0,name,resolution,cpu_name,cpu_type,ram,graphic,ssd,disk,battery,weight,keyboard_light,id,link
0,Laptop ASUS ROG Strix Scar III G531GV-VAZ160T ...,1920 x 1080,Intel Core i7,9750H,16GB,NVIDIA GeForce RTX 2060,512GB,,4 cell,2.6,1,190800596,https://phongvu.vn/may-tinh-xach-tay-laptop-as...
1,"Laptop ASUS ROG Zephyrus M GU502GU-AZ090T (15""...",1920 x 1080,Intel Core i7,9750H,16GB,NVIDIA GeForce GTX 1660Ti,512GB,,4 cell,2.0,1,190800597,https://phongvu.vn/may-tinh-xach-tay-laptop-as...
2,Laptop HP Pavilion Gaming 15-dk0003TX (7HR34PA...,1920 x 1080,Intel Core i7,9750H,16GB,NVIDIA GeForce GTX 1660Ti,512GB,1TB,3 cell,2.2,0,190900035,https://phongvu.vn/may-tinh-xach-tay-laptop-hp...
3,Laptop HP Pavilion Gaming 15-dk0233TX (8DS86PA...,1920 x 1080,Intel Core i7,9750H,8GB,NVIDIA GeForce GTX 1650,512GB,,3 cell,2.2,0,190900036,https://phongvu.vn/may-tinh-xach-tay-laptop-hp...
4,Laptop HP Pavilion Gaming 15-dk0232TX (8DS85PA...,1920 x 1080,Intel Core i7,9750H,8GB,NVIDIA GeForce GTX 1650,,1TB,3 cell,2.2,0,190900037,https://phongvu.vn/may-tinh-xach-tay-laptop-hp...
