In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import requests
import numpy as np
import pandas as pd
import re

## Using Selenium with Chrome 

In [4]:
chrome_opts = Options()
chrome_opts.headless = True

driver = webdriver.Chrome(
    executable_path='./driver/chromedriver',
    options=chrome_opts
)

## DEFINE SOME CONST

In [5]:
url_van_phong = "https://phongvu.vn/laptop-van-phong-718.cat"
url_gaming = "https://phongvu.vn/laptop-choi-game-716.cat"
url_do_hoa = "https://phongvu.vn/laptop-do-hoa-1876.cat"

## CRAWL FUNCTION

In [6]:
def crawl_meta(url: str, category: str) -> list:
    page = 1
    max_page = 1
    laptops = list()
    
    while page <= max_page:
        driver.get(f"{url}?p={page}")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.pagination'))
        )
        elements = driver.find_elements_by_css_selector(".product-item")
        max_page_el = driver.find_element_by_css_selector('.pagination .v-pagination__item:last-child')
        if (max_page_el and max_page_el.text):
            max_page = int(max_page_el.text)
        
        for el in elements:
            try:
                sku = el.get_attribute('data-content-piece')
                link = el.get_attribute('href')
                name = el.find_element_by_css_selector('.product-name').text.strip()
                
                laptops.append({
                    'id': sku,
                    'link': link,
                    'name': name,
                    'category': category
                })
            except AttributeError:
                continue
                
        page += 1

    return laptops

Crawl detail using PhongVu API: `https://listing.services.teko.vn/api/products/{id}?channel=pv_online&terminal=phongvu`

In [22]:
def crawl_detail(meta: dict) -> dict:
    id = meta.get('id')
    resp = requests.get(f"https://listing.services.teko.vn/api/products/{id}?channel=pv_online&terminal=phongvu")
    json = resp.json()
    
    product = json['result']['product']
    name = product['name']
    attributes = product['attributes']
    
    detail = {
        "name": name
    }
    
    for attr in attributes:
        attr_name = attr['code']
        values = attr['values']
        value = values[0]['value'] if len(values) > 0 else None 
        field = {
            "laptop_dophangiaimanhinh": "resolution",
            "laptop_tencpu": "cpu",
            "laptop_bonhofilter": "ram",
            "laptop_chipdohoaroi": "graphic",
            "laptop_dungluonghddfilter": "disk",
            "laptop_dungluongssdfilter": "ssd",
            "laptop_dungluongpin": "battery",
            "laptop_khoiluong": "weight",
            "laptop_den": "keyboard_light"
        }.get(attr_name)
        
        if field is not None:
            detail[field] = value
        
    detail.update(meta)
    return detail

## BẮT ĐẦU CRAWL

In [38]:
gaming_m = crawl_meta(url_gaming, 'laptop-gaming')

In [39]:
do_hoa_m = crawl_meta(url_do_hoa, 'do-hoa-ky-thuat')

In [40]:
van_phong_m = crawl_meta(url_van_phong, 'hoc-tap-van-phong')

In [41]:
laptop_m = np.concatenate((gaming_m, do_hoa_m, van_phong_m))

In [42]:
len(laptop_m)

60

In [43]:
from multiprocessing import Pool

In [44]:
pool = Pool(processes=8)
detail_list = pool.map(crawl_detail, laptop_m)

In [45]:
len(detail_list)

60

## SAVE DATA

In [46]:
df: pd.DataFrame = pd.DataFrame(detail_list)
df = df.replace(to_replace="KHT", value="")
df['keyboard_light'] = df['keyboard_light'].apply(lambda k: 1 if k in ['RGB', 'LED'] else 0)

def get_cpu_name(cpu: str):
    split = cpu.split('-')
    if len(split) < 2:
        return re.sub(r'\d+H', '', cpu)
    return split[0]
def get_cpu_type(cpu: str):
    split = cpu.split('-')
    if len(split) > 1:
        return split[1]
    search = re.search(r'(\d+[HU])', split[0])
    return search.group(1) if search else ''

df['cpu_name'] = df['cpu'].apply(get_cpu_name)
df['cpu_type'] = df['cpu'].apply(get_cpu_type)

df.to_csv('phong_vu.csv', index=False)
df.sort_values('cpu_type').head(5)

Unnamed: 0,name,resolution,cpu,ram,graphic,ssd,disk,battery,weight,keyboard_light,id,link,category,cpu_name,cpu_type
22,"Laptop Apple MacBook Pro 2019 MV932 (15"" 2880x...",2880 x 1800,Intel Core i9,16GB,AMD Radeon Pro 560X,512GB,,,1.8,1,190900029,https://phongvu.vn/may-tinh-xach-tay-laptop-ma...,do-hoa-ky-thuat,Intel Core i9,
39,"Laptop ASUS ZenBook Duo UX481FL-BM048T (14"" FH...",1920 x 1080,Intel Core i5-10210U,8GB,NVIDIA GeForce MX250,512GB,,4 cell,1.6,0,191007143,https://phongvu.vn/may-tinh-xach-tay-laptop-as...,do-hoa-ky-thuat,Intel Core i5,10210U
42,Laptop Lenovo Ideapad S145-15IWL (81MV00F1VN) ...,1920 x 1080,Intel Celeron 4205U,4GB,,,500GB,2 cell,1.9,0,190800714,https://phongvu.vn/may-tinh-xach-tay-laptop-le...,hoc-tap-van-phong,Intel Celeron 4205U,4205U
59,Laptop Acer Aspire 3 A315-54K-32SD (NX.HFXSV.0...,1920 x 1080,Intel Core i3-7020U,4GB,,256GB,,2 cell,1.7,0,191000364,https://phongvu.vn/may-tinh-xach-tay-laptop-ac...,hoc-tap-van-phong,Intel Core i3,7020U
48,"Laptop ASUS 14 X409UA-EK092T (14"" FHD/i3-7020U...",1920 x 1080,Intel Core i3-7020U,4GB,,,1TB,2 cell,1.6,0,190900033,https://phongvu.vn/may-tinh-xach-tay-laptop-as...,hoc-tap-van-phong,Intel Core i3,7020U
