In [1]:
from requests_html import HTMLSession
import pandas as pd
import re
import os

## GET PRODUCT META AND DETAIL INFO FUNCTIONS

In [29]:
def get_products_tiki(html) -> list:
    """Collect the listing metadata of every product card on a page.

    Args:
        html: Parsed page object exposing ``find(selector)``. Every element
            matched by ``.product-item`` must provide an ``attrs`` mapping
            and a ``find`` method of its own.

    Returns:
        A list with one dict per product card, each holding the keys
        ``title``, ``brand``, ``price`` and ``link``.

    Raises:
        KeyError: If a card lacks one of the ``data-title``, ``data-price``
            or ``data-brand`` attributes, or its first anchor has no ``href``.
    """

    def describe(card) -> dict:
        # Read the data-* attributes first, then the first anchor's target.
        name = card.attrs['data-title']
        cost = card.attrs['data-price']
        maker = card.attrs['data-brand']
        anchor = card.find('a', first=True)
        target = anchor.attrs['href']
        return {
            # Only tab characters are trimmed from the title.
            'title': name.strip('\t'),
            'brand': maker,
            'price': cost,
            'link': target,
        }

    return [describe(card) for card in html.find('.product-item')]

# Maps the ``rel`` value of a specification-table row to the output field name.
_TIKI_DETAIL_FIELDS = {
    'title': 'name',
    'price': 'price',
    'brand': 'brand',
    'cpu': 'cpu',
    'chipset': 'type_cpu',
    'loai_o_dia': 'ssd',
    'ram': 'ram',
    'product_weight': 'weight',
    'card_man_hinh': 'graphic',
    'loai_pin': 'battery'
}

# Per-field clean-up of the raw cell text; fields not listed are kept verbatim.
_TIKI_DETAIL_FILTERS = {
    'price': lambda price: re.sub(r'\D+', '', price),
    'ssd': lambda ssd: 'ssd' in ssd.lower(),
    'weight': lambda weight: re.sub(r'[ a-zA-Z]', '', weight)
}


def get_products_detail_tiki(html) -> dict:
    """Extract the known specification fields from a product detail page.

    Each ``#chi-tiet tbody tr`` row is inspected: the ``rel`` attribute of its
    first ``td`` names the field and the text of its ``td.last`` cell is the
    value. Rows that cannot be interpreted (no ``td``, no ``rel``, an empty
    ``rel``, an unmapped field name, or no ``td.last`` cell) are skipped
    instead of raising.

    Args:
        html: Parsed page object exposing ``find(selector)``; matched rows
            must expose ``find(selector, first=True)``.

    Returns:
        A dict keyed by the mapped field names. ``price`` is reduced to its
        digits, ``ssd`` is a bool telling whether the text contains "ssd"
        (case-insensitive), ``weight`` has spaces and ASCII letters removed;
        every other value is the stripped cell text.
    """
    detail = {}

    for row in html.find('#chi-tiet tbody tr'):
        label_cell = row.find('td', first=True)
        if label_cell is None:
            # Row without any data cell (e.g. header-only row).
            continue

        attrs = label_cell.attrs
        if 'rel' not in attrs:
            continue

        field_name = attrs['rel']
        if isinstance(field_name, (tuple, list)):
            # ``rel`` may arrive as a sequence of tokens; use the first one.
            if not field_name:
                continue
            field_name = field_name[0]

        field_name_new = _TIKI_DETAIL_FIELDS.get(field_name)
        if field_name_new is None:
            continue

        value_cell = row.find('td.last', first=True)
        if value_cell is None:
            continue

        value = value_cell.text.strip()
        value_filter = _TIKI_DETAIL_FILTERS.get(field_name_new)
        detail[field_name_new] = value_filter(value) if value_filter else value

    return detail

## CRAWL FUNCTIONS

In [25]:
def crawl_tiki_laptop(output: str):
    """Crawl the Tiki laptop category listing and save the products as CSV.

    Listing pages are requested one after another (``page=1, 2, ...``) until
    one of the following happens: the number of collected products reaches
    the result count read from the page, a request comes back not OK, or a
    page yields no products at all.

    Args:
        output: Path of the CSV file to write. The columns are ``title``,
            ``brand``, ``price`` and ``link``; the DataFrame index is written
            as well (pandas default), so the file starts with an unnamed
            index column.
    """
    url = "https://tiki.vn/laptop/c8095"
    products = list()
    page = 1
    total = 1

    # The context manager closes the session even if a request raises.
    with HTMLSession() as session:
        while True:
            response = session.get(url, params={'page': page}, allow_redirects=True)
            if not response.ok:
                break

            # The results counter may be absent; in that case keep the last
            # known total instead of failing on ``None.text``.
            counter = response.html.find('.product-box [name=results-count]', first=True)
            if counter is None:
                print(f"[TIKI] Results counter not found on page {page}")
            else:
                total_text_number = re.findall(r'\d+', counter.text)
                if total_text_number:
                    total = int(total_text_number[0])

            page_products = get_products_tiki(response.html)
            if not page_products:
                # An empty page can never move len(products) towards total,
                # so continuing would loop forever.
                break

            products += page_products
            print(f"[TIKI] Crawled page {page}")
            page += 1
            if len(products) >= total:
                break

    df = pd.DataFrame(products, columns=['title', 'brand', 'price', 'link'])
    df.to_csv(output)
    
def crawl_tiki_laptop_detail(inp: str, out: str, limit: int = None):
    """Fetch the detail page of every listed product and save the merged rows.

    For each row of the input CSV the ``link`` column is requested, the
    specification fields are extracted with ``get_products_detail_tiki`` and
    merged with the remaining columns of the row (detail fields win on a name
    clash). The first error stops the crawl; whatever was collected up to
    that point is still written.

    Args:
        inp: Path of the CSV to read; must contain ``link`` and ``title``
            columns.
        out: Path of the CSV file to write (without the index column).
        limit: Maximum number of products to fetch, or ``None`` for all.
    """
    products = pd.read_csv(inp)
    products_detail = list()

    # The context manager closes the session once the crawl is over.
    with HTMLSession() as session:
        try:
            for idx, row in products.iterrows():
                # Checked before fetching so that exactly ``limit`` products
                # are crawled (``limit=0`` fetches nothing).
                if limit is not None and len(products_detail) >= limit:
                    break

                resp = session.get(row['link'])
                detail = get_products_detail_tiki(resp.html)

                # Every input column except the link is carried over.
                meta = row.drop(labels=['link']).to_dict()
                products_detail.append({
                    **meta,
                    **detail
                })

                print(f"Crawled detail {idx}: {row['title']}")
        except Exception as e:
            # Best effort: report the failure and still save partial results.
            # Formatting via f-string avoids the str + Exception TypeError.
            print(f'Error: {e}')

    df = pd.DataFrame(products_detail)
    df.to_csv(out, index=False)
    print('DONE')
    

## BEGIN CRAWL

In [10]:
# Crawl the laptop listing pages and write the collected product rows to tiki_meta.csv.
crawl_tiki_laptop('tiki_meta.csv')

[TIKI] Crawled page 1
[TIKI] Crawled page 2
[TIKI] Crawled page 3
[TIKI] Crawled page 4
[TIKI] Crawled page 5
[TIKI] Crawled page 6
[TIKI] Crawled page 7
[TIKI] Crawled page 8
[TIKI] Crawled page 9
[TIKI] Crawled page 10
[TIKI] Crawled page 11
[TIKI] Crawled page 12
[TIKI] Crawled page 13
[TIKI] Crawled page 14
[TIKI] Crawled page 15
[TIKI] Crawled page 16
[TIKI] Crawled page 17
[TIKI] Crawled page 18
[TIKI] Crawled page 19
[TIKI] Crawled page 20
[TIKI] Crawled page 21
[TIKI] Crawled page 22
[TIKI] Crawled page 23
[TIKI] Crawled page 24
[TIKI] Crawled page 25
[TIKI] Crawled page 26


In [30]:
# Request each product link listed in tiki_meta.csv and write the merged rows to tiki_detail.csv.
crawl_tiki_laptop_detail('tiki_meta.csv', 'tiki_detail.csv')

Crawled detail 0: Laptop Asus Vivobook S15 S530UA-BQ034T Core i3-8130U/Win10 (15.6 inch FHD IPS) - Hàng Chính Hãng
Crawled detail 1: Asus F570ZD FY415T AMD R5-2500U/ VGA GTX1050/ Win10 (15.6 FHD) - Hàng Chính Hãng
Crawled detail 2: Laptop Asus ASUSPRO B9440UA-GV0495T Core i5-8250U/ Win10 (14
Crawled detail 3: Dell Inspiron 3480-N4I5107W: i5-8265U | 4GB RAM | 1TB HDD | UHD Graphics 620 | 14.0 HD | Win10 | Silver - Hàng Chính Hãng
Crawled detail 4: Apple Macbook Air 2019 - 13 inchs (i5/ 8GB/ 128GB) - Hàng Nhập Khẩu Chính Hãng
Crawled detail 5: Laptop Asus Vivobook 14 A412FA-EK224T Core i5-8265U/ Win10 (14 FHD) - Hàng Chính Hãng
Crawled detail 6: Laptop Lenovo Ideapad 130-15AST 81H5000VVN AMD A9-9425/ Win10 (15.6
Crawled detail 7: Macbook Air 2017 MQD32 (13.3 inch) - Hàng Chính Hãng
Crawled detail 8: Laptop HP ENVY 13-AQ0026TU (Intel Core I5-8265U  8GB RAM DDR4  256GB SSD  13,3
Crawled detail 9: Apple Macbook Pro Touch Bar 2019 - 13 inchs (i5/ 8GB/ 128GB) - Hàng Nhập Khẩu Chính Hãng
Crawl