In [2]:
from requests_html import HTMLSession
import pandas as pd
import re

## FUNCTIONS FOR PARSING PRODUCT META INFO AND DETAILS

In [3]:
def get_products_tiki(html) -> list:
    """Extract the meta info of every product listed on a category page.

    Args:
        html: Parsed page object exposing ``find(selector)``; every element
            matching ``.product-item`` is treated as one product.

    Returns:
        A list with one dict per product, holding the keys ``title``,
        ``brand``, ``price`` and ``link``. The title has leading/trailing
        tab characters removed; the link is the ``href`` of the first ``<a>``
        found inside the product element.
    """
    def _to_record(item) -> dict:
        # Read the data-* attributes in a fixed order, then the anchor href.
        attributes = item.attrs
        raw_title, raw_price, raw_brand = (
            attributes[key] for key in ('data-title', 'data-price', 'data-brand')
        )
        href = item.find('a', first=True).attrs['href']
        return {
            'title': raw_title.strip('\t'),
            'brand': raw_brand,
            'price': raw_price,
            'link': href,
        }

    return [_to_record(item) for item in html.find('.product-item')]

def get_products_detail_tiki(html) -> dict:
    """Parse the specification table of a product detail page.

    Every row matching ``#chi-tiet tbody tr`` contributes one entry: the key
    is the ``rel`` attribute of the row's first ``<td>`` and the value is the
    whitespace-stripped text of its ``td.last`` cell.

    Args:
        html: Parsed page object exposing ``find(selector)``.

    Returns:
        A dict mapping field name to its text value. Rows that lack the label
        cell, the ``rel`` attribute, or the value cell are skipped.
    """
    detail = {}
    for row in html.find('#chi-tiet tbody tr'):
        label_cell = row.find('td', first=True)
        value_cell = row.find('td.last', first=True)
        # find(..., first=True) yields None when nothing matches; skip such rows.
        if label_cell is None or value_cell is None:
            continue

        field_name = label_cell.attrs.get('rel')
        if field_name is None:
            continue

        # Python strings have no .trim(); .strip() is the equivalent.
        detail[field_name] = value_cell.text.strip()

    return detail

## CRAWL FUNCTIONS

In [9]:
def crawl_tiki_laptop(output: str):
    """Crawl the Tiki laptop category listing and save the products as CSV.

    Pages of ``https://tiki.vn/laptop/c8095`` are requested one after another
    (``page=1, 2, ...``) and parsed with ``get_products_tiki``. Crawling stops
    as soon as one of the following holds:

    * a response is not OK,
    * a page yields no products (guards against looping forever when the
      advertised total is never reached),
    * the number of collected products reaches the result count shown on
      the page.

    Args:
        output: Path of the CSV file to write. It contains the columns
            ``title``, ``brand``, ``price`` and ``link`` (plus the default
            DataFrame index column).
    """
    url = "https://tiki.vn/laptop/c8095"
    products = list()
    page = 1
    total = 1  # fallback used until a result count has been parsed from a page

    # The context manager closes the session even if a request or parse fails.
    with HTMLSession() as session:
        while True:
            response = session.get(url, params={'page': page}, allow_redirects=True)
            if not response.ok:
                break

            # find(..., first=True) returns None when the counter element is absent.
            count_element = response.html.find('.product-box [name=results-count]', first=True)
            if count_element is not None:
                total_text_number = re.findall(r'\d+', count_element.text)
                if total_text_number:
                    total = int(total_text_number[0])

            page_products = get_products_tiki(response.html)
            if not page_products:
                # Nothing more to collect; continuing would never terminate.
                break

            products += page_products
            print(f"[TIKI] Crawled page {page}")
            page += 1
            if len(products) >= total:
                break

    df = pd.DataFrame(products, columns=['title', 'brand', 'price', 'link'])
    df.to_csv(output)
    
def crawl_tiki_laptop_detail(inp: str, out: str):
    """Unfinished stub for crawling per-product detail pages.

    Currently it only opens an HTML session and loads the CSV at ``inp`` into
    a DataFrame; neither is used afterwards and ``out`` is not touched.

    Args:
        inp: Path of the CSV file to read.
        out: Currently unused.
    """
    # TODO: the crawl logic is not implemented yet.
    http_session = HTMLSession()
    meta_frame = pd.read_csv(inp)
    
    

## BEGIN CRAWL

In [10]:
# Crawl the laptop category listing and store the result in tiki_meta.csv.
crawl_tiki_laptop(output='tiki_meta.csv')

[TIKI] Crawled page 1
[TIKI] Crawled page 2
[TIKI] Crawled page 3
[TIKI] Crawled page 4
[TIKI] Crawled page 5
[TIKI] Crawled page 6
[TIKI] Crawled page 7
[TIKI] Crawled page 8
[TIKI] Crawled page 9
[TIKI] Crawled page 10
[TIKI] Crawled page 11
[TIKI] Crawled page 12
[TIKI] Crawled page 13
[TIKI] Crawled page 14
[TIKI] Crawled page 15
[TIKI] Crawled page 16
[TIKI] Crawled page 17
[TIKI] Crawled page 18
[TIKI] Crawled page 19
[TIKI] Crawled page 20
[TIKI] Crawled page 21
[TIKI] Crawled page 22
[TIKI] Crawled page 23
[TIKI] Crawled page 24
[TIKI] Crawled page 25
[TIKI] Crawled page 26
